"""Lexer: converts raw source text into a flat list of Token objects."""

from .structs import TokenType, Token
from .exceptions import LexError

# Type-annotation literals (e.g. ":int") and their token types.
types = {
    ":int": TokenType.INT_TYPE,
    ":float": TokenType.FLOAT_TYPE,
    ":number": TokenType.NUMBER_TYPE,
    ":string": TokenType.STRING_TYPE,
    ":list": TokenType.LIST_TYPE,
    ":any": TokenType.ANY_TYPE,
    ":literal": TokenType.LITERAL_TYPE,
    ":bool": TokenType.BOOL_TYPE,
}

# Reserved words that get their own token type instead of SYMBOL.
keywords = {
    "def": TokenType.DEF,
    "lambda": TokenType.LAMBDA,
    "&": TokenType.MANY,
    "func": TokenType.FUNC,
}

WHITESPACE = [" ", "\n", "\t"]
SEPARATORS = WHITESPACE + [")", "]", "}"]
DIGITS = list("0123456789")


def lex(data):
    current = 0
    line = 1
    end = len(data)
    tokens = []

    while current < end:
        char = data[current]

        # comments: skip everything up to (but not including) the newline
        if char == ";":
            while current < end and data[current] != "\n":
                current += 1
            continue

        if char == "\n":
            line += 1

        if char in WHITESPACE:
            current += 1
            continue
        elif char == "(":
            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
        elif char == ")":
            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
        elif char == "[":
            tokens.append(Token(TokenType.OPEN_BRACKET, "[", None, line))
        elif char == "]":
            tokens.append(Token(TokenType.CLOSE_BRACKET, "]", None, line))
        elif char == "{":
            tokens.append(Token(TokenType.OPEN_BRACE, "{", None, line))
        elif char == "}":
            tokens.append(Token(TokenType.CLOSE_BRACE, "}", None, line))
        elif char == ":":
            tokens.append(Token(TokenType.COLON, ":", None, line))
        # numbers
        # elif char in DIGITS or char == ".":
        elif char in DIGITS:
            tok, length = get_number(data[current:], line)
            tokens.append(tok)
            current += length
        # strings
        elif char == '"':
            tok, length, offset = get_string(data[current + 1:], line)
            tokens.append(tok)
            current += length
            line += offset
        # bools
        elif char == "#":
            tok, length = get_bool(data[current + 1:], line)
            tokens.append(tok)
            current += length
        # single quotes
        elif char == "'":
            tokens.append(Token(TokenType.APOSTROPHE, "'", None, line))
        # symbols
        else:
            tok, length = get_symbol(data[current:], line)
            if tok.text in keywords:
                tok.type_ = keywords[tok.text]
            tokens.append(tok)
            current += length

        current += 1

    tokens.append(Token(TokenType.EOF, "", None, line))
    return tokens


def get_number(data, line):
    counter = 0
    value = ""
    is_float = False
    char = data[counter]

    while char not in SEPARATORS:
        if char in DIGITS:
            value += char
        elif char == ".":
            if is_float:
                raise LexError("too many '.' in number", line)
            is_float = True
            value += char
        else:
            raise LexError(f"invalid number: {value}", line)

        counter += 1
        if counter >= len(data):
            break
        char = data[counter]

    if is_float:
        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
    else:
        return Token(TokenType.INT, value, int(value), line), counter - 1


def get_string(data, line):
    offset = 0
    counter = 0
    string = ""

    while data[counter] != '"':
        if data[counter] == "\n":
            offset += 1

        # look ahead to see if it's an escaped double quote
        if data[counter] == "\\" and \
                counter + 1 < len(data) and \
                data[counter + 1] == '"':
            string += '"'
            counter += 1
        else:
            string += data[counter]

        counter += 1
        if counter >= len(data):
            raise LexError("couldn't parse string", line)

    string = string.encode().decode("unicode_escape")
    return Token(TokenType.STRING, str(string), str(string), line), counter + 1, offset


def get_bool(data, line):
    counter = 0
    value = ""

    while data[counter] not in SEPARATORS:
        value += data[counter]
        counter += 1
        if counter >= len(data):
            break

    if value == "true":
        return Token(TokenType.TRUE, "#true", True, line), 4
    elif value == "false":
        return Token(TokenType.FALSE, "#false", False, line), 5
    else:
        raise LexError("couldn't parse boolean", line)


def get_symbol(data, line):
    counter = 0
    value = ""

    while data[counter] not in SEPARATORS:
        value += data[counter]
        counter += 1
        if counter >= len(data):
            break

    return Token(TokenType.SYMBOL, value, None, line), counter - 1