"""Lexer: turn program text into a flat list of ``Token`` objects."""

import sys

from structs import Token, TokenType


class LexError(Exception):
    """Raised when the source text cannot be tokenized.

    The rendered message has the form ``line <line>: <message>``.
    """

    def __init__(self, message, line):
        super().__init__(f"line {line}: {message}")


# Type annotations as they appear in source text (leading ':' included).
types = {
    ":int": TokenType.INT_TYPE,
    ":float": TokenType.FLOAT_TYPE,
    ":number": TokenType.NUMBER_TYPE,
    ":string": TokenType.STRING_TYPE,
    ":list": TokenType.LIST_TYPE,
    ":any": TokenType.ANY_TYPE,
    ":literal": TokenType.LITERAL_TYPE,
    ":bool": TokenType.BOOL_TYPE
}

# Symbols whose text is promoted from SYMBOL to a dedicated token type.
keywords = {
    "print": TokenType.PRINT,
    "+": TokenType.PLUS,
    "-": TokenType.DASH,
    "*": TokenType.STAR,
    "/": TokenType.SLASH,
    ">": TokenType.GREATER,
    ">=": TokenType.GREATER_EQUAL,
    "<": TokenType.LESS,
    "<=": TokenType.LESS_EQUAL,
    "eq?": TokenType.EQUAL,
    "not": TokenType.NOT,
    "and": TokenType.AND,
    "or": TokenType.OR,
    "if": TokenType.IF,
    "for-count": TokenType.FOR_COUNT,
    "|": TokenType.PIPE,
    "def": TokenType.DEF,
    "lambda": TokenType.LAMBDA,
    "&": TokenType.MANY
}

WHITESPACE = [" ", "\n", "\t"]
# Characters that terminate a number, symbol or type lexeme.
SEPARATORS = WHITESPACE + [")"]
DIGITS = list("0123456789")


def lex(data):
    """Tokenize ``data`` and return the list of tokens, ending with EOF.

    Recognized forms: ``;`` comments running to end of line, parentheses,
    numbers (starting with a digit or ``.``), double-quoted strings,
    ``#true`` / ``#false``, ``:type`` annotations, and symbols. A symbol
    whose text is a key of ``keywords`` gets the corresponding token type.

    Raises:
        LexError: if any lexeme cannot be tokenized.
    """
    current = 0
    line = 1
    end = len(data)
    tokens = []

    while current < end:
        char = data[current]

        if char == ";":
            # Skip the comment. The bound is checked before indexing so a
            # comment on the last line without a trailing newline is fine.
            # The newline itself is left for the next iteration so that
            # the line counter is bumped in exactly one place.
            while current < end and data[current] != "\n":
                current += 1
            continue

        if char == "\n":
            line += 1

        if char in WHITESPACE:
            current += 1
            continue
        elif char == "(":
            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
        elif char == ")":
            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
        # numbers
        elif char in DIGITS or char == ".":
            tok, length = get_number(data[current:], line)
            tokens.append(tok)
            current += length
        # strings
        elif char == '"':
            tok, length, offset = get_string(data[current+1:], line)
            tokens.append(tok)
            current += length
            line += offset
        # bools
        elif char == "#":
            tok, length = get_bool(data[current+1:], line)
            tokens.append(tok)
            current += length
        # types
        elif char == ":":
            tok, length = get_type(data[current:], line)  # include :
            tokens.append(tok)
            current += length
        # symbols
        else:
            tok, length = get_symbol(data[current:], line)
            if tok.text in keywords:
                tok.type_ = keywords[tok.text]
            tokens.append(tok)
            current += length

        # Every helper returns a length that leaves `current` on the last
        # character of its lexeme; this step moves past it.
        current += 1

    tokens.append(Token(TokenType.EOF, "", None, line))
    return tokens


def _scan_lexeme(data):
    """Return the length of the leading run of non-separator characters."""
    counter = 0
    while counter < len(data) and data[counter] not in SEPARATORS:
        counter += 1
    return counter


def get_number(data, line):
    """Lex an INT or FLOAT token from the start of ``data``.

    Returns:
        ``(token, length)`` where ``length`` is one less than the number of
        characters consumed (the caller advances by one more).

    Raises:
        LexError: on a second ``.``, a non-digit character before the next
            separator, or a lexeme with no digits at all (e.g. a lone ``.``).
    """
    counter = 0
    is_float = False

    while counter < len(data) and data[counter] not in SEPARATORS:
        char = data[counter]
        if char == ".":
            if is_float:
                raise LexError("too many '.' in number", line)
            is_float = True
        elif char not in DIGITS:
            raise LexError(f"invalid number: {data[:counter]}", line)
        counter += 1

    value = data[:counter]
    # Only digits and at most one '.' can be present here, so a lexeme
    # without digits is either empty or a lone '.'; neither converts.
    if value in ("", "."):
        raise LexError(f"invalid number: {value}", line)

    if is_float:
        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
    return Token(TokenType.INT, value, int(value), line), counter - 1


def get_string(data, line):
    """Lex a STRING token; ``data`` starts just after the opening quote.

    A backslash always escapes the character after it, so ``\\"`` does not
    end the string and ``\\\\`` is a literal backslash. Remaining escape
    sequences are interpreted by the ``unicode_escape`` codec.

    Returns:
        ``(token, length, offset)`` where ``length`` is the number of
        characters up to and including the closing quote, and ``offset``
        is the number of newlines found inside the string.

    Raises:
        LexError: if the string is unterminated or holds a malformed
            escape sequence.
    """
    offset = 0
    counter = 0
    end = len(data)
    pieces = []

    while True:
        if counter >= end:
            raise LexError("couldn't parse string", line)
        char = data[counter]
        if char == '"':
            break

        if char == "\\" and counter + 1 < end:
            following = data[counter + 1]
            if following == '"':
                pieces.append('"')
            else:
                # Keep the pair intact for the codec to interpret below.
                pieces.append(char + following)
                if following == "\n":
                    offset += 1
            counter += 2
            continue

        if char == "\n":
            offset += 1
        pieces.append(char)
        counter += 1

    raw = "".join(pieces)
    try:
        # latin-1 + backslashreplace round-trips non-ASCII text through
        # unicode_escape; a plain UTF-8 encode would be decoded byte by
        # byte and turn e.g. "é" into "Ã©".
        string = raw.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except UnicodeDecodeError as error:
        raise LexError(
            f"invalid escape sequence in string: {error.reason}", line
        ) from error

    return Token(TokenType.STRING, string, string, line), counter + 1, offset


def get_bool(data, line):
    """Lex ``true`` or ``false``; ``data`` starts just after the ``#``.

    Returns:
        ``(token, length)`` with ``length`` 4 for true and 5 for false.

    Raises:
        LexError: if ``data`` starts with neither word.
    """
    if data.startswith("true"):
        return Token(TokenType.TRUE, "#true", True, line), 4
    if data.startswith("false"):
        return Token(TokenType.FALSE, "#false", False, line), 5
    raise LexError("couldn't parse boolean", line)


def get_symbol(data, line):
    """Lex a SYMBOL token running up to the next separator.

    Returns:
        ``(token, length)`` where ``length`` is one less than the number of
        characters consumed (the caller advances by one more).
    """
    counter = _scan_lexeme(data)
    return Token(TokenType.SYMBOL, data[:counter], None, line), counter - 1


def get_type(data, line):
    """Lex a type annotation (leading ``:`` included) from ``data``.

    Returns:
        ``(token, length)`` where ``length`` is one less than the number of
        characters consumed (the caller advances by one more).

    Raises:
        LexError: if the lexeme is not a key of ``types``.
    """
    counter = _scan_lexeme(data)
    value = data[:counter]
    if value not in types:
        raise LexError(f"unrecognized type {value}", line)
    return Token(types[value], value, None, line), counter - 1


def main(data):
    """Lex ``data`` and print one token per line.

    On a ``LexError`` the message is printed and the process exits with
    status 1.
    """
    try:
        tokens = lex(data)
    except LexError as error:
        print(error)
        sys.exit(1)

    for tok in tokens:
        print(f"{tok}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("need a file")
        sys.exit(1)

    with open(sys.argv[1], "r") as fil:
        data = fil.read()
    main(data)