from dataclasses import dataclass
from enum import Enum, auto
from typing import Any
import sys


class LexError(Exception):
    def __init__(self, message, line):
        super().__init__(f"line {line}: {message}")


class TokenType(Enum):
    PRINT = auto()
    OPEN_PAREN = auto()
    CLOSE_PAREN = auto()
    EOF = auto()
    # literals
    INT = auto()
    FLOAT = auto()
    STRING = auto()
    TRUE = auto()
    FALSE = auto()
    # arithmetic
    PLUS = auto()
    DASH = auto()
    STAR = auto()
    SLASH = auto()
    # strings
    DOUBLE_QUOTE = auto()
    # comparison
    GREATER = auto()
    GREATER_EQUAL = auto()
    LESS = auto()
    LESS_EQUAL = auto()
    EQUAL = auto()
    NOT = auto()
    AND = auto()
    OR = auto()
    # flow
    IF = auto()
    # keywords
    DEF = auto()
    LAMBDA = auto()
    # symbols
    SYMBOL = auto()


keywords = {
    "print": TokenType.PRINT,
    "+": TokenType.PLUS,
    "-": TokenType.DASH,
    "*": TokenType.STAR,
    "/": TokenType.SLASH,
    ">": TokenType.GREATER,
    ">=": TokenType.GREATER_EQUAL,
    "<": TokenType.LESS,
    "<=": TokenType.LESS_EQUAL,
    "eq?": TokenType.EQUAL,
    "not": TokenType.NOT,
    "and": TokenType.AND,
    "or": TokenType.OR,
    "if": TokenType.IF,
    "def": TokenType.DEF,
    "lambda": TokenType.LAMBDA,
}


@dataclass
class Token:
    type_: TokenType
    text: str
    value: Any
    line: int

    def __str__(self):
        return f"{self.type_.name} {self.text} {self.line}"


WHITESPACE = [" ", "\n", "\t"]
SEPARATORS = WHITESPACE + [")"]
DIGITS = list("0123456789")


def lex(data):
    # Single pass over the source. Each get_* helper returns (token, advance);
    # the unconditional `current += 1` at the bottom of the loop supplies the
    # final step past the lexeme.
    current = 0
    line = 1
    end = len(data)
    tokens = []
    while current < end:
        char = data[current]
        if char == "\n":
            line += 1
        if char in WHITESPACE:
            current += 1
            continue
        elif char == "(":
            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
        elif char == ")":
            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
        # numbers
        elif char in DIGITS or char == ".":
            tok, length = get_number(data[current:], line)
            tokens.append(tok)
            current += length
        # strings
        elif char == '"':
            tok, length = get_string(data[current + 1:], line)
            tokens.append(tok)
            current += length
        # bools
        elif char == "#":
            tok, length = get_bool(data[current + 1:], line)
            tokens.append(tok)
            current += length
        # symbols
        else:
            tok, length = get_symbol(data[current:], line)
            if tok.text in keywords:
                tok.type_ = keywords[tok.text]
            tokens.append(tok)
            current += length
        current += 1
    tokens.append(Token(TokenType.EOF, "", None, line))
    return tokens


def get_number(data, line):
    # Lex an int or float literal. The returned advance is one less than the
    # number of characters consumed; lex() adds the last step itself.
    counter = 0
    value = ""
    is_float = False
    char = data[counter]
    while char not in SEPARATORS:
        if char in DIGITS:
            value += char
        elif char == ".":
            if is_float:
                raise LexError("too many '.' in number", line)
            is_float = True
            value += char
        else:
            raise LexError(f"invalid number: {value}", line)
        counter += 1
        if counter >= len(data):
            break
        char = data[counter]
    if is_float:
        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
    else:
        return Token(TokenType.INT, value, int(value), line), counter - 1


def get_string(data, line):
    # data starts just after the opening quote; the returned advance covers the
    # string body and the closing quote, and lex()'s extra step covers the
    # opening quote.
    counter = 0
    string = ""
    while data[counter] != '"':
        string += data[counter]
        counter += 1
        if counter >= len(data):
            raise LexError("couldn't parse string", line)
    return Token(TokenType.STRING, string, string, line), counter + 1


def get_bool(data, line):
    # data starts just after the '#'; lex()'s extra step covers the '#'.
    if len(data) >= 4 and data[:4] == "true":
        return Token(TokenType.TRUE, "#true", True, line), 4
    elif len(data) >= 5 and data[:5] == "false":
        return Token(TokenType.FALSE, "#false", False, line), 5
    else:
        raise LexError("couldn't parse boolean", line)


def get_symbol(data, line):
    # Consume characters up to the next separator; same advance convention as
    # get_number.
    counter = 0
    value = ""
    while data[counter] not in SEPARATORS:
        value += data[counter]
        counter += 1
        if counter >= len(data):
            break
    return Token(TokenType.SYMBOL, value, None, line), counter - 1


def main(data):
    try:
        tokens = lex(data)
    except LexError as error:
        print(error)
        sys.exit()
    for tok in tokens:
        print(tok)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("need a file")
        sys.exit()
    with open(sys.argv[1], "r") as fil:
        data = fil.read()
    main(data)