-rw-r--r--   lexer.py   364
-rw-r--r--   repl.py     14
2 files changed, 201 insertions, 177 deletions
diff --git a/lexer.py b/lexer.py
--- a/lexer.py
+++ b/lexer.py
@@ -1,175 +1,205 @@
-from tokens import *
-
-# consts
-DOUBLE_QUOTE = '"'
-BACKSLASH = "\\"
-OPEN_PAREN = "("
-CLOSE_PAREN = ")"
-OPEN_BRACE = "["
-CLOSE_BRACE = "]"
-DIGITS = "0123456789"
-LETTERS = "abcdefghijklmnopqrstuvwxyz"
-PUNCTUATION = "|-_!*$@%^&=+/?<>~"
-SYMBOL_VALS = list(LETTERS + LETTERS.upper() + DIGITS + PUNCTUATION)
-
-
-def lex_string(inp):
-    token = ""
-    esc = False
-    for idx, c in enumerate(inp):
-        # if we're escaping a quote, don't add the \
-        if esc:
-            if c == DOUBLE_QUOTE:
-                token += DOUBLE_QUOTE
-            elif c == BACKSLASH:
-                token += BACKSLASH
-            else:
-                token += f"{BACKSLASH}{c}"
-
-        # if it's an ecsape char, set esc and continue
-        elif c == BACKSLASH:
-            esc = True
-            continue
-
-        elif c == DOUBLE_QUOTE:
-            #return token, inp[idx + 1:]
-            return NebString(token), inp[idx + 1:]
-
-        else:
-            token += c
-
-        esc = False
-
-    raise Exception("improperly ended string!")
-
-
-def lex_bool(inp):
-    if inp[0:4] == "true":
-        token = True
-    elif inp[0:5] == "false":
-        token = False
-    else:
-        raise Exception("invalid boolean")
-
-    if peek(inp[len(str(token)):]) not in (None, " ", CLOSE_PAREN, CLOSE_BRACE):
-        raise Exception("invalid boolean")
-
-    #return token, inp[len(str(token)):]
-    return NebBool(token), inp[len(str(token)):]
-
-
-def lex_number(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (" ", CLOSE_PAREN, CLOSE_BRACE):
-            if "." in token:
-                #return float(token), inp[idx:]
-                return NebFloat(float(token)), inp[idx:]
-            else:
-                #return int(token), inp[idx:]
-                return NebInt(int(token)), inp[idx:]
-
-        if c in list(DIGITS):  # or c in ("-", "."):
-            token += c
-        elif c == "+":
-            if idx == 0:
-                continue
-            else:
-                raise Exception("improper sign placement!")
-        elif c == "-":
-            if idx == 0:
-                token += c
-            else:
-                raise Exception("improper sign placement!")
-        elif c == ".":
-            if c not in token:
-                token += c
-            else:
-                raise Exception("too many decimal points")
-        else:
-            raise Exception("improper numeric!")
-
-    if "." in token:
-        #return float(token), ""
-        return NebFloat(float(token)), ""
-    else:
-        #return int(token), ""
-        return NebInt(int(token)), ""
-
-def lex_symbol(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (CLOSE_PAREN, CLOSE_BRACE, " "):
-            return NebSymbol(token), inp[idx:]
-        elif c in SYMBOL_VALS:
-            token += c
-        else:
-            raise Exception("improper symbol")
-    return NebSymbol(token), ""
-
-def peek(inp):
-    if len(inp) == 0:
-        return None
-    return inp[0]
-
-def lex(inp, tokens):
-    inp = inp.strip()  # white space doesn't matter at this point
-    nxt = peek(inp)
-    if nxt is None:
-        #print(f"returning [{tokens}]")
-        return tokens
-    # parens
-    if nxt == OPEN_PAREN:
-        tokens.append(NebOpen())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_PAREN:
-        tokens.append(NebClose())
-        return lex(inp[1:], tokens)
-    # braces
-    elif nxt == OPEN_BRACE:
-        tokens.append(NebListStart())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_BRACE:
-        tokens.append(NebListEnd())
-        return lex(inp[1:], tokens)
-    # numbers
-    elif nxt in list(DIGITS) or nxt in ("+", "-", "."):
-        # + and - are symbols, too
-        if nxt in ("+", "-"):
-            after = peek(inp[1:])
-            if after not in DIGITS:  # parse a symbol
-                token, remainder = lex_symbol(inp)
-                if peek(remainder) not in (None, CLOSE_PAREN, CLOSE_BRACE, " "):
-                    print(f"{peek(remainder)}")
-                    raise Exception("spaces required between tokens")
-                tokens.append(token)
-                return lex(remainder, tokens)
-        token, remainder = lex_number(inp)
-        tokens.append(token)
-        return lex(remainder, tokens)
-    # strings
-    elif nxt == DOUBLE_QUOTE:
-        token, remainder = lex_string(inp[1:])
-        #print(f"received [{token}] [{remainder}]")
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
-    # bool
-    elif nxt == "#":
-        token, remainder = lex_bool(inp[1:])
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
-    # symbols
-    elif nxt in SYMBOL_VALS:
-        token, remainder = lex_symbol(inp)
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
-    else:
-        raise Exception("unable to lex")
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any
+import sys
+
+
+class LexError(BaseException):
+
+    def __init__(self, message, line):
+        super().__init__(f"line {line}: {message}")
+
+
+class TokenType(Enum):
+
+    PRINT = auto()
+
+    OPEN_PAREN = auto()
+    CLOSE_PAREN = auto()
+
+    EOF = auto()
+
+    # literals
+    INT = auto()
+    FLOAT = auto()
+    STRING = auto()
+    TRUE = auto()
+    FALSE = auto()
+
+    # arithmetic
+    PLUS = auto()
+    DASH = auto()
+    STAR = auto()
+    SLASH = auto()
+
+    DOUBLE_QUOTE = auto()
+
+    # comparison
+    GREATER = auto()
+    GREATER_EQUAL = auto()
+    LESS = auto()
+    LESS_EQUAL = auto()
+    EQUAL = auto()
+    NOT = auto()
+    AND = auto()
+    OR = auto()
+
+    # flow
+    IF = auto()
+
+    # keywords
+    DEF = auto()
+    LAMBDA = auto()
+
+    # symbols
+    SYMBOL = auto()
+
+
+keywords = {
+    "print": TokenType.PRINT,
+    "+": TokenType.PLUS,
+    "-": TokenType.DASH,
+    "*": TokenType.STAR,
+    "/": TokenType.SLASH,
+    ">": TokenType.GREATER,
+    ">=": TokenType.GREATER_EQUAL,
+    "<": TokenType.LESS,
+    "<=": TokenType.LESS_EQUAL,
+    "eq?": TokenType.EQUAL,
+    "not": TokenType.NOT,
+    "and": TokenType.AND,
+    "or": TokenType.OR,
+    "if": TokenType.IF,
+    "def": TokenType.DEF,
+    "lambda": TokenType.LAMBDA
+}
+
+
+@dataclass
+class Token:
+    type_: TokenType
+    text: str
+    value: Any
+    line: int
+
+    def __str__(self):
+        return f"{self.type_.name} {self.text} {self.line}"
+
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]
+DIGITS = list("0123456789")
+
+
+def lex(data):
+    start = 0
+    current = 0
+    line = 1
+    end = len(data)
+
+    tokens = []
+    while current < end:
+        char = data[current]
+        if char == "\n":
+            line += 1
+        if char in WHITESPACE:
+            current += 1
+            continue
+        elif char == "(":
+            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+        elif char == ")":
+            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+        # numbers
+        elif char in DIGITS or char == ".":
+            tok, length = get_number(data[current:], line)
+            tokens.append(tok)
+            current += length
+        # strings
+        elif char == '"':
+            tok, length = get_string(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # bools
+        elif char == "#":
+            tok, length = get_bool(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # symbols
+        else:
+            tok, length = get_symbol(data[current:], line)
+            if tok.text in keywords:
+                tok.type_ = keywords[tok.text]
+            tokens.append(tok)
+            current += length
+
+        current += 1
+    tokens.append(Token(TokenType.EOF, "", None, line))
+    return tokens
+
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    char = data[counter]
+    while char not in SEPARATORS:
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise Exception(f"invalid number: {value}")
+        counter += 1
+        if counter >= len(data):
+            raise Exception("couldn't parse number")
+        char = data[counter]
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    else:
+        return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    counter = 0
+    string = ""
+    while data[counter] != '"':
+        string += data[counter]
+        counter += 1
+        if counter >= len(data):
+            raise Exception("couldn't parse string")
+    return Token(TokenType.STRING, string, string, line), counter + 1
+
+
+def get_bool(data, line):
+    if len(data) >= 4 and data[:4] == "true":
+        return Token(TokenType.TRUE, "true", True, line), 4
+    elif len(data) >= 5 and data[:5] == "false":
+        return Token(TokenType.FALSE, "false", False, line), 5
+    else:
+        raise Exception("couldn't parse boolean")
+
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    return Token(TokenType.SYMBOL, value, None, line), counter
+
+
+def main(data):
+    try:
+        tokens = lex(data)
+    except LexError as error:
+        print(error)
+        sys.exit()
+    for tok in tokens:
+        print(f"{tok}")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("need a file")
+        sys.exit()
+    with open(sys.argv[1], "r") as fil:
+        data = fil.read()
+    main(data)
diff --git a/repl.py b/repl.py
--- a/repl.py
+++ b/repl.py
@@ -1,8 +1,7 @@
 from lexer import lex
-from parser import parse
-from runner import evaluate
-from std import _get_debug
+def _get_debug():
+    return True
 
 def main():
     print("### neb :)(:")
@@ -13,16 +12,11 @@ def main():
         if len(inp.strip()) == 0:
             continue
         try:
-            lexed = lex(inp, [])
+            lexed = lex(inp)
+            #lexed = lex(inp, [])
             if _get_debug():
                 acc = " ".join([f"{l}" for l in lexed])
                 print(f" - LEX: {acc}")
-            parsed = parse(lexed, [])
-            if _get_debug():
-                acc = " ".join([f"{p}" for p in parsed])
-                print(f" - PARSE: {acc}")
-            ev = evaluate(parsed, [])
-            print(f"=> {ev}")
             idx += 1
         except Exception as e:
             print(f"panic! {e}")
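Quick sanity check on the new lexer (an illustrative example, not part of the commit; add.neb is a hypothetical file name): given add.neb containing (+ 1 2) followed by a trailing newline, running

    python lexer.py add.neb

should print one token per line via Token.__str__, formatted as "TYPE text line":

    OPEN_PAREN ( 1
    PLUS + 1
    INT 1 1
    INT 2 1
    CLOSE_PAREN ) 1
    EOF  2

Two details worth noting from tracing lex(): the + is first read by get_symbol as a SYMBOL token and then remapped to PLUS through the keywords table, and the EOF token reports line 2 because the trailing newline increments the line counter before EOF is appended.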
