author     mryouse    2022-05-18 03:07:23 +0000
committer  mryouse    2022-05-18 03:07:23 +0000
commit     5d1298d100888173ad98fbc5b9a0725ed78cd1a5 (patch)
tree       3755c61ea9b76f1589d8174e9a9555bf438a3b07
parent     8906deac1ad119f7b8b07eb476be500d7f454fee (diff)
rewrite of lexer, using a measured approach
-rw-r--r--    lexer.py    364
-rw-r--r--    repl.py      14
2 files changed, 201 insertions, 177 deletions
diff --git a/lexer.py b/lexer.py
index 976bce8..bc5a438 100644
--- a/lexer.py
+++ b/lexer.py
@@ -1,175 +1,205 @@
-from tokens import *
-
-# consts
-DOUBLE_QUOTE = '"'
-BACKSLASH = "\\"
-OPEN_PAREN = "("
-CLOSE_PAREN = ")"
-OPEN_BRACE = "["
-CLOSE_BRACE = "]"
-DIGITS = "0123456789"
-LETTERS = "abcdefghijklmnopqrstuvwxyz"
-PUNCTUATION = "|-_!*$@%^&=+/?<>~"
-SYMBOL_VALS = list(LETTERS + LETTERS.upper() + DIGITS + PUNCTUATION)
-
-
-def lex_string(inp):
-    token = ""
-    esc = False
-    for idx, c in enumerate(inp):
-        # if we're escaping a quote, don't add the \
-        if esc:
-            if c == DOUBLE_QUOTE:
-                token += DOUBLE_QUOTE
-            elif c == BACKSLASH:
-                token += BACKSLASH
-            else:
-                token += f"{BACKSLASH}{c}"
-
-        # if it's an escape char, set esc and continue
-        elif c == BACKSLASH:
-            esc = True
-            continue
-
-        elif c == DOUBLE_QUOTE:
-            #return token, inp[idx + 1:]
-            return NebString(token), inp[idx + 1:]
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any
+import sys
-        else:
-            token += c
+# subclass Exception (not BaseException) so the repl's generic
+# `except Exception` handler can catch lex errors too
+class LexError(Exception):
-        esc = False
+    def __init__(self, message, line):
+        super().__init__(f"line {line}: {message}")
-    raise Exception("improperly ended string!")
+class TokenType(Enum):
-def lex_bool(inp):
-    if inp[0:4] == "true":
-        token = True
-    elif inp[0:5] == "false":
-        token = False
-    else:
-        raise Exception("invalid boolean")
-
-    if peek(inp[len(str(token)):]) not in (None, " ", CLOSE_PAREN, CLOSE_BRACE):
-        raise Exception("invalid boolean")
-
-    #return token, inp[len(str(token)):]
-    return NebBool(token), inp[len(str(token)):]
-
-
-def lex_number(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (" ", CLOSE_PAREN, CLOSE_BRACE):
-            if "." in token:
-                #return float(token), inp[idx:]
-                return NebFloat(float(token)), inp[idx:]
-            else:
-                #return int(token), inp[idx:]
-                return NebInt(int(token)), inp[idx:]
-
-        if c in list(DIGITS):  # or c in ("-", "."):
-            token += c
-        elif c == "+":
-            if idx == 0:
-                continue
-            else:
-                raise Exception("improper sign placement!")
-        elif c == "-":
-            if idx == 0:
-                token += c
-            else:
-                raise Exception("improper sign placement!")
-        elif c == ".":
-            if c not in token:
-                token += c
-            else:
-                raise Exception("too many decimal points")
-        else:
-            raise Exception("improper numeric!")
+    PRINT = auto()
+
+    OPEN_PAREN = auto()
+    CLOSE_PAREN = auto()
+
+    EOF = auto()
+
+    # literals
+    INT = auto()
+    FLOAT = auto()
+    STRING = auto()
+    TRUE = auto()
+    FALSE = auto()
+
+    # arithmetic
+    PLUS = auto()
+    DASH = auto()
+    STAR = auto()
+    SLASH = auto()
- if "." in token:
- #return float(token), ""
- return NebFloat(float(token)), ""
- else:
- #return int(token), ""
- return NebInt(int(token)), ""
-
-def lex_symbol(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (CLOSE_PAREN, CLOSE_BRACE, " "):
-            return NebSymbol(token), inp[idx:]
-        elif c in SYMBOL_VALS:
-            token += c
-        else:
-            raise Exception("improper symbol")
-    return NebSymbol(token), ""
-
-def peek(inp):
-    if len(inp) == 0:
-        return None
-    return inp[0]
-
-def lex(inp, tokens):
-    inp = inp.strip()  # white space doesn't matter at this point
-    nxt = peek(inp)
-    if nxt is None:
-        #print(f"returning [{tokens}]")
-        return tokens
-    # parens
-    if nxt == OPEN_PAREN:
-        tokens.append(NebOpen())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_PAREN:
-        tokens.append(NebClose())
-        return lex(inp[1:], tokens)
-    # braces
-    elif nxt == OPEN_BRACE:
-        tokens.append(NebListStart())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_BRACE:
-        tokens.append(NebListEnd())
-        return lex(inp[1:], tokens)
-    # numbers
-    elif nxt in list(DIGITS) or nxt in ("+", "-", "."):
-        # + and - are symbols, too
-        if nxt in ("+", "-"):
-            after = peek(inp[1:])
-            if after not in DIGITS:  # parse a symbol
-                token, remainder = lex_symbol(inp)
-                if peek(remainder) not in (None, CLOSE_PAREN, CLOSE_BRACE, " "):
-                    print(f"{peek(remainder)}")
-                    raise Exception("spaces required between tokens")
-                tokens.append(token)
-                return lex(remainder, tokens)
-        token, remainder = lex_number(inp)
-        tokens.append(token)
-        return lex(remainder, tokens)
    # strings
-    elif nxt == DOUBLE_QUOTE:
-        token, remainder = lex_string(inp[1:])
-        #print(f"received [{token}] [{remainder}]")
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
-    # bool
-    elif nxt == "#":
-        token, remainder = lex_bool(inp[1:])
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
+    DOUBLE_QUOTE = auto()
+
+    # comparison
+    GREATER = auto()
+    GREATER_EQUAL = auto()
+    LESS = auto()
+    LESS_EQUAL = auto()
+    EQUAL = auto()
+    NOT = auto()
+    AND = auto()
+    OR = auto()
+
+    # flow
+    IF = auto()
+
+    # keywords
+    DEF = auto()
+    LAMBDA = auto()
+
    # symbols
-    elif nxt in SYMBOL_VALS:
-        token, remainder = lex_symbol(inp)
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
+    SYMBOL = auto()
+
+keywords = {
+    "print": TokenType.PRINT,
+    "+": TokenType.PLUS,
+    "-": TokenType.DASH,
+    "*": TokenType.STAR,
+    "/": TokenType.SLASH,
+    ">": TokenType.GREATER,
+    ">=": TokenType.GREATER_EQUAL,
+    "<": TokenType.LESS,
+    "<=": TokenType.LESS_EQUAL,
+    "eq?": TokenType.EQUAL,
+    "not": TokenType.NOT,
+    "and": TokenType.AND,
+    "or": TokenType.OR,
+    "if": TokenType.IF,
+    "def": TokenType.DEF,
+    "lambda": TokenType.LAMBDA,
+}
+
+@dataclass
+class Token:
+    type_: TokenType
+    text: str
+    value: Any
+    line: int
+
+    def __str__(self):
+        return f"{self.type_.name} {self.text} {self.line}"
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]
+DIGITS = list("0123456789")
+
+def lex(data):
+    start = 0
+    current = 0
+    line = 1
+    end = len(data)
+
+    tokens = []
+    while current < end:
+        char = data[current]
+        if char == "\n":
+            line += 1
+        if char in WHITESPACE:
+            current += 1
+            continue
+        elif char == "(":
+            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+        elif char == ")":
+            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+        # numbers
+        elif char in DIGITS or char == ".":
+            tok, length = get_number(data[current:], line)
+            tokens.append(tok)
+            current += length
+        # strings
+        elif char == '"':
+            tok, length = get_string(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # bools
+        elif char == "#":
+            tok, length = get_bool(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # symbols
+        else:
+            tok, length = get_symbol(data[current:], line)
+            if tok.text in keywords:
+                tok.type_ = keywords[tok.text]
+            tokens.append(tok)
+            current += length
+
+        current += 1
+    tokens.append(Token(TokenType.EOF, "", None, line))
+    return tokens
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    char = data[counter]
+    while char not in SEPARATORS:
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise LexError(f"invalid number: {value}", line)
+        counter += 1
+        if counter >= len(data):
+            # the number runs to the end of the input; that's fine
+            break
+        char = data[counter]
+    if value == ".":
+        raise LexError("invalid number: .", line)
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    else:
+        return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    counter = 0
+    string = ""
+    # data starts just past the opening quote; scan to the closing one
+    while counter < len(data) and data[counter] != '"':
+        string += data[counter]
+        counter += 1
+    if counter >= len(data):
+        raise LexError("couldn't parse string", line)
+    return Token(TokenType.STRING, string, string, line), counter + 1
+
+def get_bool(data, line):
+    # data starts just past the '#'; require a separator (or end of
+    # input) after the literal so "#trueish" doesn't lex as a bool
+    if data[:4] == "true" and (len(data) == 4 or data[4] in SEPARATORS):
+        return Token(TokenType.TRUE, "true", True, line), 4
+    elif data[:5] == "false" and (len(data) == 5 or data[5] in SEPARATORS):
+        return Token(TokenType.FALSE, "false", False, line), 5
    else:
-        raise Exception("unable to lex")
+        raise LexError("couldn't parse boolean", line)
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    # counter now sits on the separator (or one past the end); return
+    # counter - 1 so the caller's final `current += 1` lands on the
+    # separator instead of skipping it (get_number does the same)
+    return Token(TokenType.SYMBOL, value, None, line), counter - 1
+
+
+def main(data):
+    try:
+        tokens = lex(data)
+    except LexError as error:
+        print(error)
+        sys.exit(1)
+    for tok in tokens:
+        print(f"{tok}")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("usage: lexer.py <file>")
+        sys.exit(1)
+    with open(sys.argv[1], "r") as fil:
+        data = fil.read()
+    main(data)
+
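For reference, a minimal sketch of driving the new lexer directly; the sample program is invented, and the expected output assumes Token.__str__ as defined above:

    from lexer import lex

    for tok in lex("(print (+ 1 2))"):
        print(tok)
    # OPEN_PAREN ( 1
    # PRINT print 1
    # OPEN_PAREN ( 1
    # PLUS + 1
    # INT 1 1
    # INT 2 1
    # CLOSE_PAREN ) 1
    # CLOSE_PAREN ) 1
    # EOF  1
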
diff --git a/repl.py b/repl.py
index b7da451..9310ffc 100644
--- a/repl.py
+++ b/repl.py
@@ -1,8 +1,7 @@
from lexer import lex
-from parser import parse
-from runner import evaluate
-from std import _get_debug
+def _get_debug():
+    return True

def main():
    print("### neb :)(:")
@@ -13,16 +12,11 @@ def main():
        if len(inp.strip()) == 0:
            continue
        try:
-            lexed = lex(inp, [])
+            lexed = lex(inp)
+            #lexed = lex(inp, [])
            if _get_debug():
                acc = " ".join([f"{l}" for l in lexed])
                print(f" - LEX: {acc}")
-            parsed = parse(lexed, [])
-            if _get_debug():
-                acc = " ".join([f"{p}" for p in parsed])
-                print(f" - PARSE: {acc}")
-            ev = evaluate(parsed, [])
-            print(f"=> {ev}")
            idx += 1
        except Exception as e:
            print(f"panic! {e}")