author     mryouse    2022-05-18 03:07:23 +0000
committer  mryouse    2022-05-18 03:07:23 +0000
commit     5d1298d100888173ad98fbc5b9a0725ed78cd1a5 (patch)
tree       3755c61ea9b76f1589d8174e9a9555bf438a3b07
parent     8906deac1ad119f7b8b07eb476be500d7f454fee (diff)
rewrite of lexer, using a measured approach
-rw-r--r--    lexer.py    364
-rw-r--r--    repl.py      14
2 files changed, 201 insertions, 177 deletions
diff --git a/lexer.py b/lexer.py
index 976bce8..bc5a438 100644
--- a/lexer.py
+++ b/lexer.py
@@ -1,175 +1,205 @@
-from tokens import *
-
-# consts
-DOUBLE_QUOTE = '"'
-BACKSLASH = "\\"
-OPEN_PAREN = "("
-CLOSE_PAREN = ")"
-OPEN_BRACE = "["
-CLOSE_BRACE = "]"
-DIGITS = "0123456789"
-LETTERS = "abcdefghijklmnopqrstuvwxyz"
-PUNCTUATION = "|-_!*$@%^&=+/?<>~"
-SYMBOL_VALS = list(LETTERS + LETTERS.upper() + DIGITS + PUNCTUATION)
-
-
-def lex_string(inp):
-    token = ""
-    esc = False
-    for idx, c in enumerate(inp):
-        # if we're escaping a quote, don't add the \
-        if esc:
-            if c == DOUBLE_QUOTE:
-                token += DOUBLE_QUOTE
-            elif c == BACKSLASH:
-                token += BACKSLASH
-            else:
-                token += f"{BACKSLASH}{c}"
-
-        # if it's an escape char, set esc and continue
-        elif c == BACKSLASH:
-            esc = True
-            continue
-
-        elif c == DOUBLE_QUOTE:
-            #return token, inp[idx + 1:]
-            return NebString(token), inp[idx + 1:]
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Any
+import sys
-        else:
-            token += c
+# subclass Exception (not BaseException) so the repl's generic
+# `except Exception` handler can catch lex errors too
+class LexError(Exception):
-        esc = False
+    def __init__(self, message, line):
+        super().__init__(f"line {line}: {message}")
-    raise Exception("improperly ended string!")
+class TokenType(Enum):
-def lex_bool(inp):
-    if inp[0:4] == "true":
-        token = True
-    elif inp[0:5] == "false":
-        token = False
-    else:
-        raise Exception("invalid boolean")
-
-    if peek(inp[len(str(token)):]) not in (None, " ", CLOSE_PAREN, CLOSE_BRACE):
-        raise Exception("invalid boolean")
-
-    #return token, inp[len(str(token)):]
-    return NebBool(token), inp[len(str(token)):]
-
-
-def lex_number(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (" ", CLOSE_PAREN, CLOSE_BRACE):
-            if "." in token:
-                #return float(token), inp[idx:]
-                return NebFloat(float(token)), inp[idx:]
-            else:
-                #return int(token), inp[idx:]
-                return NebInt(int(token)), inp[idx:]
-
-        if c in list(DIGITS):  # or c in ("-", "."):
-            token += c
-        elif c == "+":
-            if idx == 0:
-                continue
-            else:
-                raise Exception("improper sign placement!")
-        elif c == "-":
-            if idx == 0:
-                token += c
-            else:
-                raise Exception("improper sign placement!")
-        elif c == ".":
-            if c not in token:
-                token += c
-            else:
-                raise Exception("too many decimal points")
-        else:
-            raise Exception("improper numeric!")
+    PRINT = auto()
+
+    OPEN_PAREN = auto()
+    CLOSE_PAREN = auto()
+
+    EOF = auto()
+
+    # literals
+    INT = auto()
+    FLOAT = auto()
+    STRING = auto()
+    TRUE = auto()
+    FALSE = auto()
+
+    # arithmetic
+    PLUS = auto()
+    DASH = auto()
+    STAR = auto()
+    SLASH = auto()
- if "." in token:
- #return float(token), ""
- return NebFloat(float(token)), ""
- else:
- #return int(token), ""
- return NebInt(int(token)), ""
-
-def lex_symbol(inp):
-    token = ""
-    for idx, c in enumerate(inp):
-        if c in (CLOSE_PAREN, CLOSE_BRACE, " "):
-            return NebSymbol(token), inp[idx:]
-        elif c in SYMBOL_VALS:
-            token += c
-        else:
-            raise Exception("improper symbol")
-    return NebSymbol(token), ""
-
-def peek(inp):
-    if len(inp) == 0:
-        return None
-    return inp[0]
-
-def lex(inp, tokens):
-    inp = inp.strip()  # white space doesn't matter at this point
-    nxt = peek(inp)
-    if nxt is None:
-        #print(f"returning [{tokens}]")
-        return tokens
-    # parens
-    if nxt == OPEN_PAREN:
-        tokens.append(NebOpen())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_PAREN:
-        tokens.append(NebClose())
-        return lex(inp[1:], tokens)
-    # braces
-    elif nxt == OPEN_BRACE:
-        tokens.append(NebListStart())
-        return lex(inp[1:], tokens)
-    elif nxt == CLOSE_BRACE:
-        tokens.append(NebListEnd())
-        return lex(inp[1:], tokens)
-    # numbers
-    elif nxt in list(DIGITS) or nxt in ("+", "-", "."):
-        # + and - are symbols, too
-        if nxt in ("+", "-"):
-            after = peek(inp[1:])
-            if after not in DIGITS:  # parse a symbol
-                token, remainder = lex_symbol(inp)
-                if peek(remainder) not in (None, CLOSE_PAREN, CLOSE_BRACE, " "):
-                    print(f"{peek(remainder)}")
-                    raise Exception("spaces required between tokens")
-                tokens.append(token)
-                return lex(remainder, tokens)
-        token, remainder = lex_number(inp)
-        tokens.append(token)
-        return lex(remainder, tokens)
    # strings
-    elif nxt == DOUBLE_QUOTE:
-        token, remainder = lex_string(inp[1:])
-        #print(f"received [{token}] [{remainder}]")
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
-    # bool
-    elif nxt == "#":
-        token, remainder = lex_bool(inp[1:])
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
+    DOUBLE_QUOTE = auto()
+
+    # comparison
+    GREATER = auto()
+    GREATER_EQUAL = auto()
+    LESS = auto()
+    LESS_EQUAL = auto()
+    EQUAL = auto()
+    NOT = auto()
+    AND = auto()
+    OR = auto()
+
+    # flow
+    IF = auto()
+
+    # keywords
+    DEF = auto()
+    LAMBDA = auto()
+
    # symbols
-    elif nxt in SYMBOL_VALS:
-        token, remainder = lex_symbol(inp)
-        if peek(remainder) not in (None, CLOSE_PAREN, " ", CLOSE_BRACE):
-            print(f"{peek(remainder)}")
-            raise Exception("spaces required between tokens")
-        tokens.append(token)
-        return lex(remainder, tokens)
+    SYMBOL = auto()
+
+keywords = {
+    "print": TokenType.PRINT,
+    "+": TokenType.PLUS,
+    "-": TokenType.DASH,
+    "*": TokenType.STAR,
+    "/": TokenType.SLASH,
+    ">": TokenType.GREATER,
+    ">=": TokenType.GREATER_EQUAL,
+    "<": TokenType.LESS,
+    "<=": TokenType.LESS_EQUAL,
+    "eq?": TokenType.EQUAL,
+    "not": TokenType.NOT,
+    "and": TokenType.AND,
+    "or": TokenType.OR,
+    "if": TokenType.IF,
+    "def": TokenType.DEF,
+    "lambda": TokenType.LAMBDA,
+}
+
+@dataclass
+class Token:
+    type_: TokenType
+    text: str
+    value: Any
+    line: int
+
+    def __str__(self):
+        return f"{self.type_.name} {self.text} {self.line}"
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]
+DIGITS = list("0123456789")
+
+def lex(data):
+    start = 0
+    current = 0
+    line = 1
+    end = len(data)
+
+    tokens = []
+    while current < end:
+        char = data[current]
+        if char == "\n":
+            line += 1
+        if char in WHITESPACE:
+            current += 1
+            continue
+        elif char == "(":
+            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+        elif char == ")":
+            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+        # numbers
+        elif char in DIGITS or char == ".":
+            tok, length = get_number(data[current:], line)
+            tokens.append(tok)
+            current += length
+        # strings
+        elif char == '"':
+            tok, length = get_string(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # bools
+        elif char == "#":
+            tok, length = get_bool(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # symbols
+        else:
+            tok, length = get_symbol(data[current:], line)
+            if tok.text in keywords:
+                tok.type_ = keywords[tok.text]
+            tokens.append(tok)
+            current += length
+
+        current += 1
+    tokens.append(Token(TokenType.EOF, "", None, line))
+    return tokens
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    char = data[counter]
+    while char not in SEPARATORS:
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise LexError(f"invalid number: {value}", line)
+        counter += 1
+        if counter >= len(data):
+            # the number runs to the end of the input; that's fine
+            break
+        char = data[counter]
+    if value == ".":
+        raise LexError("invalid number: .", line)
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    else:
+        return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    counter = 0
+    string = ""
+    # data starts just past the opening quote; scan to the closing one
+    while counter < len(data) and data[counter] != '"':
+        string += data[counter]
+        counter += 1
+    if counter >= len(data):
+        raise LexError("couldn't parse string", line)
+    return Token(TokenType.STRING, string, string, line), counter + 1
+
+def get_bool(data, line):
+    # data starts just past the '#'; require a separator (or end of
+    # input) after the literal so "#trueish" doesn't lex as a bool
+    if data[:4] == "true" and (len(data) == 4 or data[4] in SEPARATORS):
+        return Token(TokenType.TRUE, "true", True, line), 4
+    elif data[:5] == "false" and (len(data) == 5 or data[5] in SEPARATORS):
+        return Token(TokenType.FALSE, "false", False, line), 5
    else:
-        raise Exception("unable to lex")
+        raise LexError("couldn't parse boolean", line)
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    # counter now sits on the separator (or one past the end); return
+    # counter - 1 so the caller's final `current += 1` lands on the
+    # separator instead of skipping it (get_number does the same)
+    return Token(TokenType.SYMBOL, value, None, line), counter - 1
+
+
+def main(data):
+    try:
+        tokens = lex(data)
+    except LexError as error:
+        print(error)
+        sys.exit(1)
+    for tok in tokens:
+        print(f"{tok}")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("usage: lexer.py <file>")
+        sys.exit(1)
+    with open(sys.argv[1], "r") as fil:
+        data = fil.read()
+    main(data)
+
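For reference, a minimal sketch of driving the new lexer directly; the sample program is invented, and the expected output assumes Token.__str__ as defined above:

    from lexer import lex

    for tok in lex("(print (+ 1 2))"):
        print(tok)
    # OPEN_PAREN ( 1
    # PRINT print 1
    # OPEN_PAREN ( 1
    # PLUS + 1
    # INT 1 1
    # INT 2 1
    # CLOSE_PAREN ) 1
    # CLOSE_PAREN ) 1
    # EOF  1
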
diff --git a/repl.py b/repl.py
index b7da451..9310ffc 100644
--- a/repl.py
+++ b/repl.py
@@ -1,8 +1,7 @@
from lexer import lex
-from parser import parse
-from runner import evaluate
-from std import _get_debug
+def _get_debug():
+    return True

def main():
    print("### neb :)(:")
@@ -13,16 +12,11 @@ def main():
        if len(inp.strip()) == 0:
            continue
        try:
-            lexed = lex(inp, [])
+            lexed = lex(inp)
+            #lexed = lex(inp, [])
            if _get_debug():
                acc = " ".join([f"{l}" for l in lexed])
                print(f" - LEX: {acc}")
-            parsed = parse(lexed, [])
-            if _get_debug():
-                acc = " ".join([f"{p}" for p in parsed])
-                print(f" - PARSE: {acc}")
-            ev = evaluate(parsed, [])
-            print(f"=> {ev}")
            idx += 1
        except Exception as e:
            print(f"panic! {e}")