author    mryouse 2022-06-18 02:45:04 +0000
committer mryouse 2022-06-18 02:45:04 +0000
commit    d1a1c1592e610526c4a0432f93bd9ea6ae96d6e8 (patch)
tree      df4c78f3ce5dfb1369d5fc6c155ca43e8bfc729f /neb/lexer.py
parent    065d138ca3013a4d1ef1aa3d7c48982d8bee5de2 (diff)

refactor: make neb a module

Diffstat (limited to 'neb/lexer.py'):
 -rw-r--r--  neb/lexer.py  168
 1 file changed, 168 insertions, 0 deletions
diff --git a/neb/lexer.py b/neb/lexer.py
new file mode 100644
index 0000000..b522460
--- /dev/null
+++ b/neb/lexer.py
@@ -0,0 +1,168 @@
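+"""Lexer for neb: scans source text into a flat list of Tokens."""
+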
+from .structs import TokenType, Token
+from .exceptions import LexError
+
+
+types = {
+ ":int": TokenType.INT_TYPE,
+ ":float": TokenType.FLOAT_TYPE,
+ ":number": TokenType.NUMBER_TYPE,
+ ":string": TokenType.STRING_TYPE,
+ ":list": TokenType.LIST_TYPE,
+ ":any": TokenType.ANY_TYPE,
+ ":literal": TokenType.LITERAL_TYPE,
+ ":bool": TokenType.BOOL_TYPE }
+
+keywords = {
+ "if": TokenType.IF,
+ "for-count": TokenType.FOR_COUNT,
+ "def": TokenType.DEF,
+ "lambda": TokenType.LAMBDA,
+ "&": TokenType.MANY,
+ "func": TokenType.FUNC }
+
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]  # a closing paren also ends a token
+DIGITS = list("0123456789")
+
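+# Helper convention: each get_* function below returns (token, length);
+# lex() advances `current` by `length`, then by one more via the
+# unconditional `current += 1` at the bottom of its loop, landing just
+# past the token. get_string also returns a count of newlines consumed
+# so lex() can keep `line` accurate.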
+def lex(data):
+ start = 0
+ current = 0
+ line = 1
+ end = len(data)
+
+ tokens = []
+ while current < end:
+ char = data[current]
+        if char == ";":
+            # comment: skip to end of line, stopping safely at end of input
+            while current < end and data[current] != "\n":
+                current += 1
+            continue
+ if char == "\n":
+ line += 1
+ if char in WHITESPACE:
+ current += 1
+ continue
+ elif char == "(":
+ tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+ elif char == ")":
+ tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+ # numbers
+ elif char in DIGITS or char == ".":
+ tok, length = get_number(data[current:], line)
+ tokens.append(tok)
+ current += length
+ # strings
+ elif char == '"':
+ tok, length, offset = get_string(data[current+1:], line)
+ tokens.append(tok)
+ current += length
+ line += offset
+ # bools
+ elif char == "#":
+ tok, length = get_bool(data[current+1:], line)
+ tokens.append(tok)
+ current += length
+            # types
+ elif char == ":":
+ tok, length = get_type(data[current:], line) # include :
+ tokens.append(tok)
+ current += length
+ # symbols
+ else:
+ tok, length = get_symbol(data[current:], line)
+ if tok.text in keywords:
+ tok.type_ = keywords[tok.text]
+ tokens.append(tok)
+ current += length
+
+ current += 1
+ tokens.append(Token(TokenType.EOF, "", None, line))
+ return tokens
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        char = data[counter]
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise LexError(f"invalid number: {value}", line)
+        counter += 1
+    if value == ".":
+        raise LexError("invalid number: '.'", line)
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    offset = 0  # newlines seen inside the string, so lex() can update `line`
+    counter = 0
+    string = ""
+    while counter < len(data) and data[counter] != '"':
+        if data[counter] == "\n":
+            offset += 1
+        # look ahead to see if it's an escaped double quote
+        if data[counter] == "\\" and \
+           counter + 1 < len(data) and \
+           data[counter+1] == '"':
+            string += '"'
+            counter += 1
+        else:
+            string += data[counter]
+        counter += 1
+    if counter >= len(data):
+        raise LexError("unterminated string", line)
+    string = string.encode().decode("unicode_escape")
+    return Token(TokenType.STRING, string, string, line), counter + 1, offset
+
+def get_bool(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    # counter == len(value), so return it instead of a hardcoded 4 or 5
+    if value == "true":
+        return Token(TokenType.TRUE, "#true", True, line), counter
+    elif value == "false":
+        return Token(TokenType.FALSE, "#false", False, line), counter
+    else:
+        raise LexError("couldn't parse boolean", line)
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    return Token(TokenType.SYMBOL, value, None, line), counter - 1
+
+def get_type(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    if value not in types:
+        raise LexError(f"unrecognized type {value}", line)
+    return Token(types[value], value, None, line), counter - 1
+
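
A minimal usage sketch, assuming the module layout above makes this file
importable as neb.lexer (the `type_` and `text` attribute names appear in
the diff; reading a `line` attribute is an assumption based on the Token
constructor's final argument):

    from neb.lexer import lex

    # "(def x :int 42)" lexes to OPEN_PAREN, DEF, SYMBOL,
    # INT_TYPE, INT, CLOSE_PAREN, EOF
    for tok in lex('(def x :int 42)'):
        print(tok.type_, repr(tok.text))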