author    mryouse 2022-06-18 02:45:04 +0000
committer mryouse 2022-06-18 02:45:04 +0000
commit    d1a1c1592e610526c4a0432f93bd9ea6ae96d6e8 (patch)
tree      df4c78f3ce5dfb1369d5fc6c155ca43e8bfc729f /neb/lexer.py
parent    065d138ca3013a4d1ef1aa3d7c48982d8bee5de2 (diff)

refactor: make neb a module

Diffstat (limited to 'neb/lexer.py'):
 -rw-r--r--  neb/lexer.py  168
 1 file changed, 168 insertions, 0 deletions
diff --git a/neb/lexer.py b/neb/lexer.py
new file mode 100644
index 0000000..b522460
--- /dev/null
+++ b/neb/lexer.py
@@ -0,0 +1,168 @@
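+"""Lexer for neb: scans source text into a flat list of Tokens."""
+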
+from .structs import TokenType, Token
+from .exceptions import LexError
+
+
+types = {
+ ":int": TokenType.INT_TYPE,
+ ":float": TokenType.FLOAT_TYPE,
+ ":number": TokenType.NUMBER_TYPE,
+ ":string": TokenType.STRING_TYPE,
+ ":list": TokenType.LIST_TYPE,
+ ":any": TokenType.ANY_TYPE,
+ ":literal": TokenType.LITERAL_TYPE,
+ ":bool": TokenType.BOOL_TYPE }
+
+keywords = {
+ "if": TokenType.IF,
+ "for-count": TokenType.FOR_COUNT,
+ "def": TokenType.DEF,
+ "lambda": TokenType.LAMBDA,
+ "&": TokenType.MANY,
+ "func": TokenType.FUNC }
+
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]  # a closing paren also ends a token
+DIGITS = list("0123456789")
+
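+# Helper convention: each get_* function below returns (token, length);
+# lex() advances `current` by `length`, then by one more via the
+# unconditional `current += 1` at the bottom of its loop, landing just
+# past the token. get_string also returns a count of newlines consumed
+# so lex() can keep `line` accurate.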
+def lex(data):
+ start = 0
+ current = 0
+ line = 1
+ end = len(data)
+
+ tokens = []
+ while current < end:
+ char = data[current]
+        if char == ";":
+            # comment: skip to end of line, stopping safely at end of input
+            while current < end and data[current] != "\n":
+                current += 1
+            continue
+ if char == "\n":
+ line += 1
+ if char in WHITESPACE:
+ current += 1
+ continue
+ elif char == "(":
+ tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+ elif char == ")":
+ tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+ # numbers
+ elif char in DIGITS or char == ".":
+ tok, length = get_number(data[current:], line)
+ tokens.append(tok)
+ current += length
+ # strings
+ elif char == '"':
+ tok, length, offset = get_string(data[current+1:], line)
+ tokens.append(tok)
+ current += length
+ line += offset
+ # bools
+ elif char == "#":
+ tok, length = get_bool(data[current+1:], line)
+ tokens.append(tok)
+ current += length
+            # types
+ elif char == ":":
+ tok, length = get_type(data[current:], line) # include :
+ tokens.append(tok)
+ current += length
+ # symbols
+ else:
+ tok, length = get_symbol(data[current:], line)
+ if tok.text in keywords:
+ tok.type_ = keywords[tok.text]
+ tokens.append(tok)
+ current += length
+
+ current += 1
+ tokens.append(Token(TokenType.EOF, "", None, line))
+ return tokens
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        char = data[counter]
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise LexError(f"invalid number: {value}", line)
+        counter += 1
+    if value == ".":
+        raise LexError("invalid number: '.'", line)
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    offset = 0  # newlines seen inside the string, so lex() can update `line`
+    counter = 0
+    string = ""
+    while counter < len(data) and data[counter] != '"':
+        if data[counter] == "\n":
+            offset += 1
+        # look ahead to see if it's an escaped double quote
+        if data[counter] == "\\" and \
+           counter + 1 < len(data) and \
+           data[counter+1] == '"':
+            string += '"'
+            counter += 1
+        else:
+            string += data[counter]
+        counter += 1
+    if counter >= len(data):
+        raise LexError("unterminated string", line)
+    string = string.encode().decode("unicode_escape")
+    return Token(TokenType.STRING, string, string, line), counter + 1, offset
+
+def get_bool(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    # counter == len(value), so return it instead of a hardcoded 4 or 5
+    if value == "true":
+        return Token(TokenType.TRUE, "#true", True, line), counter
+    elif value == "false":
+        return Token(TokenType.FALSE, "#false", False, line), counter
+    else:
+        raise LexError("couldn't parse boolean", line)
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    return Token(TokenType.SYMBOL, value, None, line), counter - 1
+
+def get_type(data, line):
+    counter = 0
+    value = ""
+    while counter < len(data) and data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+    if value not in types:
+        raise LexError(f"unrecognized type {value}", line)
+    return Token(types[value], value, None, line), counter - 1
+
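
A minimal usage sketch, assuming the module layout above makes this file
importable as neb.lexer (the `type_` and `text` attribute names appear in
the diff; reading a `line` attribute is an assumption based on the Token
constructor's final argument):

    from neb.lexer import lex

    # "(def x :int 42)" lexes to OPEN_PAREN, DEF, SYMBOL,
    # INT_TYPE, INT, CLOSE_PAREN, EOF
    for tok in lex('(def x :int 42)'):
        print(tok.type_, repr(tok.text))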