author    mryouse 2022-06-18 02:45:04 +0000
committer mryouse 2022-06-18 02:45:04 +0000
commit    d1a1c1592e610526c4a0432f93bd9ea6ae96d6e8 (patch)
tree      df4c78f3ce5dfb1369d5fc6c155ca43e8bfc729f /lexer.py
parent    065d138ca3013a4d1ef1aa3d7c48982d8bee5de2 (diff)
refactor: make neb a module
Diffstat (limited to 'lexer.py')
-rw-r--r--  lexer.py  168
1 file changed, 0 insertions(+), 168 deletions(-)
diff --git a/lexer.py b/lexer.py
deleted file mode 100644
index fa3db90..0000000
--- a/lexer.py
+++ /dev/null
@@ -1,168 +0,0 @@
-from structs import TokenType, Token
-from exceptions import LexError
-
-
-types = {
- ":int": TokenType.INT_TYPE,
- ":float": TokenType.FLOAT_TYPE,
- ":number": TokenType.NUMBER_TYPE,
- ":string": TokenType.STRING_TYPE,
- ":list": TokenType.LIST_TYPE,
- ":any": TokenType.ANY_TYPE,
- ":literal": TokenType.LITERAL_TYPE,
- ":bool": TokenType.BOOL_TYPE }
-
-keywords = {
- "if": TokenType.IF,
- "for-count": TokenType.FOR_COUNT,
- "def": TokenType.DEF,
- "lambda": TokenType.LAMBDA,
- "&": TokenType.MANY,
- "func": TokenType.FUNC }
-
-
-WHITESPACE = [" ", "\n", "\t"]
-SEPARATORS = WHITESPACE + [")"]
-DIGITS = list("0123456789")
-
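-# Main entry point: scan `data` one character at a time, dispatching to
-# the get_* helpers below; each returns a token plus how many extra
-# characters it consumed (the loop adds one more on every iteration).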
-def lex(data):
- current = 0
- line = 1
- end = len(data)
-
- tokens = []
- while current < end:
- char = data[current]
-        if char == ";":
-            while current < end and data[current] != "\n":
-                current += 1
-            continue
- if char == "\n":
- line += 1
- if char in WHITESPACE:
- current += 1
- continue
- elif char == "(":
- tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
- elif char == ")":
- tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
- # numbers
- elif char in DIGITS or char == ".":
- tok, length = get_number(data[current:], line)
- tokens.append(tok)
- current += length
- # strings
- elif char == '"':
- tok, length, offset = get_string(data[current+1:], line)
- tokens.append(tok)
- current += length
- line += offset
- # bools
- elif char == "#":
- tok, length = get_bool(data[current+1:], line)
- tokens.append(tok)
- current += length
-        # types
- elif char == ":":
- tok, length = get_type(data[current:], line) # include :
- tokens.append(tok)
- current += length
- # symbols
- else:
- tok, length = get_symbol(data[current:], line)
- if tok.text in keywords:
- tok.type_ = keywords[tok.text]
- tokens.append(tok)
- current += length
-
- current += 1
- tokens.append(Token(TokenType.EOF, "", None, line))
- return tokens
-
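-# Consume an int or float literal. Stops at the first separator and
-# rejects a second '.'.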
-def get_number(data, line):
- counter = 0
- value = ""
- is_float = False
- char = data[counter]
- while char not in SEPARATORS:
- if char in DIGITS:
- value += char
- elif char == ".":
- if is_float:
- raise LexError("too many '.' in number", line)
- is_float = True
- value += char
-        else:
-            raise LexError(f"invalid number: {value}", line)
- counter += 1
- if counter >= len(data):
- break
- char = data[counter]
-    if value == ".":
-        raise LexError("invalid number: '.'", line)
-    if is_float:
- return Token(TokenType.FLOAT, value, float(value), line), counter - 1
- else:
- return Token(TokenType.INT, value, int(value), line), counter - 1
-
-
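-# Called with the text just past the opening quote. Handles escaped
-# quotes, applies backslash escapes, and reports how many newlines the
-# string spans so the caller can advance its line counter.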
-def get_string(data, line):
- offset = 0
- counter = 0
- string = ""
-    while counter < len(data) and data[counter] != '"':
-        if data[counter] == "\n":
-            offset += 1
-
-        # look ahead to see if it's an escaped double quote
-        if data[counter] == "\\" and \
-           len(data) > counter + 1 and \
-           data[counter+1] == '"':
-            string += '"'
-            counter += 1
-        else:
-            string += data[counter]
-        counter += 1
-    if counter >= len(data):
-        raise LexError("unterminated string", line)
-    string = string.encode().decode("unicode_escape")
-    return Token(TokenType.STRING, string, string, line), counter + 1, offset
-
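-# Called with the text just past the leading '#'; only the literals
-# #true and #false are accepted.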
-def get_bool(data, line):
-    counter = 0
-    value = ""
-    while counter < len(data) and data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-    if value == "true":
-        return Token(TokenType.TRUE, "#true", True, line), counter
-    elif value == "false":
-        return Token(TokenType.FALSE, "#false", False, line), counter
-    else:
-        raise LexError("couldn't parse boolean", line)
-
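-# Fallback for anything not claimed by another branch; lex() later
-# upgrades symbols whose text matches a keyword.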
-def get_symbol(data, line):
-    counter = 0
-    value = ""
-    while counter < len(data) and data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-    return Token(TokenType.SYMBOL, value, None, line), counter - 1
-
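-# Type annotations keep their leading ':' and must appear verbatim in
-# the `types` table above.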
-def get_type(data, line):
-    counter = 0
-    value = ""
-    while counter < len(data) and data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-    if value not in types:
-        raise LexError(f"unrecognized type {value}", line)
-    return Token(types[value], value, None, line), counter - 1
-
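The commit message above says this file was folded into a neb module rather than dropped, so the lexer itself survives the deletion. A minimal usage sketch, assuming the package ends up exposing it as neb.lexer (the actual layout is not shown in this diff); the .type_ and .text fields are the ones lex() itself reads:

    # Hypothetical post-refactor usage; `neb.lexer` is an assumed module
    # path, inferred only from the message "refactor: make neb a module".
    from neb.lexer import lex

    # Tokenize a small program in the syntax this lexer accepts:
    # parens, keywords (def, lambda), type annotations (:int), literals.
    source = '(def add :int (lambda (x :int y :int) 42))'
    for tok in lex(source):
        print(tok.type_, repr(tok.text))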