| author | mryouse | 2022-06-18 02:45:04 +0000 |
|---|---|---|
| committer | mryouse | 2022-06-18 02:45:04 +0000 |
| commit | d1a1c1592e610526c4a0432f93bd9ea6ae96d6e8 | |
| tree | df4c78f3ce5dfb1369d5fc6c155ca43e8bfc729f | |
| parent | 065d138ca3013a4d1ef1aa3d7c48982d8bee5de2 | |
refactor: make neb a module
Diffstat (limited to 'lexer.py')
| -rw-r--r-- | lexer.py | 168 |
1 file changed, 0 insertions, 168 deletions
diff --git a/lexer.py b/lexer.py
deleted file mode 100644
index fa3db90..0000000
--- a/lexer.py
+++ /dev/null
@@ -1,168 +0,0 @@
-from structs import TokenType, Token
-from exceptions import LexError
-import sys
-
-
-types = {
-    ":int": TokenType.INT_TYPE,
-    ":float": TokenType.FLOAT_TYPE,
-    ":number": TokenType.NUMBER_TYPE,
-    ":string": TokenType.STRING_TYPE,
-    ":list": TokenType.LIST_TYPE,
-    ":any": TokenType.ANY_TYPE,
-    ":literal": TokenType.LITERAL_TYPE,
-    ":bool": TokenType.BOOL_TYPE }
-
-keywords = {
-    "if": TokenType.IF,
-    "for-count": TokenType.FOR_COUNT,
-    "def": TokenType.DEF,
-    "lambda": TokenType.LAMBDA,
-    "&": TokenType.MANY,
-    "func": TokenType.FUNC }
-
-
-WHITESPACE = [" ", "\n", "\t"]
-SEPARATORS = WHITESPACE + [")"]
-DIGITS = list("0123456789")
-
-def lex(data):
-    start = 0
-    current = 0
-    line = 1
-    end = len(data)
-
-    tokens = []
-    while current < end:
-        char = data[current]
-        if char == ";":
-            while char != "\n" and current < end:
-                current += 1
-                char = data[current]
-            continue
-        if char == "\n":
-            line += 1
-        if char in WHITESPACE:
-            current += 1
-            continue
-        elif char == "(":
-            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
-        elif char == ")":
-            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
-        # numbers
-        elif char in DIGITS or char == ".":
-            tok, length = get_number(data[current:], line)
-            tokens.append(tok)
-            current += length
-        # strings
-        elif char == '"':
-            tok, length, offset = get_string(data[current+1:], line)
-            tokens.append(tok)
-            current += length
-            line += offset
-        # bools
-        elif char == "#":
-            tok, length = get_bool(data[current+1:], line)
-            tokens.append(tok)
-            current += length
-        # types
-        elif char == ":":
-            tok, length = get_type(data[current:], line) # include :
-            tokens.append(tok)
-            current += length
-        # symbols
-        else:
-            tok, length = get_symbol(data[current:], line)
-            if tok.text in keywords:
-                tok.type_ = keywords[tok.text]
-            tokens.append(tok)
-            current += length
-
-        current += 1
-    tokens.append(Token(TokenType.EOF, "", None, line))
-    return tokens
-
-def get_number(data, line):
-    counter = 0
-    value = ""
-    is_float = False
-    char = data[counter]
-    while char not in SEPARATORS:
-        if char in DIGITS:
-            value += char
-        elif char == ".":
-            if is_float:
-                raise LexError("too many '.' in number", line)
-            is_float = True
-            value += char
-        else:
-            raise Exception(f"invalid number: {value}")
-        counter += 1
-        if counter >= len(data):
-            break
-        char = data[counter]
-    if is_float:
-        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
-    else:
-        return Token(TokenType.INT, value, int(value), line), counter - 1
-
-
-def get_string(data, line):
-    offset = 0
-    counter = 0
-    string = ""
-    while data[counter] != '"':
-        if data[counter] == "\n":
-            offset += 1
-
-        # look ahead to see if it's a double quote
-        if data[counter] == "\\" and \
-           len(data) > counter and \
-           data[counter+1] == '"':
-            string += '"'
-            counter += 1
-        else:
-            string += data[counter]
-        counter += 1
-        if counter >= len(data):
-            raise Exception("couldn't parse string")
-    string = string.encode().decode("unicode_escape")
-    return Token(TokenType.STRING, str(string), str(string), line), counter + 1, offset
-
-def get_bool(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    if value == "true":
-        return Token(TokenType.TRUE, "#true", True, line), 4
-    elif value == "false":
-        return Token(TokenType.FALSE, "#false", False, line), 5
-    else:
-        raise LexError("couldn't parse boolean", line)
-
-def get_symbol(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    return Token(TokenType.SYMBOL, value, None, line), counter - 1
-
-def get_type(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    if value not in types:
-        raise LexError(f"unrecognized type {value}", line)
-    return Token(types[value], value, None, line), counter - 1
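For reference, the deleted lexer is a single character-scanning loop: it dispatches on the first character of each lexeme (`(` and `)` become paren tokens, digits start numbers, `"` starts strings, `#` starts booleans, `:` starts type annotations, `;` starts comments) and otherwise accumulates a symbol until it hits a separator, re-tagging it via the `keywords` table afterwards. Below is a minimal, self-contained sketch of that dispatch loop; the `Token` and `TokenType` definitions here are stand-ins, since the real ones live in the project's `structs` module and are not part of this diff.

```python
from dataclasses import dataclass
from enum import Enum, auto


class TokenType(Enum):
    # Stand-in for the project's structs.TokenType; only the variants
    # this sketch emits are included.
    OPEN_PAREN = auto()
    CLOSE_PAREN = auto()
    INT = auto()
    SYMBOL = auto()
    EOF = auto()


@dataclass
class Token:
    # Stand-in for structs.Token, mirroring the four arguments the
    # deleted code passes: token type, source text, parsed value, line.
    type_: TokenType
    text: str
    value: object
    line: int


WHITESPACE = {" ", "\n", "\t"}
SEPARATORS = WHITESPACE | {")"}


def lex(data: str) -> list[Token]:
    """Scan `data` left to right, emitting one Token per lexeme."""
    tokens: list[Token] = []
    current, line = 0, 1
    while current < len(data):
        char = data[current]
        if char in WHITESPACE:
            if char == "\n":
                line += 1  # track line numbers for error reporting
            current += 1
        elif char == "(":
            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
            current += 1
        elif char == ")":
            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
            current += 1
        else:
            # Accumulate characters until a separator, then classify
            # the whole lexeme at once.
            start = current
            while current < len(data) and data[current] not in SEPARATORS:
                current += 1
            text = data[start:current]
            if text.isdigit():
                tokens.append(Token(TokenType.INT, text, int(text), line))
            else:
                tokens.append(Token(TokenType.SYMBOL, text, None, line))
    tokens.append(Token(TokenType.EOF, "", None, line))
    return tokens


if __name__ == "__main__":
    for tok in lex("(def x 42)"):
        print(tok)
```

The deleted implementation layers helper functions (`get_number`, `get_string`, `get_bool`, `get_type`, `get_symbol`) on this same idea: each one returns the finished token plus how far the main loop should advance the cursor.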
