| author | mryouse | 2022-06-18 02:45:04 +0000 |
|---|---|---|
| committer | mryouse | 2022-06-18 02:45:04 +0000 |
| commit | d1a1c1592e610526c4a0432f93bd9ea6ae96d6e8 (patch) | |
| tree | df4c78f3ce5dfb1369d5fc6c155ca43e8bfc729f /neb/lexer.py | |
| parent | 065d138ca3013a4d1ef1aa3d7c48982d8bee5de2 (diff) | |
refactor: make neb a module
Diffstat (limited to 'neb/lexer.py')
| -rw-r--r-- | neb/lexer.py | 168 |
|---|---|---|

1 file changed, 168 insertions, 0 deletions
```diff
diff --git a/neb/lexer.py b/neb/lexer.py
new file mode 100644
index 0000000..b522460
--- /dev/null
+++ b/neb/lexer.py
@@ -0,0 +1,168 @@
+from .structs import TokenType, Token
+from .exceptions import LexError
+import sys
+
+
+types = {
+    ":int": TokenType.INT_TYPE,
+    ":float": TokenType.FLOAT_TYPE,
+    ":number": TokenType.NUMBER_TYPE,
+    ":string": TokenType.STRING_TYPE,
+    ":list": TokenType.LIST_TYPE,
+    ":any": TokenType.ANY_TYPE,
+    ":literal": TokenType.LITERAL_TYPE,
+    ":bool": TokenType.BOOL_TYPE }
+
+keywords = {
+    "if": TokenType.IF,
+    "for-count": TokenType.FOR_COUNT,
+    "def": TokenType.DEF,
+    "lambda": TokenType.LAMBDA,
+    "&": TokenType.MANY,
+    "func": TokenType.FUNC }
+
+
+WHITESPACE = [" ", "\n", "\t"]
+SEPARATORS = WHITESPACE + [")"]
+DIGITS = list("0123456789")
+
+def lex(data):
+    start = 0
+    current = 0
+    line = 1
+    end = len(data)
+
+    tokens = []
+    while current < end:
+        char = data[current]
+        if char == ";":
+            while char != "\n" and current < end:
+                current += 1
+                char = data[current]
+            continue
+        if char == "\n":
+            line += 1
+        if char in WHITESPACE:
+            current += 1
+            continue
+        elif char == "(":
+            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
+        elif char == ")":
+            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
+        # numbers
+        elif char in DIGITS or char == ".":
+            tok, length = get_number(data[current:], line)
+            tokens.append(tok)
+            current += length
+        # strings
+        elif char == '"':
+            tok, length, offset = get_string(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+            line += offset
+        # bools
+        elif char == "#":
+            tok, length = get_bool(data[current+1:], line)
+            tokens.append(tok)
+            current += length
+        # types
+        elif char == ":":
+            tok, length = get_type(data[current:], line) # include :
+            tokens.append(tok)
+            current += length
+        # symbols
+        else:
+            tok, length = get_symbol(data[current:], line)
+            if tok.text in keywords:
+                tok.type_ = keywords[tok.text]
+            tokens.append(tok)
+            current += length
+
+        current += 1
+    tokens.append(Token(TokenType.EOF, "", None, line))
+    return tokens
+
+def get_number(data, line):
+    counter = 0
+    value = ""
+    is_float = False
+    char = data[counter]
+    while char not in SEPARATORS:
+        if char in DIGITS:
+            value += char
+        elif char == ".":
+            if is_float:
+                raise LexError("too many '.' in number", line)
+            is_float = True
+            value += char
+        else:
+            raise Exception(f"invalid number: {value}")
+        counter += 1
+        if counter >= len(data):
+            break
+        char = data[counter]
+    if is_float:
+        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
+    else:
+        return Token(TokenType.INT, value, int(value), line), counter - 1
+
+
+def get_string(data, line):
+    offset = 0
+    counter = 0
+    string = ""
+    while data[counter] != '"':
+        if data[counter] == "\n":
+            offset += 1
+
+        # look ahead to see if it's a double quote
+        if data[counter] == "\\" and \
+           len(data) > counter and \
+           data[counter+1] == '"':
+            string += '"'
+            counter += 1
+        else:
+            string += data[counter]
+        counter += 1
+        if counter >= len(data):
+            raise Exception("couldn't parse string")
+    string = string.encode().decode("unicode_escape")
+    return Token(TokenType.STRING, str(string), str(string), line), counter + 1, offset
+
+def get_bool(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    if value == "true":
+        return Token(TokenType.TRUE, "#true", True, line), 4
+    elif value == "false":
+        return Token(TokenType.FALSE, "#false", False, line), 5
+    else:
+        raise LexError("couldn't parse boolean", line)
+
+def get_symbol(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    return Token(TokenType.SYMBOL, value, None, line), counter - 1
+
+def get_type(data, line):
+    counter = 0
+    value = ""
+    while data[counter] not in SEPARATORS:
+        value += data[counter]
+        counter += 1
+        if counter >= len(data):
+            break
+    if value not in types:
+        raise LexError(f"unrecognized type {value}", line)
+    return Token(types[value], value, None, line), counter - 1
+
```
