Diffstat (limited to 'lexer.py')
| -rw-r--r-- | lexer.py | 154 |
1 file changed, 154 insertions, 0 deletions
diff --git a/lexer.py b/lexer.py
new file mode 100644
index 0000000..913a9aa
--- /dev/null
+++ b/lexer.py
@@ -0,0 +1,154 @@
+from tokens import *
+
+# consts
+DOUBLE_QUOTE = '"'
+BACKSLASH = "\\"
+OPEN_PAREN = "("
+CLOSE_PAREN = ")"
+DIGITS = "0123456789"
+LETTERS = "abcdefghijklmnopqrstuvwxyz"
+PUNCTUATION = "-_!*$@%^&=+/"
+SYMBOL_VALS = list(LETTERS + LETTERS.upper() + DIGITS + PUNCTUATION)
+
+
+def lex_string(inp):
+    token = ""
+    esc = False
+    for idx, c in enumerate(inp):
+        # if we're escaping a quote, don't add the \
+        if esc:
+            if c == DOUBLE_QUOTE:
+                token += DOUBLE_QUOTE
+            elif c == BACKSLASH:
+                token += BACKSLASH
+            else:
+                token += f"{BACKSLASH}{c}"
+
+        # if it's an escape char, set esc and continue
+        elif c == BACKSLASH:
+            esc = True
+            continue
+
+        elif c == DOUBLE_QUOTE:
+            #return token, inp[idx + 1:]
+            return NebLiteral(NebType.STRING, token), inp[idx + 1:]
+
+        else:
+            token += c
+
+        esc = False
+
+    raise Exception("improperly ended string!")
+
+def lex_bool(inp):
+    if inp[0:4] == "true":
+        token = True
+    elif inp[0:5] == "false":
+        token = False
+    else:
+        raise Exception("invalid boolean")
+
+    if peek(inp[len(str(token)):]) not in (None, " ", CLOSE_PAREN):
+        raise Exception("invalid boolean")
+
+    #return token, inp[len(str(token)):]
+    return NebLiteral(NebType.BOOL, token), inp[len(str(token)):]
+
+
+def lex_number(inp):
+    token = ""
+    for idx, c in enumerate(inp):
+        if c in (" ", CLOSE_PAREN):
+            if "." in token:
+                #return float(token), inp[idx:]
+                return NebLiteral(NebType.FLOAT, float(token)), inp[idx:]
+            else:
+                #return int(token), inp[idx:]
+                return NebLiteral(NebType.INT, int(token)), inp[idx:]
+
+        if c in list(DIGITS): # or c in ("-", "."):
+            token += c
+        elif c == "+":
+            if idx == 0:
+                continue
+            else:
+                raise Exception("improper sign placement!")
+        elif c == "-":
+            if idx == 0:
+                token += c
+            else:
+                raise Exception("improper sign placement!")
+        elif c == ".":
+            if c not in token:
+                token += c
+            else:
+                raise Exception("too many decimal points")
+        else:
+            raise Exception("improper numeric!")
+
+    if "." in token:
+        #return float(token), ""
+        return NebLiteral(NebType.FLOAT, float(token)), ""
+    else:
+        #return int(token), ""
+        return NebLiteral(NebType.INT, int(token)), ""
+
+def lex_symbol(inp):
+    token = ""
+    for idx, c in enumerate(inp):
+        if c in (CLOSE_PAREN, " "):
+            return NebSymbol(token), inp[idx:]
+        elif c in SYMBOL_VALS:
+            token += c
+        else:
+            raise Exception("improper symbol")
+    return NebSymbol(token), ""
+
+
+def peek(inp):
+    if len(inp) == 0:
+        return None
+    return inp[0]
+
+def lex(inp, tokens):
+    inp = inp.strip() # white space doesn't matter at this point
+    nxt = peek(inp)
+    if nxt is None:
+        #print(f"returning [{tokens}]")
+        return tokens
+    # parens
+    if nxt == OPEN_PAREN:
+        tokens.append(NebOpen())
+        return lex(inp[1:], tokens)
+    elif nxt == CLOSE_PAREN:
+        tokens.append(NebClose())
+        return lex(inp[1:], tokens)
+    # numbers
+    elif nxt in list(DIGITS) or nxt in ("+", "-", "."):
+        token, remainder = lex_number(inp)
+        tokens.append(token)
+        return lex(remainder, tokens)
+    # strings
+    elif nxt == DOUBLE_QUOTE:
+        token, remainder = lex_string(inp[1:])
+        #print(f"received [{token}] [{remainder}]")
+        if peek(remainder) not in (None, CLOSE_PAREN, " "):
+            raise Exception("spaces required between tokens")
+        tokens.append(token)
+        return lex(remainder, tokens)
+    # bool
+    elif nxt == "#":
+        token, remainder = lex_bool(inp[1:])
+        if peek(remainder) not in (None, CLOSE_PAREN, " "):
+            raise Exception("spaces required between tokens")
+        tokens.append(token)
+        return lex(remainder, tokens)
+    # symbols
+    elif nxt in SYMBOL_VALS:
+        token, remainder = lex_symbol(inp)
+        if peek(remainder) not in (None, CLOSE_PAREN, " "):
+            raise Exception("spaces required between tokens")
+        tokens.append(token)
+        return lex(remainder, tokens)
+    else:
+        raise Exception("unable to lex")
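
A minimal usage sketch, not part of the commit above: it assumes tokens.py (not shown here) exports NebOpen, NebClose, NebSymbol, NebLiteral, and NebType as referenced by lexer.py. The entry point is lex(), which takes the raw source string plus an accumulator list and returns the flat token stream:

    from lexer import lex

    # expected: NebOpen, NebSymbol("add"), NebLiteral(INT, 1),
    # NebLiteral(FLOAT, 2.5), NebLiteral(STRING, "hi"),
    # NebLiteral(BOOL, True), NebClose
    tokens = lex('(add 1 2.5 "hi" #true)', [])
    print(tokens)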
