Diffstat (limited to 'lexer.py')
-rw-r--r--  lexer.py  168
1 file changed, 0 insertions, 168 deletions
diff --git a/lexer.py b/lexer.py
deleted file mode 100644
index fa3db90..0000000
--- a/lexer.py
+++ /dev/null
@@ -1,168 +0,0 @@
-from structs import TokenType, Token
-from exceptions import LexError
-import sys
-
-
-types = {
-    ":int": TokenType.INT_TYPE,
-    ":float": TokenType.FLOAT_TYPE,
-    ":number": TokenType.NUMBER_TYPE,
-    ":string": TokenType.STRING_TYPE,
-    ":list": TokenType.LIST_TYPE,
-    ":any": TokenType.ANY_TYPE,
-    ":literal": TokenType.LITERAL_TYPE,
-    ":bool": TokenType.BOOL_TYPE }
-
-keywords = {
-    "if": TokenType.IF,
-    "for-count": TokenType.FOR_COUNT,
-    "def": TokenType.DEF,
-    "lambda": TokenType.LAMBDA,
-    "&": TokenType.MANY,
-    "func": TokenType.FUNC }
-
-
-WHITESPACE = [" ", "\n", "\t"]
-SEPARATORS = WHITESPACE + [")"]
-DIGITS = list("0123456789")
-
-def lex(data):
-    start = 0
-    current = 0
-    line = 1
-    end = len(data)
-
-    tokens = []
-    while current < end:
-        char = data[current]
-        if char == ";":
-            while char != "\n" and current < end:
-                current += 1
-                char = data[current]
-            continue
-        if char == "\n":
-            line += 1
-        if char in WHITESPACE:
-            current += 1
-            continue
-        elif char == "(":
-            tokens.append(Token(TokenType.OPEN_PAREN, "(", None, line))
-        elif char == ")":
-            tokens.append(Token(TokenType.CLOSE_PAREN, ")", None, line))
-        # numbers
-        elif char in DIGITS or char == ".":
-            tok, length = get_number(data[current:], line)
-            tokens.append(tok)
-            current += length
-        # strings
-        elif char == '"':
-            tok, length, offset = get_string(data[current+1:], line)
-            tokens.append(tok)
-            current += length
-            line += offset
-        # bools
-        elif char == "#":
-            tok, length = get_bool(data[current+1:], line)
-            tokens.append(tok)
-            current += length
-        # types
-        elif char == ":":
-            tok, length = get_type(data[current:], line) # include :
-            tokens.append(tok)
-            current += length
-        # symbols
-        else:
-            tok, length = get_symbol(data[current:], line)
-            if tok.text in keywords:
-                tok.type_ = keywords[tok.text]
-            tokens.append(tok)
-            current += length
-
-        current += 1
-    tokens.append(Token(TokenType.EOF, "", None, line))
-    return tokens
-
-def get_number(data, line):
-    counter = 0
-    value = ""
-    is_float = False
-    char = data[counter]
-    while char not in SEPARATORS:
-        if char in DIGITS:
-            value += char
-        elif char == ".":
-            if is_float:
-                raise LexError("too many '.' in number", line)
-            is_float = True
-            value += char
-        else:
-            raise Exception(f"invalid number: {value}")
-        counter += 1
-        if counter >= len(data):
-            break
-        char = data[counter]
-    if is_float:
-        return Token(TokenType.FLOAT, value, float(value), line), counter - 1
-    else:
-        return Token(TokenType.INT, value, int(value), line), counter - 1
-
-
-def get_string(data, line):
-    offset = 0
-    counter = 0
-    string = ""
-    while data[counter] != '"':
-        if data[counter] == "\n":
-            offset += 1
-
-        # look ahead to see if it's a double quote
-        if data[counter] == "\\" and \
-                    len(data) > counter and \
-                    data[counter+1] == '"':
-            string += '"'
-            counter += 1
-        else:
-            string += data[counter]
-        counter += 1
-        if counter >= len(data):
-            raise Exception("couldn't parse string")
-    string = string.encode().decode("unicode_escape")
-    return Token(TokenType.STRING, str(string), str(string), line), counter + 1, offset
-
-def get_bool(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    if value == "true":
-        return Token(TokenType.TRUE, "#true", True, line), 4
-    elif value == "false":
-        return Token(TokenType.FALSE, "#false", False, line), 5
-    else:
-        raise LexError("couldn't parse boolean", line)
-
-def get_symbol(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    return Token(TokenType.SYMBOL, value, None, line), counter - 1
-
-def get_type(data, line):
-    counter = 0
-    value = ""
-    while data[counter] not in SEPARATORS:
-        value += data[counter]
-        counter += 1
-        if counter >= len(data):
-            break
-    if value not in types:
-        raise LexError(f"unrecognized type {value}", line)
-    return Token(types[value], value, None, line), counter - 1
-
