Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
8.3 kB
1
Indexable
Never
"""
This file contains the lexer for the calc language.

What a lexer does:
    - Recognizes the regular portion of a grammar.
    - Reads input character by character and builds tokens.

How to a Construct a Lexer:
    1.) Construct an interface for the input. Scan character by character, 
        keeping track of line and column number.

    2.) Create a function which can skip whitespace.

    3.) Examine our BNF, and find all the things that are regular.
         A->a  or A->aA
         A->a  or A->Aa

         - All terminals are automatically regular.
         - Which rules can be written as a regular grammar.

    4.) Create a representation for all the tokens.
         - Token: Broad category of strings (lexer's output)
         - Lexeme: Actual string that was matched to the token.
         - Value: Any lexeme that can be converted a value will have a value.
         - Line: Line number where the token begins
         - Col: Column number where the token begins

    5.) Create a function called "next" which advances the lexer to 
        the next lexeme.

        5.1) Group tokens according to difficulty.
             Group 1 - Single characters which are not the prefix of
                       anything else. (+, /, =)

             Group 2 - Fixed width tokens which may include prefixes of
                       each other. (<, <=, >, >=...)

             Group 3 - Variable width tokens, may include some fixed-width
                       tokens which can be prefixes of wider tokens.
                       Numbers, Variable Names, Keywords
"""
import sys
from enum import Enum,auto
from collections import namedtuple

class Token(Enum):
    """
    Calc tokens
    """
    EQUAL = auto()
    PLUS = auto()
    MINUS = auto()
    TIMES = auto()
    DIVIDE = auto()
    EXPONENT = auto()
    LPAREN = auto()
    RPAREN = auto()
    DOT = auto()
    REF = auto()
    INTLIT = auto()
    FLOATLIT = auto()
    INVALID = auto()
    INPUT = auto()
    EOF = auto()

Lexeme = namedtuple('Lexeme', ("token", "lexeme", "value", "line", "col"))


class Lexer:
    def __init__(self, file=sys.stdin):
        self.file = file
        self.line_num = 1
        self.col_num = 0
        self.cur_char = None
        self.current = None

        # start looking at the first character
        self.consume()


    def next(self):
        """
        Advance the lexer to the next token.

        Return the token
        """

        self.skip_space_and_comments()

        # check for eof
        if not self.cur_char:
            self.current = Lexeme(Token.EOF, '', None, 0, 0)
            return self.current

        if self.lex_single():
            return self.current
        elif self.lex_multi_fixed():
            return self.current
        elif self.lex_other():
            return self.current
        else:
            self.current = Lexeme(Token.INVALID, self.cur_char, None, self.line_num, self.col_num)
            self.consume()
            return self.current



    def consume(self):
        self.cur_char = self.file.read(1)
        self.col_num += 1
        if self.cur_char == '\n':
            self.line_num += 1
            self.col_num = 0


    def skip_space_and_comments(self):
        """
        Consumes characters until we encounter non-whitespace.
        Also, skips comments.
        Also, stops on end of file.
        """
        while self.cur_char.isspace() or self.cur_char == '#':
            if self.cur_char == '#':
                # consume the rest of the line
                while self.cur_char != '\n':
                    self.consume()

            # consume all the whitespace
            while self.cur_char.isspace():
                self.consume()


    def lex_single(self):
        """
        Attempt to match our single character tokens.
        Return True on a match, False otherwise
        """
        lexemes = (("=", Token.EQUAL), 
                   ("+", Token.PLUS),
                   ("-", Token.MINUS), 
                   ("/", Token.DIVIDE),
                   ("^", Token.EXPONENT),
                   ("(", Token.LPAREN),
                   (")", Token.RPAREN))
        for l in lexemes:
            if l[0] == self.cur_char:
                self.current = Lexeme(l[1], l[0], None, self.line_num, self.col_num)
                self.consume()
                return True
        return False


    def lex_multi_fixed(self):
        """
        Recognize multiple character tokens of fixed length.
        These tokens may share a common prefix.

        Basic Strategy:
            Find the longest string for which exactly one token
            exists.
              - Scan until we find an inconsistent character.
              - Make sure that we have an exact match.
        """
        lexemes = [ ("*", Token.TIMES),
                    ("**", Token.EXPONENT) ]

        # make a note of where we are in the file (token's beginning)
        line = self.line_num
        col = self.col_num

        # the accumulated lexeme string
        lstr = ''
        while len(lexemes) > 0:
            # try adding this to the token list
            trial = lstr + self.cur_char

            # find the lexemes that are consistent with trial
            consistent = [ l for l in lexemes if l[0].startswith(trial) ]

            # we stop if this character would make the token inconsisten
            if len(consistent) == 0:
                break
            else:
                lexemes = consistent
                lstr = trial
                self.consume()

        # make sure that at least one character matches
        if len(lstr) == 0:
            return False

        # try to make an exact match for lstr
        lexemes = [ l for l in lexemes if l[0] == lstr ]
        if len(lexemes) == 0:
            t = Token.INVALID
        else:
            t = lexemes[0][1]

        # build the token
        self.current = Lexeme(t, lstr, None, line, col)

        return True



    def lex_other(self):
        """
        Our only strategy is to build a state machine.
        """

        if self.cur_char == '.' or self.cur_char.isdigit():
            return self.lex_number()
        elif self.cur_char.isalpha() or self.cur_char == "_":
            return self.lex_kw_ref()
        return False


    def lex_number(self):
        # accumulate characters until they are inconsistent 
        lstr = ''
        line = self.line_num
        col = self.col_num

        # assume an integer
        t = Token.INTLIT

        # accumulate all the digits
        while self.cur_char.isdigit():
            lstr += self.cur_char
            self.consume()

        if self.cur_char == '.':
            # this is a float
            t = Token.FLOATLIT

            lstr += self.cur_char
            self.consume()

            # grab the final digits
            while self.cur_char.isdigit():
                lstr += self.cur_char
                self.consume()

        #validate and extract the value
        if lstr[-1] == '.':
            t = Token.INVALID
            v = None
        elif t == Token.INTLIT:
            v = int(lstr)
        else:
            v = float(lstr)

        self.current = Lexeme(t, lstr, v, line, col)
        return True

    def lex_kw_ref(self):
        # a list of keywords
        kw = [ ("input", Token.INPUT) ] 

        # accumulate all the alpha numeric characters and _
        lstr = ''
        line = self.line_num
        col = self.col_num

        while self.cur_char.isalnum() or self.cur_char == "_":
            lstr += self.cur_char
            self.consume()

        # search for keyword or make it a ref
        kw = [ w for w in kw if w[0] == lstr ]
        if len(kw) == 1:
            t = kw[0][1]
        else:
            t = Token.REF

        self.current = Lexeme(t, lstr, None, line, col)
        return True

if __name__ == '__main__':
    lexer = Lexer()
    while lexer.cur_char != '':
        print(lexer.next())