""" This file contains the lexer for the calc language. What a lexer does: - Recognizes the regular portion of a grammar. - Reads input character by character and builds tokens. How to a Construct a Lexer: 1.) Construct an interface for the input. Scan character by character, keeping track of line and column number. 2.) Create a function which can skip whitespace. 3.) Examine our BNF, and find all the things that are regular. A->a or A->aA A->a or A->Aa - All terminals are automatically regular. - Which rules can be written as a regular grammar. 4.) Create a representation for all the tokens. - Token: Broad category of strings (lexer's output) - Lexeme: Actual string that was matched to the token. - Value: Any lexeme that can be converted a value will have a value. - Line: Line number where the token begins - Col: Column number where the token begins 5.) Create a function called "next" which advances the lexer to the next lexeme. 5.1) Group tokens according to difficulty. Group 1 - Single characters which are not the prefix of anything else. (+, /, =) Group 2 - Fixed width tokens which may include prefixes of each other. (<, <=, >, >=...) Group 3 - Variable width tokens, may include some fixed-width tokens which can be prefixes of wider tokens. Numbers, Variable Names, Keywords """ import sys from enum import Enum,auto from collections import namedtuple class Token(Enum): """ Calc tokens """ EQUAL = auto() PLUS = auto() MINUS = auto() TIMES = auto() DIVIDE = auto() EXPONENT = auto() LPAREN = auto() RPAREN = auto() DOT = auto() REF = auto() INTLIT = auto() FLOATLIT = auto() INVALID = auto() INPUT = auto() EOF = auto() Lexeme = namedtuple('Lexeme', ("token", "lexeme", "value", "line", "col")) class Lexer: def __init__(self, file=sys.stdin): self.file = file self.line_num = 1 self.col_num = 0 self.cur_char = None self.current = None # start looking at the first character self.consume() def next(self): """ Advance the lexer to the next token. Return the token """ self.skip_space_and_comments() # check for eof if not self.cur_char: self.current = Lexeme(Token.EOF, '', None, 0, 0) return self.current if self.lex_single(): return self.current elif self.lex_multi_fixed(): return self.current elif self.lex_other(): return self.current else: self.current = Lexeme(Token.INVALID, self.cur_char, None, self.line_num, self.col_num) self.consume() return self.current def consume(self): self.cur_char = self.file.read(1) self.col_num += 1 if self.cur_char == '\n': self.line_num += 1 self.col_num = 0 def skip_space_and_comments(self): """ Consumes characters until we encounter non-whitespace. Also, skips comments. Also, stops on end of file. """ while self.cur_char.isspace() or self.cur_char == '#': if self.cur_char == '#': # consume the rest of the line while self.cur_char != '\n': self.consume() # consume all the whitespace while self.cur_char.isspace(): self.consume() def lex_single(self): """ Attempt to match our single character tokens. Return True on a match, False otherwise """ lexemes = (("=", Token.EQUAL), ("+", Token.PLUS), ("-", Token.MINUS), ("/", Token.DIVIDE), ("^", Token.EXPONENT), ("(", Token.LPAREN), (")", Token.RPAREN)) for l in lexemes: if l[0] == self.cur_char: self.current = Lexeme(l[1], l[0], None, self.line_num, self.col_num) self.consume() return True return False def lex_multi_fixed(self): """ Recognize multiple character tokens of fixed length. These tokens may share a common prefix. Basic Strategy: Find the longest string for which exactly one token exists. 
- Scan until we find an inconsistent character. - Make sure that we have an exact match. """ lexemes = [ ("*", Token.TIMES), ("**", Token.EXPONENT) ] # make a note of where we are in the file (token's beginning) line = self.line_num col = self.col_num # the accumulated lexeme string lstr = '' while len(lexemes) > 0: # try adding this to the token list trial = lstr + self.cur_char # find the lexemes that are consistent with trial consistent = [ l for l in lexemes if l[0].startswith(trial) ] # we stop if this character would make the token inconsisten if len(consistent) == 0: break else: lexemes = consistent lstr = trial self.consume() # make sure that at least one character matches if len(lstr) == 0: return False # try to make an exact match for lstr lexemes = [ l for l in lexemes if l[0] == lstr ] if len(lexemes) == 0: t = Token.INVALID else: t = lexemes[0][1] # build the token self.current = Lexeme(t, lstr, None, line, col) return True def lex_other(self): """ Our only strategy is to build a state machine. """ if self.cur_char == '.' or self.cur_char.isdigit(): return self.lex_number() elif self.cur_char.isalpha() or self.cur_char == "_": return self.lex_kw_ref() return False def lex_number(self): # accumulate characters until they are inconsistent lstr = '' line = self.line_num col = self.col_num # assume an integer t = Token.INTLIT # accumulate all the digits while self.cur_char.isdigit(): lstr += self.cur_char self.consume() if self.cur_char == '.': # this is a float t = Token.FLOATLIT lstr += self.cur_char self.consume() # grab the final digits while self.cur_char.isdigit(): lstr += self.cur_char self.consume() #validate and extract the value if lstr[-1] == '.': t = Token.INVALID v = None elif t == Token.INTLIT: v = int(lstr) else: v = float(lstr) self.current = Lexeme(t, lstr, v, line, col) return True def lex_kw_ref(self): # a list of keywords kw = [ ("input", Token.INPUT) ] # accumulate all the alpha numeric characters and _ lstr = '' line = self.line_num col = self.col_num while self.cur_char.isalnum() or self.cur_char == "_": lstr += self.cur_char self.consume() # search for keyword or make it a ref kw = [ w for w in kw if w[0] == lstr ] if len(kw) == 1: t = kw[0][1] else: t = Token.REF self.current = Lexeme(t, lstr, None, line, col) return True if __name__ == '__main__': lexer = Lexer() while lexer.cur_char != '': print(lexer.next())
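# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): the Lexer only needs an object
# with a read(1) method, so io.StringIO can stand in for stdin when
# experimenting. The input string below is a hypothetical example.
#
#   import io
#   lexer = Lexer(io.StringIO("width = 2 * (3.5 + x) ** 2  # a comment\n"))
#   while lexer.next().token != Token.EOF:
#       print(lexer.current)
#
# Expected token stream: REF, EQUAL, INTLIT, TIMES, LPAREN, FLOATLIT, PLUS,
# REF, RPAREN, EXPONENT, INTLIT; the comment is skipped and EOF ends the loop.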