# Iterative-statement + if-statement parser and tokenizer (predictive parsing demo)
# Tokens
# Token constants shared by the tokenizer and parser.
IF = 'if'
WHILE = 'while'
FOR = 'for'
IN = 'in'
COLON = ':'
OPEN_PAREN = '('
CLOSE_PAREN = ')'
COMMA = ','
SEMI_COLON = ';'
NEWLINE = '\n'
INDENT = '\t'
# Terminals
# Terminal symbols for grammar productions. NOTE: the production lists
# below deliberately rebind EXPR/ITERABLE/STATEMENT from these marker
# strings to lists of patterns; the tuples built first capture the
# marker strings.
VARIABLE = 'VARIABLE'
EXPR = 'EXPR'
STATEMENT = 'STATEMENT'
ITERABLE = 'ITERABLE'
# Productions
# Grammar rules for the syntactic structures handled by the parser.
ITERATIVE_STATEMENT = [
    (WHILE, OPEN_PAREN, EXPR, CLOSE_PAREN, COLON, NEWLINE, STATEMENT),  # while (expr): \n\t statement
    (FOR, VARIABLE, IN, ITERABLE, COLON, NEWLINE, STATEMENT),           # for variable in iterable: \n\t statement
    (IF, EXPR, COLON, NEWLINE, STATEMENT),                              # if expr: \n\t statement
]
# Start symbol of the grammar. Bound AFTER the production list exists
# (bug fix: it previously aliased a stale empty list created before
# ITERATIVE_STATEMENT was rebound to the populated list).
start = ITERATIVE_STATEMENT
# Patterns for expressions
EXPR = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]
# Patterns for iterables
ITERABLE = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]
# Patterns for statements
STATEMENT = [
    (EXPR,),
    (ITERATIVE_STATEMENT,),
]
# Tokenizer function
def tokenize(string):
    """Split an input string into a flat list of token strings.

    Words (identifiers, numbers, operators) are accumulated until a
    delimiter is seen; punctuation becomes its own token; a newline
    emits '\n' (NEWLINE) and a tab emits '\t' (INDENT), while plain
    spaces only terminate the current word.
    """
    tokens = []
    buffer = []

    def flush():
        # Emit any word accumulated so far as a single token.
        if buffer:
            tokens.append(''.join(buffer))
            buffer.clear()

    for ch in string:
        if ch in ' \t\n':
            flush()
            if ch == '\n':
                tokens.append('\n')  # NEWLINE marker
            elif ch == '\t':
                tokens.append('\t')  # INDENT marker
        elif ch in '()[]{}:,;':
            # Punctuation both terminates the current word and is a token.
            flush()
            tokens.append(ch)
        else:
            buffer.append(ch)
    flush()  # don't drop a trailing word at end of input
    return tokens
# Parser function
def parse(tokens):
    """Parse a token list into an AST, dispatching on the leading token.

    Consumes tokens destructively (pop from the front). Raises
    ValueError when the token list is empty.
    """
    if not tokens:
        raise ValueError("No tokens to parse.")
    head = tokens.pop(0)
    # Keyword-directed dispatch; anything else is a general statement.
    handlers = {WHILE: parse_while, FOR: parse_for, IF: parse_if}
    handler = handlers.get(head)
    if handler is not None:
        return handler(tokens)
    tokens.insert(0, head)  # restore the token for statement parsing
    return parse_statement(tokens)
def parse_while(tokens):
    """Parse 'while ( expr ) : NEWLINE INDENT statement'.

    The leading 'while' keyword has already been consumed by the caller.
    Returns ('while', condition, statement). The body is a single
    indented statement.
    """
    if tokens.pop(0) != OPEN_PAREN:
        raise ValueError('Expected ( after "while".')
    condition = parse_expr(tokens)
    # Required trailing punctuation after the condition, in order.
    for expected, message in (
        (CLOSE_PAREN, 'Expected ) after expression.'),
        (COLON, 'Expected : after ")".'),
        (NEWLINE, 'Expected newline after ":".'),
    ):
        if tokens.pop(0) != expected:
            raise ValueError(message)
    # The body must be exactly one indented statement.
    if not (tokens and tokens[0] == INDENT):
        raise ValueError('Expected an indented statement in while-loop body.')
    tokens.pop(0)  # consume the INDENT
    body = parse_statement(tokens)
    return ('while', condition, body)
def parse_for(tokens):
    """Parse 'for VAR in ITERABLE : NEWLINE body'.

    The leading 'for' keyword has already been consumed by the caller.
    The iterable is either a range(...) call or a bare token. Returns
    ('for', variable, iterable, body) where body is a list of parsed
    statements. Raises ValueError on malformed or truncated input
    (bug fix: truncated input previously raised IndexError from pop/[0]
    on an empty list, and an unclosed 'range(' crashed the arg loop).
    """
    if not tokens:
        raise ValueError('Expected a variable after "for".')
    variable = tokens.pop(0)
    if not variable.isidentifier():
        raise ValueError(f'Expected a valid variable, got: {variable}')
    if not tokens or tokens.pop(0) != IN:
        raise ValueError('Expected "in" after variable.')
    if not tokens:
        raise ValueError('Expected an iterable after "in".')
    iterable_token = tokens.pop(0)
    if iterable_token == 'range':
        if not tokens or tokens.pop(0) != OPEN_PAREN:
            raise ValueError('Expected ( after "range".')
        # Collect comma-separated argument expressions up to ')'.
        range_args = []
        while tokens and tokens[0] != CLOSE_PAREN:
            range_args.append(parse_expr(tokens))
            if tokens and tokens[0] == COMMA:
                tokens.pop(0)  # consume ','
        if not tokens:
            raise ValueError('Expected ) to close "range(".')
        tokens.pop(0)  # consume ')'
        expr = ('range', *range_args)
    else:
        # Any other iterable is kept as its bare token.
        expr = iterable_token
    if not tokens or tokens.pop(0) != COLON:
        raise ValueError('Expected : after iterable.')
    if not tokens or tokens.pop(0) != NEWLINE:
        raise ValueError('Expected newline after ":".')
    # Parse the body: one statement per INDENT token.
    body = []
    while tokens and tokens[0] == INDENT:
        tokens.pop(0)  # consume indentation
        body.append(parse_statement(tokens))
    if not body:
        raise ValueError('Expected at least one indented statement in for-loop body.')
    return ('for', variable, expr, body)
def parse_if(tokens):
    """Parse 'if expr : NEWLINE body'.

    The leading 'if' keyword has already been consumed by the caller.
    Returns ('if', condition, body) where body is a list of parsed
    statements, one per INDENT token.
    """
    condition = parse_expr(tokens)
    if not tokens or tokens.pop(0) != COLON:
        raise ValueError('Expected : after expression.')
    if not tokens or tokens.pop(0) != NEWLINE:
        raise ValueError('Expected newline after ":".')
    body = []
    while tokens and tokens[0] == INDENT:
        tokens.pop(0)  # consume indentation
        body.append(parse_statement(tokens))
    if not body:
        # Bug fix: the message previously said "for-loop body" (copy-paste
        # from parse_for), which misreported the failing construct.
        raise ValueError('Expected at least one indented statement in if-statement body.')
    return ('if', condition, body)
def parse_expr(tokens):
    """Parse a left-associative binary expression.

    All operators share a single precedence level; the result nests as
    ('binop', op, lhs, rhs). Raises ValueError on an empty token list.
    """
    if not tokens:
        raise ValueError("Empty tokens in expression.")
    node = parse_primary(tokens)
    binary_ops = ('<', '>', '==', '!=', '+', '-', '*', '/', '%')
    # Fold operators left-to-right: a + b + c -> ((a + b) + c).
    while tokens and tokens[0] in binary_ops:
        op = tokens.pop(0)
        node = ('binop', op, node, parse_primary(tokens))
    return node
def parse_primary(tokens):
    """Parse a primary expression: identifier, integer, or '(' expr ')'.

    Returns the bare token string for identifiers/digits, or the parsed
    sub-expression for a parenthesized group. Raises ValueError on an
    empty token list (bug fix: previously an IndexError escaped from
    pop(0)) or on an unexpected token.
    """
    if not tokens:
        raise ValueError("Unexpected end of input in primary expression.")
    token = tokens.pop(0)
    if token.isidentifier() or token.isdigit():
        return token
    if token == OPEN_PAREN:
        expr = parse_expr(tokens)
        if not tokens or tokens.pop(0) != CLOSE_PAREN:
            raise ValueError('Expected closing )')
        return expr
    raise ValueError(f"Unexpected token in primary expression: {token}")
def parse_statement(tokens):
    """Parse one statement: a nested compound, a call, or a bare name.

    Returns ('call', name, args), ('expr', name), or the AST of a
    nested while/for/if. Raises ValueError on malformed input.

    Bug fix: keywords must be tested BEFORE str.isidentifier(), because
    'while'.isidentifier() is True — the old order made the nested-
    statement branch unreachable and mis-parsed nested compounds as
    bare names. Also guards the call-argument loop against truncated
    input (previously IndexError).
    """
    if not tokens:
        raise ValueError('Unexpected end of input in statement.')
    token = tokens.pop(0)
    # Literal keyword/punctuation values mirror WHILE/FOR/IF and
    # OPEN_PAREN/CLOSE_PAREN/COMMA token constants.
    if token in ('while', 'for', 'if'):
        tokens.insert(0, token)  # put back for recursive parsing
        return parse(tokens)
    if token.isidentifier():
        if tokens and tokens[0] == '(':
            # Function call: collect comma-separated arguments.
            tokens.pop(0)  # consume '('
            args = []
            while tokens and tokens[0] != ')':
                args.append(parse_expr(tokens))
                if tokens and tokens[0] == ',':
                    tokens.pop(0)  # consume ','
            if not tokens:
                raise ValueError('Expected ) to close call arguments.')
            tokens.pop(0)  # consume ')'
            return ('call', token, args)
        return ('expr', token)
    raise ValueError(f'Invalid token in statement: {token}')
# Main execution block!
if __name__ == '__main__':
    # Demo: tokenize and parse a few representative statements,
    # printing the token stream and resulting AST (or the parse error).
    strings = [
        "for x in range(10):\n\tprint(x)",
        "while (x < 10):\n\tprint(x)",
        "if (x < 10):\n\tprint(x)"
    ]
    for string in strings:
        tokens = tokenize(string)
        print("Tokens:", tokens)
        try:
            ast = parse(tokens)
            print("Parsed AST:", ast)
        except ValueError as e:
            print("Parsing error:", e)
        # Bug fix: pasted site residue ("Editor is loading...") was fused
        # onto this final call, making the file a syntax error.
        print()