parser + tokenizer, hell yeah
iterative statements + if statement parser + tokenizer, hell yeah (predictive parsing), hell yeah — chamanEiqbal
python
4 months ago
7.4 kB
3
Indexable
# Tokens
# Token constants shared by the tokenizer and the parser.
IF = 'if'
WHILE = 'while'
FOR = 'for'
IN = 'in'
COLON = ':'
OPEN_PAREN = '('
CLOSE_PAREN = ')'
COMMA = ','
SEMI_COLON = ';'
NEWLINE = '\n'
INDENT = '\t'

# Grammar symbols
# Symbol names used inside the production tables below. EXPR, ITERABLE and
# STATEMENT are rebound to their own production lists further down, so the
# order of the following assignments matters.
VARIABLE = 'VARIABLE'
EXPR = 'EXPR'
STATEMENT = 'STATEMENT'
ITERABLE = 'ITERABLE'

# Productions
# Descriptive grammar tables. The parser functions in this section are
# hand-written and do not read these tables.
ITERATIVE_STATEMENT = [
    (WHILE, OPEN_PAREN, EXPR, CLOSE_PAREN, COLON, NEWLINE, STATEMENT),  # while (expr): \n\t statement
    (FOR, VARIABLE, IN, ITERABLE, COLON, NEWLINE, STATEMENT),  # for variable in iterable: \n\t statement
    (IF, EXPR, COLON, NEWLINE, STATEMENT),  # if expr: \n\t statement
]

# Start symbol of the grammar. Bound only after the productions exist, so it
# refers to the populated table rather than an empty placeholder list.
start = ITERATIVE_STATEMENT

# Patterns for expressions
EXPR = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]

# Patterns for iterables
ITERABLE = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]

# Patterns for statements
STATEMENT = [
    (EXPR,),
    (ITERATIVE_STATEMENT,),
]

# Characters that separate tokens without always producing one themselves.
_WHITESPACE = ' \t\n'
# Characters that are always emitted as single-character tokens.
_PUNCTUATION = '()[]{}:,;'
# Operators recognised between primaries in parse_expr.
_BINARY_OPERATORS = ('<', '>', '==', '!=', '+', '-', '*', '/', '%')


# Tokenizer function
def tokenize(string):
    """Split *string* into a flat list of token strings.

    Spaces only separate tokens. Each newline becomes a NEWLINE token and
    each tab an INDENT token. The characters ``()[]{}:,;`` are emitted as
    single-character tokens. Every other run of characters becomes one
    token, so operators such as ``<`` must be surrounded by whitespace to
    be tokenized on their own.
    """
    tokens = []
    current = ''
    for char in string:
        if char not in _WHITESPACE and char not in _PUNCTUATION:
            current += char
            continue
        # A separator ends whatever word was being accumulated.
        if current:
            tokens.append(current)
            current = ''
        if char == '\n':
            tokens.append(NEWLINE)
        elif char == '\t':
            tokens.append(INDENT)
        elif char != ' ':
            tokens.append(char)
    if current:
        tokens.append(current)
    return tokens


def _next_token(tokens, message):
    """Pop and return the first token; raise ValueError(message) at end of input."""
    if not tokens:
        raise ValueError(message)
    return tokens.pop(0)


def _expect(tokens, expected, message):
    """Consume the first token, raising ValueError(message) unless it equals *expected*."""
    if not tokens or tokens.pop(0) != expected:
        raise ValueError(message)


def _indent_width(tokens, offset):
    """Return how many consecutive INDENT tokens start at index *offset*."""
    width = 0
    while offset + width < len(tokens) and tokens[offset + width] == INDENT:
        width += 1
    return width


def _parse_block(tokens, missing_message):
    """Parse one or more statements that share the same indentation depth.

    The depth is the number of INDENT tokens in front of the first statement.
    A following line belongs to the block only if it is indented by exactly
    that many INDENT tokens; otherwise its NEWLINE is left in *tokens* for the
    enclosing construct.
    """
    depth = _indent_width(tokens, 0)
    if depth == 0:
        raise ValueError(missing_message)
    body = []
    while True:
        del tokens[:depth]  # Consume this line's indentation
        body.append(parse_statement(tokens))
        if not tokens or tokens[0] != NEWLINE or _indent_width(tokens, 1) != depth:
            return body
        tokens.pop(0)  # Consume the NEWLINE separating two body lines


def _parse_arguments(tokens):
    """Parse comma-separated expressions up to and including the closing ')'.

    The opening '(' must already have been consumed. A trailing comma before
    ')' is accepted; two arguments without a comma between them are not.
    """
    args = []
    while True:
        if not tokens:
            raise ValueError('Expected closing )')
        if tokens[0] == CLOSE_PAREN:
            tokens.pop(0)  # Consume ')'
            return args
        args.append(parse_expr(tokens))
        if not tokens:
            raise ValueError('Expected closing )')
        if tokens[0] == COMMA:
            tokens.pop(0)  # Consume ','
        elif tokens[0] != CLOSE_PAREN:
            raise ValueError('Expected , or ) after argument.')


# Parser function
def parse(tokens):
    """Parse the construct at the front of *tokens* and return its AST tuple.

    *tokens* is consumed in place. Tokens left over after the first complete
    construct are not examined.

    Raises:
        ValueError: if *tokens* is empty or the input is malformed.
    """
    if not tokens:
        raise ValueError("No tokens to parse.")
    handlers = {WHILE: parse_while, FOR: parse_for, IF: parse_if}
    handler = handlers.get(tokens[0])
    if handler is None:
        # Not a compound statement: treat as a general statement.
        return parse_statement(tokens)
    tokens.pop(0)  # Consume the keyword; the handler parses what follows
    return handler(tokens)


def parse_while(tokens):
    """Parse ``( expr ) : NEWLINE INDENT statement`` following a ``while`` keyword.

    Returns ``('while', condition, statement)``; the body is a single
    statement, not a list.
    """
    _expect(tokens, OPEN_PAREN, 'Expected ( after "while".')
    expr = parse_expr(tokens)  # Parse the condition expression
    _expect(tokens, CLOSE_PAREN, 'Expected ) after expression.')
    _expect(tokens, COLON, 'Expected : after ")".')
    _expect(tokens, NEWLINE, 'Expected newline after ":".')
    depth = _indent_width(tokens, 0)
    if depth == 0:
        raise ValueError('Expected an indented statement in while-loop body.')
    del tokens[:depth]  # Consume indentation, however deep the loop is nested
    statement = parse_statement(tokens)
    return ('while', expr, statement)


def parse_for(tokens):
    """Parse ``variable in iterable : NEWLINE block`` following a ``for`` keyword.

    The iterable is either ``range(...)``, returned as ``('range', *args)``,
    or a single token returned as-is.

    Returns ``('for', variable, iterable, body)`` where *body* is a list of
    statements.
    """
    variable = _next_token(tokens, 'Expected a variable after "for".')
    if not variable.isidentifier():
        raise ValueError(f'Expected a valid variable, got: {variable}')
    _expect(tokens, IN, 'Expected "in" after variable.')
    iterable_token = _next_token(tokens, 'Expected an iterable after "in".')
    if iterable_token == 'range':
        _expect(tokens, OPEN_PAREN, 'Expected ( after "range".')
        expr = ('range', *_parse_arguments(tokens))
    else:
        expr = iterable_token
    _expect(tokens, COLON, 'Expected : after iterable.')
    _expect(tokens, NEWLINE, 'Expected newline after ":".')
    body = _parse_block(
        tokens, 'Expected at least one indented statement in for-loop body.')
    return ('for', variable, expr, body)


def parse_if(tokens):
    """Parse ``expr : NEWLINE block`` following an ``if`` keyword.

    Returns ``('if', condition, body)`` where *body* is a list of statements.
    """
    expr = parse_expr(tokens)
    _expect(tokens, COLON, 'Expected : after expression.')
    _expect(tokens, NEWLINE, 'Expected newline after ":".')
    body = _parse_block(
        tokens, 'Expected at least one indented statement in if-statement body.')
    return ('if', expr, body)


def parse_expr(tokens):
    """Parse a chain of primaries joined by binary operators.

    Operators are applied left to right with no precedence levels, giving
    nested ``('binop', operator, lhs, rhs)`` tuples.
    """
    if not tokens:
        raise ValueError("Empty tokens in expression.")
    lhs = parse_primary(tokens)
    while tokens and tokens[0] in _BINARY_OPERATORS:
        operator = tokens.pop(0)
        rhs = parse_primary(tokens)
        lhs = ('binop', operator, lhs, rhs)
    return lhs


def parse_primary(tokens):
    """Parse an identifier, a digit string, or a parenthesised expression."""
    token = _next_token(tokens, 'Unexpected end of input in expression.')
    if token.isidentifier() or token.isdigit():
        return token
    if token == OPEN_PAREN:
        expr = parse_expr(tokens)
        _expect(tokens, CLOSE_PAREN, 'Expected closing )')
        return expr
    raise ValueError(f"Unexpected token in primary expression: {token}")


def parse_statement(tokens):
    """Parse a nested compound statement, a call, or a bare identifier.

    Returns the nested construct's AST, ``('call', name, args)``, or
    ``('expr', name)``.
    """
    if not tokens:
        raise ValueError('Expected a statement, got end of input.')
    # Keywords must be checked before isidentifier(): 'if', 'for' and 'while'
    # are themselves valid identifiers as far as str.isidentifier() goes.
    if tokens[0] in (WHILE, FOR, IF):
        return parse(tokens)
    token = tokens.pop(0)
    if not token.isidentifier():
        raise ValueError(f'Invalid token in statement: {token}')
    if tokens and tokens[0] == OPEN_PAREN:
        tokens.pop(0)  # Consume '('
        return ('call', token, _parse_arguments(tokens))
    return ('expr', token)


# Main execution block!
if __name__ == '__main__':
    # Demo driver: tokenize and parse one sample per supported construct,
    # printing the token list and then either the AST or the parse error.
    samples = (
        "for x in range(10):\n\tprint(x)",
        "while (x < 10):\n\tprint(x)",
        "if (x < 10):\n\tprint(x)",
    )
    for sample in samples:
        token_list = tokenize(sample)
        # Print before parsing: parse() consumes the token list in place.
        print("Tokens:", token_list)
        try:
            tree = parse(token_list)
        except ValueError as err:
            print("Parsing error:", err)
        else:
            print("Parsed AST:", tree)
        print()
Editor is loading...
Leave a Comment