# parser + tokenizer, hell yeah
#
# Iterative statements + if-statement parser + tokenizer (predictive parsing), hell yeah
# avatar
# chamanEiqbal
# python
# 4 months ago
# 7.4 kB
# 3
# Indexable
# Tokens
# Literal lexemes referenced by the tokenizer, the parse_* functions and the
# grammar tables in this file.
IF = 'if'
WHILE = 'while'
FOR = 'for'
IN = 'in'
COLON = ':'
OPEN_PAREN = '('
CLOSE_PAREN = ')'
COMMA = ','
SEMI_COLON = ';'
NEWLINE = '\n'
INDENT = '\t'

# Grammar symbols
# Plain-string names used inside productions to refer to other grammar symbols.
VARIABLE = 'VARIABLE'
EXPR = 'EXPR'
STATEMENT = 'STATEMENT'
ITERABLE = 'ITERABLE'

# Productions
# Each table is a list of alternatives; each alternative is a tuple of symbols.

# Patterns for iterative (and conditional) statements.
# At this point EXPR, VARIABLE, ITERABLE and STATEMENT are still the symbol
# names defined above, so these tuples contain plain strings only.
ITERATIVE_STATEMENT = [
    (WHILE, OPEN_PAREN, EXPR, CLOSE_PAREN, COLON, NEWLINE, STATEMENT),  # while (expr): \n\t statement
    (FOR, VARIABLE, IN, ITERABLE, COLON, NEWLINE, STATEMENT),  # for variable in iterable: \n\t statement
    (IF, EXPR, COLON, NEWLINE, STATEMENT),  # if expr: \n\t statement
]

# Start symbol of the grammar.
# Bound only after the table above exists: binding it to an empty placeholder
# list that is later replaced would leave `start` pointing at the stale,
# permanently empty list.
start = ITERATIVE_STATEMENT

# NOTE: the assignments below rebind EXPR, ITERABLE and STATEMENT from symbol
# names (strings) to production tables (lists). Inside the EXPR table the name
# EXPR still evaluates to the string 'EXPR'; inside ITERABLE and STATEMENT it
# already evaluates to the EXPR table itself.

# Patterns for expressions
EXPR = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]

# Patterns for iterables
ITERABLE = [
    (EXPR, COMMA, EXPR),
    (EXPR, SEMI_COLON, EXPR),
    (VARIABLE,),
]

# Patterns for statements
STATEMENT = [
    (EXPR,),
    (ITERATIVE_STATEMENT,),
]

# Tokenizer function
def tokenize(string):
    """Split source text into a flat list of token strings.

    Tokenization rules:

    * spaces separate tokens and are otherwise dropped;
    * every newline yields a NEWLINE token and every tab an INDENT token;
    * each character of ``()[]{}:,;`` is a token on its own;
    * a run of operator characters (``< > = ! + - * / %``) forms one token,
      so ``x<10`` yields ``x``, ``<``, ``10`` and ``a==b`` yields ``a``,
      ``==``, ``b`` even when no whitespace surrounds the operator;
    * any other run of characters forms one word token.

    Args:
        string: the source text to tokenize.

    Returns:
        A list of token strings in source order.
    """
    punctuation = '()[]{}:,;'
    operator_chars = '<>=!+-*/%'

    tokens = []
    current_token = ''
    for char in string:
        if char in ' \t\n':  # Whitespace always ends the pending token
            if current_token:
                tokens.append(current_token)
                current_token = ''
            if char == '\n':
                tokens.append(NEWLINE)
            elif char == '\t':  # Tabs are kept so the parser can see indentation
                tokens.append(INDENT)
        elif char in punctuation:  # Single-character punctuation tokens
            if current_token:
                tokens.append(current_token)
                current_token = ''
            tokens.append(char)
        else:
            # Words and operators are separate token classes: when the class
            # changes mid-run (e.g. "x<"), emit the pending token first so the
            # operator does not get glued onto its operand.
            if current_token and (
                (char in operator_chars) != (current_token[-1] in operator_chars)
            ):
                tokens.append(current_token)
                current_token = ''
            current_token += char

    if current_token:
        tokens.append(current_token)
    return tokens


# Parser function
def parse(tokens):
    """Parse one construct from the front of ``tokens`` and return its AST.

    The first token selects the parser: ``while``, ``for`` and ``if`` are
    consumed here and the rest is handed to the matching ``parse_*`` function;
    anything else is left in place and parsed as a general statement.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        Whatever the selected ``parse_*`` function returns.

    Raises:
        ValueError: if ``tokens`` is empty.
    """
    if not tokens:
        raise ValueError("No tokens to parse.")

    # Peek first: only a recognised keyword is removed from the stream.
    head = tokens[0]
    if head == WHILE:
        handler = parse_while
    elif head == FOR:
        handler = parse_for
    elif head == IF:
        handler = parse_if
    else:
        # Not a keyword: the statement parser needs the token itself.
        return parse_statement(tokens)

    tokens.pop(0)  # Consume the keyword
    return handler(tokens)

def parse_while(tokens):
    """Parse the remainder of a while-loop (the ``while`` keyword is already consumed).

    Expected token sequence: ``( expr ) : NEWLINE INDENT statement``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The tuple ``('while', condition, statement)`` where ``condition`` is
        the result of ``parse_expr`` and ``statement`` the result of
        ``parse_statement`` for the single body statement.

    Raises:
        ValueError: if a required token is missing or wrong, including when
            the input ends early.
    """
    # The ``not tokens`` guards turn truncated input into a ValueError, like
    # every other syntax error, instead of an IndexError from pop(0).
    if not tokens or tokens.pop(0) != OPEN_PAREN:
        raise ValueError('Expected ( after "while".')
    expr = parse_expr(tokens)  # Parse the condition expression
    if not tokens or tokens.pop(0) != CLOSE_PAREN:
        raise ValueError('Expected ) after expression.')
    if not tokens or tokens.pop(0) != COLON:
        raise ValueError('Expected : after ")".')
    if not tokens or tokens.pop(0) != NEWLINE:
        raise ValueError('Expected newline after ":".')

    # The body is exactly one statement introduced by one INDENT token.
    if not tokens or tokens[0] != INDENT:
        raise ValueError('Expected an indented statement in while-loop body.')
    tokens.pop(0)  # Consume indentation
    if not tokens:
        # Indentation with nothing after it is still a missing body.
        raise ValueError('Expected an indented statement in while-loop body.')
    statement = parse_statement(tokens)

    return ('while', expr, statement)

def parse_for(tokens):
    """Parse the remainder of a for-loop (the ``for`` keyword is already consumed).

    Expected token sequence:
    ``variable in iterable : NEWLINE (INDENT statement)+`` where ``iterable``
    is either a single token or ``range ( expr [, expr]* )``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The tuple ``('for', variable, iterable, body)``. ``iterable`` is the
        raw iterable token, or ``('range', *args)`` for a ``range(...)`` call;
        ``body`` is the list of parsed body statements.

    Raises:
        ValueError: if a required token is missing or wrong, including when
            the input ends early or ``range(`` is never closed.
    """
    # The ``not tokens`` guards turn truncated input into a ValueError, like
    # every other syntax error, instead of an IndexError from pop(0).
    if not tokens:
        raise ValueError('Expected a variable after "for".')
    variable = tokens.pop(0)
    if not variable.isidentifier():
        raise ValueError(f'Expected a valid variable, got: {variable}')

    if not tokens or tokens.pop(0) != IN:
        raise ValueError('Expected "in" after variable.')

    if not tokens:
        raise ValueError('Expected an iterable after "in".')
    iterable_token = tokens.pop(0)
    if iterable_token == 'range':
        if not tokens or tokens.pop(0) != OPEN_PAREN:
            raise ValueError('Expected ( after "range".')

        range_args = []
        while tokens and tokens[0] != CLOSE_PAREN:
            range_args.append(parse_expr(tokens))
            if tokens and tokens[0] == COMMA:
                tokens.pop(0)  # Consume ','

        if not tokens:
            raise ValueError('Expected ) to close "range(".')
        tokens.pop(0)  # Consume ')'
        expr = ('range', *range_args)
    else:
        expr = iterable_token

    if not tokens or tokens.pop(0) != COLON:
        raise ValueError('Expected : after iterable.')

    if not tokens or tokens.pop(0) != NEWLINE:
        raise ValueError('Expected newline after ":".')

    # Parse the body of the for-loop: one statement per indented line.
    body = []
    while tokens and tokens[0] == INDENT:
        tokens.pop(0)  # Consume indentation
        if not tokens:
            break  # Indentation at end of input carries no statement
        body.append(parse_statement(tokens))
        # Body lines are separated by NEWLINE. Consume it only when the next
        # line is a one-INDENT line with content, so the loop can pick it up;
        # anything else is left untouched for the caller.
        if (
            len(tokens) >= 3
            and tokens[0] == NEWLINE
            and tokens[1] == INDENT
            and tokens[2] not in (NEWLINE, INDENT)
        ):
            tokens.pop(0)

    if not body:
        raise ValueError('Expected at least one indented statement in for-loop body.')

    return ('for', variable, expr, body)

def parse_if(tokens):
    """Parse the remainder of an if-statement (the ``if`` keyword is already consumed).

    Expected token sequence: ``expr : NEWLINE (INDENT statement)+``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The tuple ``('if', condition, body)`` where ``condition`` is the
        result of ``parse_expr`` and ``body`` is the list of parsed body
        statements.

    Raises:
        ValueError: if a required token is missing or wrong, including when
            the input ends early.
    """
    expr = parse_expr(tokens)
    # The ``not tokens`` guards turn truncated input into a ValueError, like
    # every other syntax error, instead of an IndexError from pop(0).
    if not tokens or tokens.pop(0) != COLON:
        raise ValueError('Expected : after expression.')
    if not tokens or tokens.pop(0) != NEWLINE:
        raise ValueError('Expected newline after ":".')

    # Parse the body of the if-statement: one statement per indented line.
    body = []
    while tokens and tokens[0] == INDENT:
        tokens.pop(0)  # Consume indentation
        if not tokens:
            break  # Indentation at end of input carries no statement
        body.append(parse_statement(tokens))
        # Body lines are separated by NEWLINE. Consume it only when the next
        # line is a one-INDENT line with content, so the loop can pick it up;
        # anything else is left untouched for the caller.
        if (
            len(tokens) >= 3
            and tokens[0] == NEWLINE
            and tokens[1] == INDENT
            and tokens[2] not in (NEWLINE, INDENT)
        ):
            tokens.pop(0)

    if not body:
        # The message names the if-statement, not the for-loop.
        raise ValueError('Expected at least one indented statement in if-statement body.')

    return ('if', expr, body)

def parse_expr(tokens):
    """Parse a chain of primaries joined by binary operators.

    All operators share a single precedence level and associate to the left,
    so ``a + b * c`` becomes ``('binop', '*', ('binop', '+', a, b), c)``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The result of ``parse_primary`` when no operator follows, otherwise a
        nested ``('binop', operator, left, right)`` tuple.

    Raises:
        ValueError: if ``tokens`` is empty on entry.
    """
    if not tokens:
        raise ValueError("Empty tokens in expression.")

    binary_operators = ('<', '>', '==', '!=', '+', '-', '*', '/', '%')

    tree = parse_primary(tokens)
    while True:
        # Stop at end of input or at the first token that is not an operator.
        if not tokens or tokens[0] not in binary_operators:
            return tree
        symbol = tokens.pop(0)
        # Fold the new operand into the tree on the left-hand side.
        tree = ('binop', symbol, tree, parse_primary(tokens))

def parse_primary(tokens):
    """Parse a primary expression: an identifier, a digit string, or ``( expr )``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The token itself for an identifier or digit string, or the result of
        ``parse_expr`` for a parenthesised expression.

    Raises:
        ValueError: if the input is exhausted, the closing ``)`` is missing,
            or the token cannot start a primary expression.
    """
    if not tokens:
        # An operator at the very end of input (e.g. "x <") reaches here with
        # nothing left; report it as a syntax error rather than letting
        # pop(0) raise IndexError.
        raise ValueError('Unexpected end of input in primary expression.')
    token = tokens.pop(0)
    if token.isidentifier() or token.isdigit():
        return token
    elif token == OPEN_PAREN:
        expr = parse_expr(tokens)
        if not tokens or tokens.pop(0) != CLOSE_PAREN:
            raise ValueError('Expected closing )')
        return expr
    else:
        raise ValueError(f"Unexpected token in primary expression: {token}")

def parse_statement(tokens):
    """Parse a general statement from the front of ``tokens``.

    Recognised forms:

    * a ``while`` / ``for`` / ``if`` construct, delegated to ``parse``;
    * a call ``name ( expr [, expr]* )``, returned as ``('call', name, args)``;
    * a bare identifier, returned as ``('expr', name)``.

    Args:
        tokens: list of token strings; consumed from the front in place.

    Returns:
        The AST tuple for the statement as described above.

    Raises:
        ValueError: if the input is exhausted, a call is never closed, or the
            first token cannot start a statement.
    """
    if not tokens:
        # Report truncated input as a syntax error instead of letting
        # pop(0) raise IndexError.
        raise ValueError('Expected a statement but reached end of input.')

    # Keywords must be tested BEFORE isidentifier(): 'while', 'for' and 'if'
    # are themselves valid identifiers for str.isidentifier(), so checking
    # identifiers first makes the nested-structure branch unreachable.
    if tokens[0] in {WHILE, FOR, IF}:
        # Leave the keyword in place; parse() dispatches on it.
        return parse(tokens)

    token = tokens.pop(0)
    if token.isidentifier():
        if tokens and tokens[0] == OPEN_PAREN:
            tokens.pop(0)  # Consume '('
            args = []
            while tokens and tokens[0] != CLOSE_PAREN:
                args.append(parse_expr(tokens))
                if tokens and tokens[0] == COMMA:
                    tokens.pop(0)  # Consume ','
            if not tokens:
                raise ValueError(f'Expected ) to close call to {token}.')
            tokens.pop(0)  # Consume ')'
            return ('call', token, args)
        return ('expr', token)

    raise ValueError(f'Invalid token in statement: {token}')

# Main execution block!
# Demo driver: tokenize and parse a few sample snippets when run as a script.
if __name__ == '__main__':
    samples = (
        "for x in range(10):\n\tprint(x)",
        "while (x < 10):\n\tprint(x)",
        "if (x < 10):\n\tprint(x)",
    )
    for source in samples:
        token_list = tokenize(source)
        # Print before parsing: the parser consumes the list in place.
        print("Tokens:", token_list)
        try:
            print("Parsed AST:", parse(token_list))
        except ValueError as e:
            print("Parsing error:", e)
        print()
# Editor is loading...
# Leave a Comment