# Paste-site page residue (not part of the program):
# Untitled | mail@pastecode.io avatar | unknown | plain_text | 18 days ago | 2.9 kB | 3 | Indexable | Never
import pandas as pd
import logging

def get_paragraph_idx(data: list) -> dict:
    """
    Function to map every statement's seq number to the paragraph it belongs to.
    Key  : seq number
    Value: paragraph name

    A paragraph header is a line that, once surrounding whitespace is
    stripped, looks like '0000-MAIN.' or '1234-ABCD.' (four digits, a hyphen,
    a name made of word characters/hyphens, and a trailing period) and does
    not contain the text 'EXIT'. Every line from a header up to the line
    before the next header is assigned to that header's paragraph; this
    includes lines such as '0000-EXIT.', which never start a paragraph.

    Parameters:
    data : List : List of statements

    Returns:
    Dict : Dictionary with one entry per statement. The key is the seq no
           (position of the statement in ``data``) and the value is the
           paragraph name without its trailing period. Statements that come
           before the first paragraph header map to NaN. An empty ``data``
           returns an empty dictionary.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # Regex pattern to capture paragraphs like '0000-MAIN.', '1234-ABCD.'
    paragraph_pattern = r'^(\d{4}-[\w-]+)\.$'

    # dtype=object keeps the .str accessor usable even for an empty list.
    statements = pd.Series(data, dtype=object)

    if statements.empty:
        paragraph_dict = {}
    else:
        # Any line containing 'EXIT' is never treated as a paragraph start,
        # so it stays attached to the paragraph that precedes it.
        # na=False makes non-string entries count as "not an EXIT line".
        is_exit = statements.str.contains('EXIT', regex=False, na=False)

        # Paragraph name on header lines, NaN on every other line.
        headers = statements.str.strip().str.extract(paragraph_pattern, expand=False)
        headers = headers.where(~is_exit)

        # Carry each paragraph name forward until the next header appears.
        paragraph_dict = headers.ffill().to_dict()

    logging.debug('Paragraph Name Extraction ==> Completed')

    return paragraph_dict


# Example usage: a sample statement listing with three paragraph headers,
# only the first of which is followed by an explicit EXIT line.
data = [
    # First paragraph, closed by '0000-EXIT.'
    "0000-MAIN.", "MOVE A TO B.", "ADD 1 TO C.", "MULTIPLY X BY Y.", "0000-EXIT.",
    # Second paragraph, no EXIT line before the next header
    "0002-MAIN.", "COMPUTE D = E + F.", "MOVE Z TO Y.", "DISPLAY 'HELLO'.",
    # Third paragraph, runs to the end of the listing
    "0003-MAIN-POERT.", "MOVE", "ADD",
]

result = get_paragraph_idx(data)

# Forward-fill any NaN entries so each seq number carries the most recent
# paragraph name, then convert back to a plain dictionary.
output_dict = (
    pd.Series(result)
    .ffill()
    .to_dict()
)

# Show the seq number -> paragraph name mapping
print(output_dict)
# Paste-site page residue (not part of the program): Leave a Comment