# Untitled

import bisect
import logging

import pandas as pd

def get_paragraph_idx(self, data: list) -> dict:
    """
    Map every line that belongs to a COBOL paragraph to that paragraph's header.

    Key   : Sequence number (0-based position of the line in ``data``)
    Value : Paragraph header line, stripped of surrounding whitespace
            (it keeps its terminating period, e.g. ``"1000-INIT."``)

    Line classification (applied to the stripped text, case-sensitive):
      * header      : a lone ``NAME.`` token (letters, digits, ``_``, ``-``)
                      that does not contain the word EXIT
      * EXIT line   : a lone ``NAME.`` token that contains the word EXIT
                      (``EXIT.`` or ``1000-INIT-EXIT.``)
      * lone period : a line that is exactly ``.``

    A paragraph starts at its header and ends (inclusive) at:
      1. the first EXIT line found before the next header, otherwise
      2. the first lone period after the header, capped at the line before
         the next header so paragraphs never overlap.
    Headers with neither kind of terminator after them are left out.

    Parameters:
    data : List[str] : List of COBOL statements

    Returns:
    Dict[int, str] : Dictionary mapping sequence numbers to paragraph headers.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # dtype=object keeps the .str accessor usable for an empty `data`
    # regardless of the pandas default dtype for empty Series.
    lines = pd.Series(data, dtype=object).str.strip()

    # No capture group here, so pandas does not warn about unused groups;
    # na=False makes non-string entries count as "no match".
    is_header = lines.str.match(r'[\w-]+\s*\.$', na=False)
    has_exit = lines.str.contains(r'\bEXIT\b', regex=True, na=False)

    def positions(mask):
        # Positions equal labels because `lines` has the default 0..n-1 index.
        return [pos for pos, flag in enumerate(mask) if flag]

    starts = positions(is_header & ~has_exit)
    # Only EXIT *paragraph* lines close a paragraph; a statement that merely
    # mentions EXIT (e.g. "PERFORM A THRU A-EXIT.") is ordinary body text.
    exits = positions(is_header & has_exit)
    periods = positions(lines == ".")

    paragraph_dict = {}
    for order, start in enumerate(starts):
        has_successor = order + 1 < len(starts)
        next_start = starts[order + 1] if has_successor else len(lines)

        # Prefer an EXIT line that sits inside this paragraph's own span.
        slot = bisect.bisect_right(exits, start)
        if slot < len(exits) and exits[slot] < next_start:
            end = exits[slot]
        else:
            # Fall back to the next lone period anywhere after the header.
            slot = bisect.bisect_right(periods, start)
            if slot == len(periods):
                continue  # no terminator at all: skip this header
            end = min(periods[slot], next_start - 1)

        header = lines.iat[start]
        for line_no in range(start, end + 1):
            paragraph_dict[line_no] = header

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
# Editor is loading...