Untitled

 avatar
unknown
plain_text
8 days ago
3.3 kB
1
Indexable
import pandas as pd
import logging

logging.basicConfig(level=logging.DEBUG)

def get_paragraph_idx(data: list) -> dict:
    """
    Map COBOL line indexes to the name of the paragraph they belong to.

    A paragraph starts on a line that, once stripped of surrounding
    whitespace, is only a paragraph label: upper-case alphanumeric segments
    joined by hyphens and terminated by a period (e.g. ``100-START.``).
    Labels containing the word ``EXIT`` are not paragraph starts.

    A paragraph ends at whichever of these comes first after its start line:
      * an end marker, i.e. a line that is only ``.`` or that contains the
        word ``EXIT`` and ends with a period (the marker line itself is
        included in the paragraph);
      * the line just before the next paragraph label;
      * the last line of ``data``.

    Lines that do not fall inside any paragraph are left out of the result.

    Parameters:
    data : List[str] : List of COBOL statements.

    Returns:
    Dict[int, str] : Dictionary mapping 0-based line index to the stripped
                     paragraph label (trailing period included).
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # Nothing to scan. Returning early also avoids depending on how pandas
    # infers the dtype of an empty Series before the .str accessor is used.
    if len(data) == 0:
        logging.debug('Paragraph Name Extraction ==> Completed')
        return {}

    # Strip once and reuse the result for both pattern checks.
    # object dtype: the string methods then run on Python's `re`, regardless
    # of which default string dtype the installed pandas would infer.
    stripped = pd.Series(data, dtype=object).str.strip()

    # Non-capturing groups: the patterns are only used as yes/no tests, and
    # pandas warns when a pattern passed to str.contains has capture groups.
    paragraph_pattern = r'^(?!.*\bEXIT\b)[A-Z0-9]+(?:-[A-Z0-9]+)*\.$'
    end_pattern = r'(?:^\.$|\bEXIT\b.*\.$)'

    # na=False makes missing values count as "no match".
    labels = stripped[stripped.str.match(paragraph_pattern, na=False)]
    start_indices = labels.index.tolist()
    start_names = labels.tolist()

    end_mask = stripped.str.contains(end_pattern, regex=True, na=False)
    end_indices = stripped[end_mask].index.tolist()

    paragraph_dict = {}
    last_line = len(data) - 1
    n_starts = len(start_indices)
    n_ends = len(end_indices)

    # Both index lists are ascending, so one forward-only cursor over the end
    # markers finds "first marker after this start" for every paragraph.
    end_pos = 0

    for i, (start_idx, paragraph_name) in enumerate(zip(start_indices, start_names)):
        # Default end: the line before the next label, or the end of input.
        if i + 1 < n_starts:
            end_idx = start_indices[i + 1] - 1
        else:
            end_idx = last_line

        # Skip markers at or before this start; they cannot close it.
        while end_pos < n_ends and end_indices[end_pos] <= start_idx:
            end_pos += 1

        # An end marker that comes before the default end closes the
        # paragraph early (the marker line itself is kept).
        if end_pos < n_ends and end_indices[end_pos] < end_idx:
            end_idx = end_indices[end_pos]

        # Assign the paragraph name to all of its lines.
        for r in range(start_idx, end_idx + 1):
            paragraph_dict[r] = paragraph_name

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict

# Example input: a small COBOL-like listing used to demonstrate
# get_paragraph_idx. Lines marked "Skipped" fall outside every paragraph and
# are therefore absent from the returned mapping.
data = [
    "0000-MAINLINE.",
    "Some COBOL code...",
    "More COBOL statements...",
    "0000-EXIT.",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "A-B-C-123.",
    "Some other lines...",
    "Another line...",
    ".",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "100-START.",
    "Processing data...",
    "100-EXIT.",
    "ADGSDFGFH",  # Skipped
    "700-OPIUI.",
    "IHUOUOUOU",
    "iopipoi",
    "200-PROCESS.",
    "Logic continues...",
    ".",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "456-ADF.",
    "STRHR",
    ".",
    "RTUR3456TUYu",  # Skipped
    "RIYTUII",  # Skipped
]

# Run the demo only when this file is executed as a script, so that importing
# the module does not compute or print anything.
if __name__ == "__main__":
    result = get_paragraph_idx(data)
    print(result)
Leave a Comment