Untitled
import logging
from bisect import bisect_right

import pandas as pd

logging.basicConfig(level=logging.DEBUG)

# A paragraph header is a lone upper-case/digit name, optionally hyphenated,
# terminated by a period. The negative lookahead rejects any line containing
# the whole word EXIT, so "nnnn-EXIT." lines never open a paragraph.
_PARAGRAPH_PATTERN = r'^(?!.*\bEXIT\b)[A-Z0-9]+(-[A-Z0-9]+)*\.$'

# A paragraph is closed by a line holding only "." or by a line that contains
# the word EXIT and ends with a period. The group is non-capturing because
# pandas' str.contains emits a UserWarning for patterns with capture groups.
_END_MARKER_PATTERN = r'(?:^\.$|\bEXIT\b.*\.$)'


def get_paragraph_idx(data: list) -> dict:
    """
    Map COBOL line indexes to the name of the paragraph they belong to.

    A paragraph starts at a header line (see _PARAGRAPH_PATTERN) and runs up
    to the first end marker after it (see _END_MARKER_PATTERN). If the next
    header, or the end of the input, comes before any end marker, the
    paragraph runs up to the line just before that header, or to the last
    line. Lines outside every paragraph are left out of the result.
    Surrounding whitespace is ignored when classifying a line, and the
    paragraph name is stored stripped, including its trailing period.

    Parameters:
        data : List[str] : List of COBOL statements. Entries that are not
            strings are never treated as headers or end markers.

    Returns:
        Dict[int, str] : Dictionary mapping line index to paragraph name.
        Empty when the input is empty or contains no paragraph header.
    """
    logging.debug('Paragraph Name Extraction ==> Started')
    paragraph_dict = {}

    # Nothing to scan. Returning here also keeps the .str accessor away from
    # an empty Series, whose inferred dtype depends on the pandas version.
    if len(data) == 0:
        logging.debug('Paragraph Name Extraction ==> Completed')
        return paragraph_dict

    # object dtype keeps the elements as plain Python objects, so the .str
    # methods match with Python's `re` module (the header pattern relies on
    # a negative lookahead).
    stripped = pd.Series(data, dtype=object).str.strip()
    is_start = stripped.str.match(_PARAGRAPH_PATTERN, na=False)
    is_end = stripped.str.contains(_END_MARKER_PATTERN, regex=True, na=False)

    headers = stripped[is_start]
    start_indices = headers.index.tolist()
    start_names = headers.tolist()
    # Ascending by construction, which is what bisect_right requires.
    end_markers = stripped[is_end].index.tolist()

    # Pair every header with the index of the header that follows it; the
    # last header is paired with len(data) so it may run to the final line.
    next_starts = start_indices[1:] + [len(data)]

    for start_idx, paragraph_name, next_start in zip(
        start_indices, start_names, next_starts
    ):
        end_idx = next_start - 1

        # First end marker strictly after the header, if there is one. It
        # only shortens the paragraph when it comes before the next header.
        pos = bisect_right(end_markers, start_idx)
        if pos < len(end_markers) and end_markers[pos] < end_idx:
            end_idx = end_markers[pos]

        for line_idx in range(start_idx, end_idx + 1):
            paragraph_dict[line_idx] = paragraph_name

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict


# Example Input
data = [
    "0000-MAINLINE.",
    "Some COBOL code...",
    "More COBOL statements...",
    "0000-EXIT.",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "A-B-C-123.",
    "Some other lines...",
    "Another line...",
    ".",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "100-START.",
    "Processing data...",
    "100-EXIT.",
    "ADGSDFGFH",  # Skipped
    "700-OPIUI.",
    "IHUOUOUOU",
    "iopipoi",
    "200-PROCESS.",
    "Logic continues...",
    ".",
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "456-ADF.",
    "STRHR",
    ".",
    "RTUR3456TUYu",
    "RIYTUII",
]

# Run function and print output
result = get_paragraph_idx(data)
print(result)
Leave a Comment