Untitled
unknown
plain_text
5 months ago
2.0 kB
5
Indexable
import logging

import pandas as pd


def _first_between(sorted_positions, after, before):
    """Return the smallest value v in *sorted_positions* with after < v < before, else None."""
    slot = sorted_positions.searchsorted(after, side='right')
    if slot < len(sorted_positions) and sorted_positions[slot] < before:
        return int(sorted_positions[slot])
    return None


def get_paragraph_idx(self, data: list) -> dict:
    """
    Map every line that belongs to a paragraph to that paragraph's header.

    A line is a *label* when, after stripping, it is a single token made of
    word characters / hyphens followed by a period (e.g. ``MAIN-PARA.``).
    Labels that contain the word ``EXIT`` (``EXIT.``, ``1000-EXIT.``) are end
    markers; every other label starts a paragraph.

    The end of a paragraph is resolved per header:
      1. the first EXIT label located after the header and before the next
         paragraph header, if there is one;
      2. otherwise the first line consisting of a lone ``.`` after the header;
      3. otherwise the paragraph is left out of the result.
    A range never runs past the line preceding the next paragraph header, so
    ranges of different paragraphs do not overlap.

    Key   : Sequence number (0-based position in ``data``)
    Value : Stripped header line exactly as written, trailing period included

    Parameters:
    data : List[str] : List of COBOL statements. Non-string entries are
                       treated as ordinary (non-label) lines.

    Returns:
    Dict[int, str] : Dictionary mapping sequence numbers to paragraph headers.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    paragraph_pattern = r'^\s*([\w-]+)\s*\.$'

    # dtype=object keeps the .str accessor usable even when ``data`` is empty.
    lines = pd.Series(data, dtype='object').str.strip()

    # na=False turns non-string entries into plain "no match" instead of NaN,
    # which would otherwise poison the boolean masks.
    label_mask = lines.str.match(paragraph_pattern, na=False).to_numpy(dtype=bool)
    exit_mask = lines.str.contains(r'\bEXIT\b', regex=True, na=False).to_numpy(dtype=bool)
    period_mask = (lines == '.').to_numpy(dtype=bool)

    # Positional indices (ascending) of each kind of line.
    starts = (label_mask & ~exit_mask).nonzero()[0]
    # Only label-shaped EXIT lines count as end markers; a statement that
    # merely mentions an exit paragraph (PERFORM ... THRU X-EXIT.) does not.
    exits = (label_mask & exit_mask).nonzero()[0]
    periods = period_mask.nonzero()[0]

    total = len(lines)
    start_list = starts.tolist()
    # Pair every header with the header that follows it (or end of input).
    next_starts = start_list[1:] + [total]

    paragraph_dict = {}
    for start, next_start in zip(start_list, next_starts):
        end = _first_between(exits, start, next_start)
        if end is None:
            # No EXIT inside this paragraph: fall back to the next lone ".".
            end = _first_between(periods, start, total)
        if end is None:
            # No usable end marker: the paragraph is not recorded.
            continue

        # Stop before the next header so paragraphs never overlap.
        end = min(end, next_start - 1)
        header_text = lines.iat[start]
        paragraph_dict.update(dict.fromkeys(range(start, end + 1), header_text))

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
Editor is loading...
Leave a Comment