Untitled
unknown
plain_text
9 months ago
2.0 kB
6
Indexable
import pandas as pd
import logging
def get_paragraph_idx(self, data: list) -> dict:
    """
    Map every statement index that lies inside a paragraph to that paragraph.

    A *paragraph header* is a statement that, once stripped, consists of a
    single word (letters, digits, ``_`` or ``-``) followed by a period and
    that does not contain the word ``EXIT``.  The paragraph runs from its
    header up to and including:

    1. the first EXIT header (a header-shaped statement containing ``EXIT``,
       e.g. ``MAIN-EXIT.`` or ``EXIT.``) found before the next paragraph
       header, or, when there is none,
    2. the first standalone ``.`` statement after the header.

    Paragraphs for which neither end marker exists are omitted.  When the
    period fallback makes two paragraphs overlap, the later paragraph wins
    for the shared indices.

    Key   : Sequence number (0-based position in ``data``)
    Value : Stripped header statement of the owning paragraph, exactly as
            written in ``data`` (including the trailing period)

    Parameters:
        data : List[str] : List of COBOL statements

    Returns:
        Dict[int, str] : Dictionary mapping sequence numbers to paragraph
        header statements.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # dtype=object keeps the .str accessor usable when `data` is empty.
    statements = pd.Series(data, dtype=object).str.strip()

    paragraph_pattern = r'^\s*([\w-]+)\s*\.$'

    # Per-statement classification.  Converted to plain Python lists so the
    # pairing below relies on positions only, never on pandas index alignment
    # (the misaligned merge/comparison was what made the old version raise).
    header_flags = (
        statements.str.extract(paragraph_pattern, expand=False).notna().tolist()
    )
    # na=False: non-string entries count as "no EXIT" instead of yielding NaN.
    exit_flags = statements.str.contains(r'\bEXIT\b', regex=True, na=False).tolist()
    period_flags = (statements == ".").tolist()
    stripped = statements.tolist()

    # Walk backwards so that, on reaching a paragraph header, the nearest end
    # markers that follow it are already known.
    spans = []           # (start, end) pairs, collected last paragraph first
    nearest_exit = None  # closest EXIT header before the next paragraph header
    nearest_period = None  # closest standalone "." anywhere after this point
    for pos in range(len(stripped) - 1, -1, -1):
        if period_flags[pos]:
            nearest_period = pos
        elif header_flags[pos] and exit_flags[pos]:
            nearest_exit = pos
        elif header_flags[pos]:
            end = nearest_exit if nearest_exit is not None else nearest_period
            if end is not None:
                spans.append((pos, end))
            # An EXIT located after this header belongs to this paragraph
            # only; earlier paragraphs must not reach past this header for it.
            nearest_exit = None

    # Fill in source order so a later paragraph overrides an earlier one
    # wherever their spans overlap.
    paragraph_dict = {}
    for start, end in reversed(spans):
        header_text = stripped[start]
        for seq in range(start, end + 1):
            paragraph_dict[seq] = header_text

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
Editor is loading...
Leave a Comment