Untitled
unknown
plain_text
9 months ago
3.3 kB
7
Indexable
import pandas as pd
import logging
logging.basicConfig(level=logging.DEBUG)
def get_paragraph_idx(data: list) -> dict:
    """Map COBOL line indexes to the name of the paragraph containing them.

    Every line is stripped of surrounding whitespace before it is classified:

    * A *paragraph header* consists solely of upper-case alphanumeric words
      joined by hyphens and terminated by a period (e.g. ``0000-MAINLINE.``),
      and must not contain the word ``EXIT``.
    * An *end marker* is either a line holding only ``.`` or any line that
      contains the word ``EXIT`` and ends with a period (e.g. ``0000-EXIT.``).

    A paragraph starts at its header and ends at whichever comes first: the
    first end marker after the header, the line just before the next header,
    or the last line of ``data``. Lines outside every such span are left out
    of the result.

    Parameters:
        data : List[str] : List of COBOL statements.

    Returns:
        Dict[int, str] : Dictionary mapping line index to the stripped
        paragraph header (trailing period included).
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    paragraph_dict = {}
    if len(data) == 0:
        # Nothing to classify.
        logging.debug('Paragraph Name Extraction ==> Completed')
        return paragraph_dict

    # Force object dtype so the patterns below are evaluated by Python's
    # `re` engine (the header pattern relies on a negative lookahead).
    stripped = pd.Series(data, dtype=object).str.strip()

    # Only non-capturing groups are used: `str.contains` emits a UserWarning
    # when the pattern contains capture groups.
    header_pattern = r'^(?!.*\bEXIT\b)[A-Z0-9]+(?:-[A-Z0-9]+)*\.$'
    end_pattern = r'^\.$|\bEXIT\b.*\.$'

    headers = stripped[stripped.str.match(header_pattern, na=False)]
    start_indices = headers.index.tolist()
    names = headers.tolist()
    end_markers = stripped[
        stripped.str.contains(end_pattern, regex=True, na=False)
    ].index.tolist()

    # Each paragraph may run at most up to the line before the next header;
    # the last paragraph may run to the final line of the input.
    next_starts = start_indices[1:] + [len(data)]

    # Headers and end markers are both in ascending order, so one forward
    # sweep over the markers serves every header.
    marker_pos = 0
    marker_count = len(end_markers)
    for start_idx, next_start, name in zip(start_indices, next_starts, names):
        end_idx = next_start - 1

        # Advance to the first end marker located after this header.
        while marker_pos < marker_count and end_markers[marker_pos] <= start_idx:
            marker_pos += 1

        # Cut the paragraph short when that marker precedes the next header.
        if marker_pos < marker_count and end_markers[marker_pos] < end_idx:
            end_idx = end_markers[marker_pos]

        for line_idx in range(start_idx, end_idx + 1):
            paragraph_dict[line_idx] = name

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
# Example input: paragraph headers, end markers (EXIT lines and lone "."),
# and stray lines that fall outside every paragraph.
data = [
    "0000-MAINLINE.",
    "Some COBOL code...",
    "More COBOL statements...",
    "0000-EXIT.",  # End marker for 0000-MAINLINE
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "A-B-C-123.",
    "Some other lines...",
    "Another line...",
    ".",  # End marker for A-B-C-123
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "100-START.",
    "Processing data...",
    "100-EXIT.",  # End marker for 100-START
    "ADGSDFGFH",  # Skipped
    "700-OPIUI.",  # No end marker: runs up to the next header
    "IHUOUOUOU",
    "iopipoi",
    "200-PROCESS.",
    "Logic continues...",
    ".",  # End marker for 200-PROCESS
    "ASDFADGDAFG",  # Skipped
    "yiuy56756",  # Skipped
    "456-ADF.",
    "STRHR",
    ".",  # End marker for 456-ADF
    "RTUR3456TUYu",  # Skipped
    "RIYTUII",  # Skipped
]

if __name__ == "__main__":
    # Run the example only when executed as a script, so that importing this
    # module does not trigger the demo call and its output.
    result = get_paragraph_idx(data)
    print(result)
Editor is loading...
Leave a Comment