Untitled
unknown
plain_text
a year ago
2.9 kB
10
Indexable
import logging
import re

import pandas as pd
def get_paragraph_idx(data: list) -> dict:
    """
    Map every statement's sequence number to the paragraph that contains it.

    A paragraph header is a line that, once stripped of surrounding
    whitespace, looks like '0000-MAIN.' or '1234-ABCD.' (four digits, a
    hyphen, a name made of word characters/hyphens, and a trailing period).
    Any line containing 'EXIT' is never treated as a header, so a line such
    as '0000-EXIT.' stays inside the paragraph that precedes it.

    Parameters:
        data : List : List of statements (strings), in source order

    Returns:
        Dict : Dictionary with one entry per statement. Key is the seq no
               (0-based position in ``data``); value is the name of the
               enclosing paragraph (the header text without its trailing
               period). Statements that come before the first header map
               to NaN. An empty ``data`` yields an empty dictionary.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # Regex pattern to capture paragraphs like '0000-MAIN.', '1234-ABCD.'
    paragraph_pattern = re.compile(r'^(\d{4}-[\w-]+)\.$')

    paragraph_dict = {}
    # NaN (rather than None) marks "no paragraph seen yet" so that callers
    # which still run pd.Series(result).ffill() on the result keep working.
    current_paragraph = float('nan')

    for seq_no, statement in enumerate(data):
        # Lines mentioning EXIT may close a paragraph but never open one.
        if 'EXIT' not in statement:
            header = paragraph_pattern.match(statement.strip())
            if header:
                current_paragraph = header.group(1)
        # Every line, header or not, belongs to the most recent paragraph.
        paragraph_dict[seq_no] = current_paragraph

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
# Demo: run the extractor on a small sample listing
data = [
    "0000-MAIN.",
    "MOVE A TO B.",
    "ADD 1 TO C.",
    "MULTIPLY X BY Y.",
    "0000-EXIT.",
    "0002-MAIN.",
    "COMPUTE D = E + F.",
    "MOVE Z TO Y.",
    "DISPLAY 'HELLO'.",
    "0003-MAIN-POERT.",
    "MOVE",
    "ADD",
]
result = get_paragraph_idx(data)

# Forward-fill so any entry without a name of its own inherits the most
# recent paragraph name that precedes it.
filled_series = pd.Series(result).ffill()
output_dict = filled_series.to_dict()

# Show the final seq-no -> paragraph-name mapping
print(output_dict)
Editor is loading...
Leave a Comment