Untitled
unknown
plain_text
18 days ago
2.9 kB
3
Indexable
Never
import logging
import re

import pandas as pd

logger = logging.getLogger(__name__)

# A paragraph header is a whole line such as '0000-MAIN.' or '1234-ABCD.':
# four digits, a hyphen, word characters/hyphens, and a terminating period.
_PARAGRAPH_PATTERN = re.compile(r'^(\d{4}-[\w-]+)\.$')


def get_paragraph_idx(data: list) -> dict:
    """Map each statement's sequence number to the paragraph it belongs to.

    A statement starts a new paragraph when, after stripping surrounding
    whitespace, it matches ``NNNN-NAME.`` and does not contain the text
    ``EXIT``. Every following statement (including ``...-EXIT.`` lines)
    belongs to that paragraph until the next paragraph header.

    Parameters:
        data : List : List of statements, in source order.

    Returns:
        Dict : Keys are the 0-based positions of the statements in ``data``;
               values are the owning paragraph name without the trailing
               period. Statements that appear before the first paragraph
               header map to ``None``. An empty ``data`` yields ``{}``.
    """
    logger.debug('Paragraph Name Extraction ==> Started')

    paragraph_dict = {}
    current_paragraph = None
    for index, line in enumerate(data):
        # Lines containing 'EXIT' are never paragraph starts, and non-string
        # entries cannot be headers; both simply inherit the current paragraph.
        if isinstance(line, str) and 'EXIT' not in line:
            header = _PARAGRAPH_PATTERN.match(line.strip())
            if header:
                current_paragraph = header.group(1)
        # Assign the paragraph directly so callers get complete values and
        # do not need to forward-fill the result themselves.
        paragraph_dict[index] = current_paragraph

    logger.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict


if __name__ == "__main__":
    # Example usage
    data = [
        "0000-MAIN.",
        "MOVE A TO B.",
        "ADD 1 TO C.",
        "MULTIPLY X BY Y.",
        "0000-EXIT.",
        "0002-MAIN.",
        "COMPUTE D = E + F.",
        "MOVE Z TO Y.",
        "DISPLAY 'HELLO'.",
        "0003-MAIN-POERT.",
        "MOVE",
        "ADD",
    ]

    output_dict = get_paragraph_idx(data)

    # Print the output dictionary
    print(output_dict)
Leave a Comment