Untitled

mail@pastecode.io avatar
unknown
plain_text
25 days ago
1.3 kB
4
Indexable
Never
def get_paragraph_idx(self, data: list) -> dict:
    """
    Map every statement position inside a paragraph to that paragraph's name.

    A statement is treated as a paragraph header when, after removing an
    optional trailing ``SKIP1`` token and surrounding whitespace, it starts
    with a word character and ends with a period. Headers containing
    ``EXIT`` close a paragraph; all other headers open one. An opening
    header is paired with a closing header when the text before their
    first ``-`` is identical (e.g. ``1000-INIT.`` and ``1000-EXIT.``).

    Parameter:
    data : List : List of statements. Non-string entries are ignored.

    Returns:
    Dict : Dictionary keyed by statement position (0-based position in
           ``data``), covering the opening header through the closing
           header inclusive. Each value is the whitespace-stripped text
           of the opening header exactly as it appears in ``data`` (any
           trailing ``SKIP1`` text is kept). Openers without a matching
           closer contribute no entries. When ranges overlap, the pair
           processed last wins.
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # dtype=object keeps the .str accessor usable when `data` is empty
    # (older pandas infers float64 for an empty list).
    lines = pd.Series(data, dtype=object)

    # Remove only a literal trailing 'SKIP1' token. str.rstrip('SKIP1')
    # would strip any run of the characters S/K/I/P/1, turning e.g.
    # '... + 1.1' into '... + 1.' and producing false headers.
    # NOTE(review): SKIP1 is presumably a COBOL listing directive - confirm.
    cleaned = lines.str.replace(r'\s*SKIP1\s*$', '', regex=True).str.rstrip()
    is_header = cleaned.str.match(r'^\w.*\.$', na=False).astype(bool)
    headers = lines[is_header]

    has_exit = headers.str.contains('EXIT')
    starts = headers[~has_exit].str.strip()
    ends = headers[has_exit].str.strip().str.split('-').str.get(0)

    # Nothing to pair up: skip the merge entirely.
    if starts.empty or ends.empty:
        logging.debug('Paragraph Name Extraction ==> Completed')
        return {}

    start_df = pd.DataFrame({
        'st_index': starts.index,
        'st_par': starts.str.split('-').str.get(0),
        'name': starts,
    })
    end_df = pd.DataFrame({'ed_index': ends.index, 'ed_par': ends})

    # Left merge keeps the openers in source order; unmatched openers get
    # a NaN ed_index and are dropped right after.
    pairs = pd.merge(left=start_df, right=end_df,
                     left_on='st_par', right_on='ed_par', how='left')
    pairs = pairs[pd.notna(pairs['ed_index'])]

    paragraph_dict = {}
    for start, end, name in zip(pairs['st_index'], pairs['ed_index'], pairs['name']):
        # ed_index is float after the left merge (NaN-capable), hence int().
        paragraph_dict.update(dict.fromkeys(range(start, int(end) + 1), name))

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
Leave a Comment