Untitled
unknown
plain_text
25 days ago
1.3 kB
4
Indexable
Never
def get_paragraph_idx(self, data: list) -> dict:
    """
    Map statement sequence numbers to the paragraph header that covers them.

    A line counts as a paragraph header when it starts with a word character
    in the first column and ends with a period, optionally followed by a
    trailing ``SKIP1`` directive and/or whitespace. Headers containing
    ``EXIT`` are treated as paragraph ends; every other header is a
    paragraph start. A start is paired with each EXIT header that shares its
    prefix (the text before the first ``-``), and every position from the
    start through the EXIT header, inclusive, is mapped to the start line.
    Starts without a matching EXIT header are ignored. When ranges overlap,
    the start that appears later in ``data`` wins.

    Key   : seq number (position of the statement in ``data``)
    Value : paragraph name (the whitespace-stripped start header line)

    Parameters:
        data : List : List of statements

    Returns:
        Dict : Dictionary of seq no as keys and paragraph name as values
    """
    logging.debug('Paragraph Name Extraction ==> Started')

    # An explicit object dtype keeps the .str accessor usable when ``data``
    # is empty (older pandas infers float64 for an empty list).
    lines = pd.Series(data, dtype=object)

    # Remove an optional trailing SKIP1 directive before testing for a header.
    # str.rstrip('SKIP1') is not used here: it strips any trailing run of the
    # characters S, K, I, P, 1 rather than the literal suffix, and it misses
    # the directive entirely when the line is padded with trailing spaces.
    # NOTE(review): only SKIP1 is recognised, as before - confirm whether
    # other SKIPn directives can occur in the input.
    cleaned = (
        lines.str.rstrip()
        .str.replace(r'SKIP1$', '', regex=True)
        .str.rstrip()
    )
    is_header = pd.notna(cleaned.str.extract(pat=r'(^\w).*\.$', expand=False))
    headers = lines[is_header].str.strip()

    # astype(bool) guarantees a boolean mask even when no header was found.
    is_exit = headers.str.contains('EXIT', na=False).astype(bool)
    start_lines = headers[~is_exit]
    exit_lines = headers[is_exit]

    starts = pd.DataFrame({
        'st_index': start_lines.index,
        'st_par': start_lines.str.split('-').str.get(0),
        'name': start_lines,
    })
    exits = pd.DataFrame({
        'ed_index': exit_lines.index,
        'ed_par': exit_lines.str.split('-').str.get(0),
    })

    # Left join keeps the starts in source order; rows whose prefix has no
    # EXIT header come back with a missing ed_index and are dropped.
    spans = pd.merge(left=starts, right=exits,
                     left_on='st_par', right_on='ed_par', how='left')
    spans = spans[spans['ed_index'].notna()]

    paragraph_dict = {}
    for start, end, name in zip(spans['st_index'], spans['ed_index'], spans['name']):
        # ed_index may be float after the left join, hence the int() casts.
        for seq in range(int(start), int(end) + 1):
            paragraph_dict[seq] = name

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
Leave a Comment