Untitled
unknown
plain_text
a year ago
1.3 kB
11
Indexable
def get_paragraph_idx(self, data : list) -> dict:
    """
    Map every statement position inside a paragraph to that paragraph's header.

    A header is a line that starts with a word character in its first
    column and ends with a period, optionally followed by a trailing
    ``SKIP1`` token and/or whitespace. Headers containing ``EXIT`` close
    a paragraph; all other headers open one. An opening header is paired
    with a closing header when the text before their first ``-`` is the same.

    Parameter:
        data : List : List of statements
    Returns:
        Dict : Dictionary keyed by seq no (position in ``data``), covering
               every position from the opening header through the matching
               EXIT header inclusive. The value is the whitespace-stripped
               opening header line exactly as it appears in ``data``
               (trailing period, and any ``SKIP1`` token, included).
               Opening headers without a matching EXIT header are omitted.
               When ranges overlap, the later opening header wins.
    """
    logging.debug('Paragraph Name Extraction ==> Started')
    paragraph_dict = {}

    # Guard: an empty Series has no string dtype, so the `.str` accessor
    # can raise on some pandas versions. There is nothing to map anyway.
    if data:
        lines = pd.Series(data, dtype=object)

        # Strip trailing whitespace FIRST, then remove the literal suffix
        # "SKIP1" (with any blanks before it). `str.rstrip('SKIP1')` would
        # instead strip any trailing run of the characters S/K/I/P/1 and
        # would miss the token entirely when blanks follow it.
        trimmed = lines.str.rstrip().str.replace(r'\s*SKIP1$', '', regex=True)

        # Header test: word character in column 0 and a period at the end.
        # Missing values are treated as "not a header".
        is_header = trimmed.str.contains(r'^\w.*\.$', regex=True, na=False)
        headers = lines[is_header].str.strip()

        is_exit = headers.str.contains('EXIT', regex=False, na=False)
        openers = headers[~is_exit]
        closers = headers[is_exit]

        st = pd.DataFrame({
            'st_index': openers.index,
            'st_par': openers.str.split('-').str.get(0),
            'name': openers,
        })
        ed = pd.DataFrame({
            'ed_index': closers.index,
            'ed_par': closers.str.split('-').str.get(0),
        })

        # A left merge keeps the opening headers in source order, so a later
        # paragraph overwrites an earlier one where their ranges overlap.
        spans = pd.merge(left=st, right=ed, left_on='st_par',
                         right_on='ed_par', how='left')
        spans = spans[pd.notna(spans['ed_index'])]

        # ed_index becomes float when any opener is unmatched, hence int().
        for start, end, name in zip(spans['st_index'], spans['ed_index'],
                                    spans['name']):
            for seq in range(int(start), int(end) + 1):
                paragraph_dict[seq] = name

    logging.debug('Paragraph Name Extraction ==> Completed')
    return paragraph_dict
Leave a Comment