Untitled
unknown
python
a year ago
12 kB
2
Indexable
from string import punctuation
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from data_preprocessing import data_initialization_preprocessing

# Punctuation characters stripped from keywords before they are compared;
# ',' and '/' are deliberately kept.
not_allowed_keyword = [i for i in list(punctuation) if i not in [',', '/']]

# Punctuation removed from a raw value before numeric parsing.
# ',', '.', '(' and '-' survive because remove_comma_alpha() interprets them.
_VALUE_STRIP_CHARS = frozenset(punctuation) - {',', '.', '(', '-'}

# Punctuation whose presence in a raw value triggers the clean-up pass.
_VALUE_SUSPICIOUS_CHARS = frozenset(punctuation) - {'.', '-'}

# Prefix prepended to the tail of the JSON path when deriving the PDF path.
_PDF_ROOT = '1_data/reports/package_3_reports_fin_table'


def remove_comma_alpha(s):
    """Strip letters and stray punctuation from *s* and normalise commas.

    The argument is converted with ``str()``. Alphabetic characters and all
    punctuation except ``, . ( -`` are dropped, and ``(`` is rewritten to
    ``-``. Each comma is then resolved from left to right: when three or more
    characters follow it, it is removed; when two or fewer follow, it is
    replaced by ``.``.

    Returns the cleaned string; it is not guaranteed to be parseable as a
    number (whitespace, for instance, is left untouched).
    """
    s = "".join(
        ch for ch in str(s)
        if not ch.isalpha() and ch not in _VALUE_STRIP_CHARS
    )
    s = s.replace("(", "-")
    while "," in s:
        pos = s.find(",")
        tail = s[pos + 1:]
        # >= 3 trailing characters: drop the comma; otherwise turn it into '.'.
        separator = "" if len(tail) >= 3 else "."
        s = s[:pos] + separator + tail
    return s


def _find_nth(haystack, needle, n):
    """Return the index of the *n*-th occurrence of *needle*, or -1."""
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start


def _normalize_keyword(text):
    """Lower-case *text*, drop spaces and every char in not_allowed_keyword."""
    return ''.join(
        ch for ch in text.lower().replace(' ', '')
        if ch not in not_allowed_keyword
    )


def _is_blank(raw):
    """Return True for NaN/None, the empty string or a single space."""
    return pd.isna(raw) or raw == '' or raw == ' '


def _clean_label(raw, default):
    """Return *default* for a blank cell, else the lower-cased, stripped text."""
    if _is_blank(raw):
        return default
    return raw.lower().strip()


def _parse_value(value):
    """Convert a raw cell to ``float``; return None when that is impossible."""
    if pd.isna(value):
        return None
    text = str(value)
    if any(ch.isalpha() or ch in _VALUE_SUSPICIOUS_CHARS for ch in text):
        value = remove_comma_alpha(value)
    elif not any(ch.isdigit() for ch in text):
        return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Unparseable value: treated the same as a missing one.
        return None


def _build_entity_index(sequences):
    """Index priority-2 entities as [seq_key][keyword][value][unit][scaler] -> periods."""
    index = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(list)))))
    for seq_key in sequences:
        pages = sequences[seq_key]['pages']
        for page in pages:
            tables = pages[page]['tables']['id']
            for table in tables:
                for entity in tables[table]['entities']:
                    if entity['priority'] != 2:
                        continue
                    keyword = _normalize_keyword(entity['iclass_txt'])
                    index[seq_key][keyword][entity['ivalue_txt']][
                        entity['unit1_txt']][entity['scaler1_txt']].append(
                            entity['iperiod_txt'])
    return index


def _match_value(value, candidates):
    """Return ``(found, matched_value)`` for *value* against *candidates*.

    For a non-zero value the variants ``-abs(value)``, ``-abs(value) * 10``
    and ``value * 10`` are tried, in that order, before the value itself.
    """
    if value is None:
        return None in candidates, None
    if value:
        for variant in (-abs(value), -abs(value) * 10, value * 10):
            if variant in candidates:
                return True, variant
    return value in candidates, value


def _match_row(doc_entities, keyword, value, dimension, quantity, period,
               standart_dimensions, standart_quantities):
    """Match one row against one document's entity index.

    Returns a 5-tuple ``(keyword, value, dimension, quantity, period)``.
    Each slot holds the matched item, ``'empty'`` when the match succeeded on
    a missing/falsy item, or ``None`` when that level (and therefore every
    deeper level) did not match.
    """
    if keyword not in doc_entities:
        return None, None, None, None, None
    by_value = doc_entities[keyword]

    value_found, value = _match_value(value, by_value)
    if not value_found:
        return keyword, None, None, None, None
    found_value = 'empty' if value is None else value
    by_dimension = by_value[value]

    # Adopt the extracted spelling of the dimension when the two spellings
    # share at least one entry in standart_dimensions.
    for candidate in by_dimension:
        if set(standart_dimensions[dimension]) & set(standart_dimensions[candidate]):
            dimension = candidate
    if dimension not in by_dimension:
        return keyword, found_value, None, None, None
    found_dimension = dimension if dimension else 'empty'
    by_quantity = by_dimension[dimension]

    # Same idea for the quantity: adopt the extracted spelling when both
    # map to the same entry in standart_quantities.
    for candidate in by_quantity:
        try:
            same_quantity = standart_quantities[quantity] == standart_quantities[candidate]
        except KeyError:
            # Unknown spelling: cannot be an alias, so skip this candidate.
            continue
        if same_quantity:
            quantity = candidate
    if quantity not in by_quantity:
        return keyword, found_value, found_dimension, None, None
    found_quantity = quantity if quantity else 'empty'

    if period in by_quantity[quantity]:
        # A matched missing period is reported as 'empty', like other levels.
        found_period = 'empty' if period is None else period
    else:
        found_period = None
    return keyword, found_value, found_dimension, found_quantity, found_period


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place, standart_dimensions, standart_quantities):
    """Check which target rows are present among the extracted entities.

    ``data_initialization_preprocessing`` is called with the first four
    arguments and yields ``result_df``, ``sequences`` and a mapping from
    ``doc_name + industry`` to a JSON path. Every priority-2 entity found in
    ``sequences`` is indexed, and each row of ``result_df`` is then matched
    level by level: keyword, value, dimension, quantity, period.

    Args:
        targetPath, predJsonsPath, block, place: forwarded unchanged to
            ``data_initialization_preprocessing``.
        standart_dimensions: mapping from a dimension name to a list of
            equivalent names. Mutated in place: the ``None`` and
            ``'not_exist'`` entries are set and a few aliases are added.
            The keys ``'s'``, ``'azerbajjani manats'`` and
            ``'azerbajani manats'`` are indexed directly, so they must exist
            (or the mapping must supply defaults).
        standart_quantities: mapping from a quantity name to its canonical
            form. Mutated in place (``None``, ``'milion'``, ``'1000'``).

    Returns:
        ``result_df`` with the columns ``keyword_inter``, ``found_value``,
        ``found_dimension``, ``found_quantity``, ``found_period``,
        ``file_path_json``, ``file_path_pdf`` and ``company_name`` added and
        ``doc_name`` overwritten with the PDF file name. In the ``found_*``
        columns ``None`` means "no match" and ``'empty'`` means "matched on a
        missing item".
    """
    standart_dimensions[None] = [None]
    standart_dimensions['not_exist'] = [None]
    # Idempotent so that repeated calls do not keep growing the caller's lists.
    for name, alias in (('s', 'usd'),
                        ('azerbajjani manats', 'azn'),
                        ('azerbajani manats', 'azn')):
        if alias not in standart_dimensions[name]:
            standart_dimensions[name].append(alias)
    standart_quantities[None] = None
    standart_quantities['milion'] = 'миллион'
    standart_quantities['1000'] = 'тысячи'

    result_df, sequences, filePathDictJSON = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)
    print('yes')

    entities_dict = _build_entity_index(sequences)

    wordInter = []
    wordNumInter = []
    wordNumDimInter = []
    wordNumDimQuantityInter = []
    wordNumDimQuantityPeriodInter = []

    for docname, industry, keyword, value, dimension, quantity, period in zip(
            result_df['doc_name'], result_df['industry'],
            result_df['keyword_primary'], result_df['value'],
            result_df['only_dimension'], result_df['only_quantity'],
            result_df['normalized_period']):
        matched = _match_row(
            entities_dict[docname + industry],
            _normalize_keyword(str(keyword)),
            _parse_value(value),
            _clean_label(dimension, 'empty'),
            _clean_label(quantity, None),
            None if _is_blank(period) else str(period),
            standart_dimensions,
            standart_quantities,
        )
        wordInter.append(matched[0])
        wordNumInter.append(matched[1])
        wordNumDimInter.append(matched[2])
        wordNumDimQuantityInter.append(matched[3])
        wordNumDimQuantityPeriodInter.append(matched[4])

    result_df["keyword_inter"] = wordInter
    result_df["found_value"] = wordNumInter
    result_df["found_dimension"] = wordNumDimInter
    result_df["found_quantity"] = wordNumDimQuantityInter
    result_df["found_period"] = wordNumDimQuantityPeriodInter

    result_df['file_path_json'] = (result_df['doc_name'] + result_df['industry']).map(filePathDictJSON)
    # The PDF path reuses the JSON path from its third '/' onwards.
    result_df['file_path_pdf'] = result_df['file_path_json'].apply(
        lambda x: _PDF_ROOT + x[_find_nth(x, '/', 3):].replace('.json', '.pdf'))
    result_df['doc_name'] = result_df['file_path_pdf'].apply(lambda x: x[x.rfind('/') + 1:])
    result_df['company_name'] = result_df['file_path_pdf'].apply(lambda x: x.split('/')[4])
    print('yes2')
    return result_df
Editor is loading...