from string import punctuation
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from data_preprocessing import data_initialization_preprocessing
# ASCII punctuation that is stripped from keywords before they are compared;
# ',' and '/' are the only punctuation characters left in place.
not_allowed_keyword = [symbol for symbol in punctuation if symbol not in (',', '/')]
def remove_comma_alpha(s):
    """Reduce *s* to a numeric-looking string.

    The argument is converted with ``str``; letters and all punctuation
    except ``,``, ``.``, ``(`` and ``-`` are dropped, and ``(`` becomes ``-``.
    Each comma is then resolved left to right: when at least three characters
    follow it the comma is deleted, otherwise it is replaced by ``.``.

    Whitespace and digits are left untouched. The cleaned string is returned;
    no conversion to a number happens here.
    """
    kept_punctuation = {',', '.', '(', '-'}
    dropped_punctuation = set(punctuation) - kept_punctuation

    cleaned = "".join(
        ch for ch in str(s)
        if not ch.isalpha() and ch not in dropped_punctuation
    )
    cleaned = cleaned.replace("(", "-")

    comma_at = cleaned.find(",")
    while comma_at != -1:
        head = cleaned[:comma_at]
        tail = cleaned[comma_at + 1:]
        # Three or more trailing characters: delete the comma.
        # Fewer than three: turn the comma into a dot.
        cleaned = head + tail if len(tail) >= 3 else head + "." + tail
        comma_at = cleaned.find(",")
    return cleaned
def _normalise_keyword(text):
    """Lower-case *text* and drop spaces and every character listed in ``not_allowed_keyword``."""
    return ''.join(ch for ch in text.lower().replace(' ', '') if ch not in not_allowed_keyword)


def _is_blank(cell):
    """Return a truthy value for NaN/None cells and for the strings ``''`` and ``' '``."""
    return pd.isna(cell) or cell == '' or cell == ' '


def _parse_value(raw, special_chars_exception):
    """Convert a raw ``value`` cell to ``float``; return ``None`` when that is impossible.

    Cells containing letters or any character of *special_chars_exception*
    are first cleaned with ``remove_comma_alpha``. Cells without a single
    digit are rejected without attempting a conversion.
    """
    if pd.isna(raw):
        return None
    text = str(raw)
    if any(ch.isalpha() or ch in special_chars_exception for ch in text):
        raw = remove_comma_alpha(raw)
    elif not any(ch.isdigit() for ch in text):
        return None
    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        # Not a parsable number: treated as a missing value.
        return None


def _find_nth(haystack, needle, n):
    """Return the index of the *n*-th (1-based) occurrence of *needle* in *haystack*, or -1."""
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start


def _match_entity(doc_entities, keyword, value, dimension, quantity, period,
                  standart_dimensions, standart_quantities):
    """Match one target row against the predicted entities of its document.

    *doc_entities* is the nested mapping
    ``keyword -> value -> unit -> scaler -> [periods]``.

    Returns a 5-tuple ``(keyword, value, dimension, quantity, period)``.
    Matching is hierarchical: as soon as one level fails, that position and
    all following ones are ``None``. A level that matched on a falsy item
    (e.g. ``None``) is reported as the string ``'empty'``.
    """
    if keyword not in doc_entities:
        return None, None, None, None, None
    by_value = doc_entities[keyword]

    # --- value ----------------------------------------------------------
    if value is None:
        if None not in by_value:
            return keyword, None, None, None, None
        found_value = 'empty'
    else:
        # For a non-zero value also accept the negated value, the negated
        # value times ten and the value times ten, in that priority order.
        # These variants take precedence over the exact value.
        variants = (-abs(value), -abs(value) * 10, value * 10) if value else ()
        for variant in variants:
            if variant in by_value:
                value = variant
                break
        else:
            if value not in by_value:
                return keyword, None, None, None, None
        found_value = value

    # --- dimension (unit) -----------------------------------------------
    by_dimension = by_value[value]
    for unit in by_dimension:
        # Adopt the predicted spelling when both spellings share at least one
        # entry in standart_dimensions. The comparison uses the current
        # `dimension`, so an earlier adoption affects later iterations.
        if set(standart_dimensions[dimension]) & set(standart_dimensions[unit]):
            dimension = unit
    if dimension not in by_dimension:
        return keyword, found_value, None, None, None
    found_dimension = dimension if dimension else 'empty'

    # --- quantity (scaler) ----------------------------------------------
    by_quantity = by_dimension[dimension]
    for scaler in by_quantity:
        try:
            same_scale = standart_quantities[quantity] == standart_quantities[scaler]
        except KeyError:
            # A spelling missing from standart_quantities cannot be
            # normalised; treat it as "no match" and keep scanning.
            continue
        if same_scale:
            quantity = scaler
    if quantity not in by_quantity:
        return keyword, found_value, found_dimension, None, None
    found_quantity = quantity if quantity else 'empty'

    # --- period ---------------------------------------------------------
    if period in by_quantity[quantity]:
        # A matched missing period is reported as 'empty', consistent with
        # the dimension and quantity positions.
        found_period = 'empty' if period is None else period
    else:
        found_period = None
    return keyword, found_value, found_dimension, found_quantity, found_period


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place, standart_dimensions, standart_quantities):
    """Compare target rows with predicted entities and record what was found.

    The target table and the predictions are loaded through
    ``data_initialization_preprocessing(targetPath, predJsonsPath, block, place)``.
    Predicted entities with ``priority == 2`` are indexed per document
    (``doc_name + industry``) as
    ``keyword -> value -> unit -> scaler -> [periods]``; every target row is
    then matched level by level against that index.

    Parameters
    ----------
    targetPath, predJsonsPath, block, place
        Forwarded unchanged to ``data_initialization_preprocessing``.
    standart_dimensions
        Mapping from a unit spelling to a list of spellings; two units are
        considered the same when their lists intersect. **Mutated in place**:
        the keys ``None`` and ``'not_exist'`` are set, and the aliases
        ``'usd'`` / ``'azn'`` are added to the existing entries ``'s'``,
        ``'azerbajjani manats'`` and ``'azerbajani manats'``.
    standart_quantities
        Mapping from a scaler spelling to a comparable normal form.
        **Mutated in place**: the keys ``None``, ``'milion'`` and ``'1000'``
        are set.

    Returns
    -------
    The data frame returned by ``data_initialization_preprocessing`` with the
    columns ``keyword_inter``, ``found_value``, ``found_dimension``,
    ``found_quantity``, ``found_period``, ``file_path_json``,
    ``file_path_pdf`` and ``company_name`` added and ``doc_name`` overwritten
    with the PDF file name.

    Raises
    ------
    KeyError
        If *standart_dimensions* is a plain ``dict`` and lacks one of the
        three alias keys above, or lacks a unit spelling met while matching.
    """
    standart_dimensions[None] = [None]
    standart_dimensions['not_exist'] = [None]
    for unit_key, alias in (('s', 'usd'),
                            ('azerbajjani manats', 'azn'),
                            ('azerbajani manats', 'azn')):
        # Guarded so that repeated calls do not keep growing the caller's lists.
        if alias not in standart_dimensions[unit_key]:
            standart_dimensions[unit_key].append(alias)
    standart_quantities[None] = None
    standart_quantities['milion'] = 'миллион'
    standart_quantities['1000'] = 'тысячи'

    result_df, sequences, filePathDictJSON = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)
    print('yes')  # progress marker on stdout

    # Any punctuation other than '.' and '-' marks a value as needing cleanup.
    special_chars_exception = set(punctuation) - {'.', '-'}

    # seq_key -> keyword -> value -> unit -> scaler -> [periods]
    entities_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))))
    for seq_key, document in sequences.items():
        for page_content in document['pages'].values():
            for table in page_content['tables']['id'].values():
                for entity in table['entities']:
                    if entity['priority'] != 2:
                        continue
                    entity_keyword = _normalise_keyword(entity['iclass_txt'])
                    entities_dict[seq_key][entity_keyword][entity['ivalue_txt']][entity['unit1_txt']][entity['scaler1_txt']].append(entity['iperiod_txt'])

    wordInter = []
    wordNumInter = []
    wordNumDimInter = []
    wordNumDimQuantityInter = []
    wordNumDimQuantityPeriodInter = []

    rows = zip(result_df['doc_name'], result_df['industry'], result_df['keyword_primary'],
               result_df['value'], result_df['only_dimension'], result_df['only_quantity'],
               result_df['normalized_period'])
    for docname, industry, keyword, value, dimension, quantity, period in rows:
        keyword = _normalise_keyword(str(keyword))
        value = _parse_value(value, special_chars_exception)
        dimension = 'empty' if _is_blank(dimension) else dimension.lower().strip()
        quantity = None if _is_blank(quantity) else quantity.lower().strip()
        period = None if _is_blank(period) else str(period)

        seq_key = docname + industry
        found = _match_entity(entities_dict[seq_key], keyword, value, dimension, quantity,
                              period, standart_dimensions, standart_quantities)

        wordInter.append(found[0])
        wordNumInter.append(found[1])
        wordNumDimInter.append(found[2])
        wordNumDimQuantityInter.append(found[3])
        wordNumDimQuantityPeriodInter.append(found[4])

    result_df["keyword_inter"] = wordInter
    result_df["found_value"] = wordNumInter
    result_df["found_dimension"] = wordNumDimInter
    result_df["found_quantity"] = wordNumDimQuantityInter
    result_df["found_period"] = wordNumDimQuantityPeriodInter

    result_df['file_path_json'] = (result_df['doc_name'] + result_df['industry']).map(filePathDictJSON)
    # Everything from the third '/' of the JSON path onwards is re-rooted
    # under the PDF reports directory, with the extension swapped.
    result_df['file_path_pdf'] = result_df['file_path_json'].apply(lambda x: '1_data/reports/package_3_reports_fin_table' + x[_find_nth(x, '/', 3):].replace('.json', '.pdf'))
    result_df['doc_name'] = result_df['file_path_pdf'].apply(lambda x: x[x.rfind('/') + 1:])
    # Fifth path component of the PDF path.
    result_df['company_name'] = result_df['file_path_pdf'].apply(lambda x: x.split('/')[4])
    print('yes2')  # progress marker on stdout
    return result_df