# Paste-site metadata, not part of the program: "Untitled", unknown author, python, 2 years ago, 16 kB, 18, Indexable.
import pandas as pd
import numpy as np
# Flag columns written to ``big_df``, ordered from the weakest to the strongest
# match level; a match of depth ``d`` sets the first ``d`` of them.
_MATCH_FLAGS = (
    'word_match',
    'value_match',
    'dimension_match',
    'quantity_match',
    'period_match',
)

# Multipliers tried when comparing an extracted value with the target value.
_VALUE_SCALES = (1, 10, 100, 1000)

# Columns appended to the per-target-row report, in output order.
_REPORT_COLUMNS = (
    'priority',
    'found_on_page',
    'found_keyword',
    'found_id',
    'found_value',
    'found_dimension',
    'found_quantity',
    'found_period',
    'all_keywords',
    'all_values',
    'all_dimensions',
    'all_quantities',
    'all_periods',
    'all_tables',
    'all_chapters',
)

# Temporary column carrying each target row's original position through groupby.
_ROW_POSITION = '__target_row_position__'


def _value_mask(extracted, value):
    """Return a boolean mask of extracted values equal to a scaled target value.

    For every scale in ``_VALUE_SCALES`` a cell matches when it equals
    ``value * scale`` or ``-abs(value) * scale``.
    """
    hits = pd.Series(False, index=extracted.index)
    for scale in _VALUE_SCALES:
        hits = hits | (extracted == value * scale) | (extracted == -abs(value) * scale)
    return hits


def _match_cascade(candidates, value, dimension, quantity, period):
    """Return the successive non-empty filter stages for one target row.

    Stages are, in order: id candidates, value match, dimension match,
    quantity match, period match. Filtering stops at the first stage that
    yields no rows, so ``len(result)`` is the match depth (0..5) and
    ``result[-1]`` is the deepest non-empty selection.
    """
    stages = []
    if len(candidates) == 0:
        return stages
    stages.append(candidates)

    by_value = candidates.loc[_value_mask(candidates['value'], value)]
    if len(by_value) == 0:
        return stages
    stages.append(by_value)

    # One row per standardized dimension; exploded rows keep their original
    # index label, which is what the flagging step relies on.
    by_dimension = by_value.explode('standartized_dimension')
    by_dimension = by_dimension.loc[by_dimension['standartized_dimension'].isin(dimension)]
    if len(by_dimension) == 0:
        return stages
    stages.append(by_dimension)

    by_quantity = by_dimension.loc[by_dimension['standartized_quantity'] == quantity]
    if len(by_quantity) == 0:
        return stages
    stages.append(by_quantity)

    by_period = by_quantity.loc[by_quantity['year'] == period]
    if len(by_period) == 0:
        return stages
    stages.append(by_period)
    return stages


def _top_priority_rows(selection):
    """Return the rows of ``selection`` whose priority equals its maximum."""
    return selection[selection['priority'] == selection['priority'].max()]


def _flag_rows(frame, labels, flags):
    """Set every column in ``flags`` to 1 for each index label in ``labels``."""
    for label in labels:
        for flag in flags:
            frame.at[label, flag] = 1


def result_metrics(target: pd.DataFrame, big_df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Match every target row against the extracted entities in ``big_df``.

    Target rows are processed per ``(doc_name, industry)`` group. For each
    target row the extracted rows of the same document with
    ``id == standart_id`` that have not been flagged yet are narrowed step by
    step: value, standardized dimension, standardized quantity, period. The
    deepest non-empty step determines what is reported for the row, and the
    first highest-priority row of that step is flagged in ``big_df``.

    Args:
        target: Expected entities. Columns read: ``doc_name``, ``industry``,
            ``page``, ``preprocessed_value``, ``standartized_dimension``
            (list-like per row), ``standartized_quantity``, ``only_year``
            (string, optionally ``'YYYY-YYYY'``) and ``standart_id``.
        big_df: Extracted entities. Columns read: ``doc_name``, ``industry``,
            ``id``, ``year`` (string), ``page``, ``keyword``, ``value``,
            ``dimension``, ``quantity``, ``priority``,
            ``standartized_dimension``, ``standartized_quantity``,
            ``table_title_raw`` and ``chapter_name``.
            NOTE: this frame is modified in place; the ``*_match``,
            ``valid_id`` and ``valid_year`` columns are (re)created on it.

    Returns:
        A ``(stat_res, big_df)`` tuple. ``stat_res`` is a copy of ``target``
        with the columns of ``_REPORT_COLUMNS`` added; entries stay ``None``
        where nothing matched at that level. The second item is a copy of the
        rows of ``big_df`` whose ``valid_year`` is 1.

    Results are written back by the original row position of ``target``, so
    they stay aligned even when rows of different documents are interleaved.
    Target rows that ``groupby`` drops (NaN ``doc_name``/``industry``) keep
    ``None`` in every report column.
    """
    n_rows = len(target)
    grouped = (
        target.assign(**{_ROW_POSITION: np.arange(n_rows)})
        .groupby(['doc_name', 'industry'], as_index=False, sort=False)
        .agg(list)
    )
    report = {column: [None] * n_rows for column in _REPORT_COLUMNS}

    for flag in _MATCH_FLAGS + ('valid_id', 'valid_year'):
        big_df[flag] = 0

    group_fields = zip(
        grouped['doc_name'],
        grouped['industry'],
        grouped[_ROW_POSITION],
        grouped['page'],
        grouped['preprocessed_value'],
        grouped['standartized_dimension'],
        grouped['standartized_quantity'],
        grouped['only_year'],
        grouped['standart_id'],
    )
    for docname, industry, positions, pages, values, dimensions, quantities, periods, ids in group_fields:
        in_doc = (big_df['doc_name'] == docname) & (big_df['industry'] == industry)
        # Private working copy: match flags are mirrored here so later target
        # rows of the same document skip rows that were already claimed.
        selection = big_df.loc[in_doc].copy()

        # Rows whose id is expected for this document.
        big_df.loc[in_doc & big_df['id'].isin(ids), 'valid_id'] = 1

        # Rows with an unexpected id are year-valid when their year string
        # contains the latest expected year of the document.
        latest_year = max(
            int(part) for period in set(periods) for part in period.split('-')
        )
        unexpected = (
            in_doc
            & (big_df['valid_id'] == 0)
            & big_df['year'].str.contains(str(latest_year), na=False)
        )
        big_df.loc[unexpected, 'valid_year'] = 1

        # Rows with an expected id are year-valid only when their year equals
        # the period recorded for that id in the target.
        year_by_id = dict(zip(ids, periods))
        expected = in_doc & (big_df['valid_id'] == 1)
        if expected.any():
            big_df.loc[expected, 'valid_year'] = [
                int(year_by_id[row_id] == row_year)
                for row_id, row_year in zip(
                    big_df.loc[expected, 'id'], big_df.loc[expected, 'year']
                )
            ]

        row_fields = zip(positions, pages, values, dimensions, quantities, periods, ids)
        for position, page, value, dimension, quantity, period, standart_id in row_fields:
            # Everything extracted from the target's page, for inspection.
            on_page = selection[selection['page'] == page]
            report['all_keywords'][position] = on_page['keyword'].to_list()
            report['all_values'][position] = on_page['value'].to_list()
            report['all_dimensions'][position] = on_page['dimension'].to_list()
            report['all_quantities'][position] = on_page['quantity'].to_list()
            report['all_periods'][position] = on_page['year'].to_list()
            report['all_tables'][position] = on_page['table_title_raw'].unique()
            report['all_chapters'][position] = on_page['chapter_name'].unique()

            # Candidates: same id and not yet flagged by an earlier target row.
            unclaimed = selection['id'] == standart_id
            for flag in _MATCH_FLAGS:
                unclaimed &= selection[flag] != 1
            stages = _match_cascade(
                selection.loc[unclaimed], value, dimension, quantity, period
            )
            depth = len(stages)
            if depth == 0:
                continue  # nothing found: every found_* entry stays None

            deepest = stages[-1]
            top = _top_priority_rows(deepest)
            first_per_id = top.drop_duplicates(subset=['id'])
            report['found_keyword'][position] = ''.join(first_per_id['keyword'].unique())
            report['found_on_page'][position] = ''.join(
                first_per_id['page'].astype(str).unique()
            )
            report['priority'][position] = deepest['priority'].max()

            if depth >= 2:
                report['found_value'][position] = ''.join(
                    stages[1]['value'].astype(str).unique()
                )
            if depth >= 3:
                report['found_dimension'][position] = ''.join(
                    stages[2]['dimension'].fillna('empty').unique()
                )
            if depth >= 4:
                report['found_quantity'][position] = ''.join(stages[3]['quantity'].unique())
            if depth == 5:
                report['found_period'][position] = ''.join(stages[4]['year'].unique())
                # The id is reported only for a complete match.
                report['found_id'][position] = ''.join(
                    stages[4]['id'].astype(str).unique()
                )

            # Flag only the first highest-priority row of the deepest stage,
            # with one flag per matched level.
            winner = top.index[:1]
            flags = _MATCH_FLAGS[:depth]
            _flag_rows(big_df, winner, flags)
            _flag_rows(selection, winner, flags)

    stat_res = target.copy()
    for column in _REPORT_COLUMNS:
        stat_res[column] = report[column]
    return stat_res, big_df[big_df['valid_year'] == 1].copy()
# Will have to be moved to main:
# # Validation
# print('matches by target')
# print(len(stat_res[~pd.isna(stat_res['found_keyword'])]))
# print(len(stat_res[~pd.isna(stat_res['found_value'])]))
# print(len(stat_res[~pd.isna(stat_res['found_dimension'])]))
# print(len(stat_res[~pd.isna(stat_res['found_quantity'])]))
# print(len(stat_res[~pd.isna(stat_res['found_period'])]))
# print('matches by result')
# print(len(big_df[big_df['word_match'] == 1]))
# print(len(big_df[big_df['value_match'] == 1]))
# print(len(big_df[big_df['dimension_match'] == 1]))
# print(len(big_df[big_df['quantity_match'] == 1]))
# print(len(big_df[big_df['period_match'] == 1]))