import pandas as pd
import numpy as np


def result_metrics(target, big_df):
    """Match target facts against extracted rows in big_df.

    For every target row the match is checked as a cascade:
    id/keyword -> value -> dimension -> quantity -> period.
    Returns per-target match statistics and big_df filtered to rows
    with a valid year.
    """
    print('bbbbbbbbbbbbbbbbbbbbb')
    target_grouped = target.groupby(['doc_name', 'industry'], as_index=False, sort=False).agg(list)

    word_inter = []
    found_id = []
    found_on_page = []
    value_inter = []
    dimension_inter = []
    quantity_inter = []
    period_inter = []
    all_keywords = []
    all_values = []
    all_dimensions = []
    all_quantities = []
    all_periods = []
    all_tables = []
    all_chapters = []
    priorities = []

    big_df['word_match'] = 0
    big_df['value_match'] = 0
    big_df['dimension_match'] = 0
    big_df['quantity_match'] = 0
    big_df['period_match'] = 0
    big_df['valid_id'] = 0
    big_df['valid_year'] = 0

    for docname, industry, pages, keywords, values, dimensions, quantities, periods, ids in zip(
            target_grouped['doc_name'], target_grouped['industry'], target_grouped['page'],
            target_grouped['preprocessed_keyword_primary'], target_grouped['preprocessed_value'],
            target_grouped['standartized_dimension'], target_grouped['standartized_quantity'],
            target_grouped['only_year'], target_grouped['standart_id']):
        # all extracted rows for this document/industry pair
        # (.copy() avoids SettingWithCopyWarning on the .at writes below)
        res_current_selection = big_df.loc[(big_df['doc_name'] == docname)
                                           & (big_df['industry'] == industry)].copy()

        # check valid id
        big_df.loc[((big_df['doc_name'] == docname) & (big_df['industry'] == industry))
                   & (big_df['id'].isin(ids)), 'valid_id'] = 1

        # check valid year
        def extract_year(some_period):  # helper, currently unused
            if '.' in some_period:
                periods = some_period.split('-')
                year = max([int(i.split('.')[-1]) for i in periods])
                return year
            elif '-' in some_period:
                periods = some_period.split('-')
                year = max([int(i) for i in periods])
                return year
            else:
                return int(some_period)

        valid_years = pd.Series([i.split('-') if '-' in i else i
                                 for i in set(periods)]).explode().astype(int).to_list()
        max_valid_year = max(valid_years)
        big_df.loc[((big_df['doc_name'] == docname) & (big_df['industry'] == industry))
                   & (big_df['valid_id'] == 0)
                   & (big_df['year'].str.contains(str(max_valid_year))), 'valid_year'] = 1

        valid_years_id_dict = dict(zip(ids, periods))

        def valid_year_id(data):
            if valid_years_id_dict[data['id']] == data['year']:
                return 1
            else:
                return 0

        big_df.loc[((big_df['doc_name'] == docname) & (big_df['industry'] == industry))
                   & (big_df['valid_id'] == 1), 'valid_year'] = big_df[
            ((big_df['doc_name'] == docname) & (big_df['industry'] == industry))
            & (big_df['valid_id'] == 1)][['id', 'year']].apply(valid_year_id, axis=1)

        for page, keyword, value, dimension, quantity, period, standart_id in zip(
                pages, keywords, values, dimensions, quantities, periods, ids):
            # everything extracted on the target page, for the reference columns
            all_entities_page = res_current_selection[res_current_selection['page'] == page]
            all_keywords.append(all_entities_page['keyword'].to_list())
            all_values.append(all_entities_page['value'].to_list())
            all_dimensions.append(all_entities_page['dimension'].to_list())
            all_quantities.append(all_entities_page['quantity'].to_list())
            all_periods.append(all_entities_page['year'].to_list())
            all_tables.append(all_entities_page['table_title_raw'].unique())
            all_chapters.append(all_entities_page['chapter_name'].unique())

            # candidate rows: same id, not yet matched (page filter disabled)
            word_inter_selection = res_current_selection.loc[
                # (res_current_selection['page'] == page) &
                (res_current_selection['id'] == standart_id)
                & (res_current_selection['word_match'] != 1)
                & (res_current_selection['value_match'] != 1)
                & (res_current_selection['dimension_match'] != 1)
                & (res_current_selection['quantity_match'] != 1)
                & (res_current_selection['period_match'] != 1)]

            if len(word_inter_selection) > 0:
                word_inter_max_prior_indexes = word_inter_selection[
                    word_inter_selection['priority'] == word_inter_selection['priority'].max()
                ].drop_duplicates(subset=['priority']).index.values

                # value match allows a sign flip and 10x / 100x / 1000x scale differences
                value_inter_selection = word_inter_selection.loc[
                    (word_inter_selection['value'] == value)
                    | (word_inter_selection['value'] == -abs(value))
                    | (word_inter_selection['value'] == value * 10)
                    | (word_inter_selection['value'] == -abs(value) * 10)
                    | (word_inter_selection['value'] == value * 100)
                    | (word_inter_selection['value'] == -abs(value) * 100)
                    | (word_inter_selection['value'] == value * 1000)
                    | (word_inter_selection['value'] == -abs(value) * 1000)].copy()

                if len(value_inter_selection) > 0:
                    value_inter_selection['value'] = value_inter_selection['value'].astype(str)
                    value_inter.append(''.join(value_inter_selection['value'].unique()))
                    value_inter_max_prior_indexes = value_inter_selection[
                        value_inter_selection['priority'] == value_inter_selection['priority'].max()
                    ].drop_duplicates(subset=['priority']).index.values

                    dimension_inter_selection = value_inter_selection.copy()
                    dimension_inter_selection = dimension_inter_selection.explode('standartized_dimension')
                    dimension_inter_selection = dimension_inter_selection.loc[
                        dimension_inter_selection['standartized_dimension'].isin(dimension)]

                    if len(dimension_inter_selection) > 0:
                        dimension_inter_selection['dimension'] = dimension_inter_selection['dimension'].fillna('empty')
                        dimension_inter.append(''.join(dimension_inter_selection['dimension'].unique()))
                        dim_inter_max_prior_indexes = dimension_inter_selection[
                            dimension_inter_selection['priority'] == dimension_inter_selection['priority'].max()
                        ].drop_duplicates(subset=['priority']).index.values

                        quantity_inter_selection = dimension_inter_selection.copy()
                        quantity_inter_selection = quantity_inter_selection.loc[
                            quantity_inter_selection['standartized_quantity'] == quantity]

                        if len(quantity_inter_selection) > 0:
                            quantity_inter.append(''.join(quantity_inter_selection['quantity'].unique()))
                            quan_inter_max_prior_indexes = quantity_inter_selection[
                                quantity_inter_selection['priority'] == quantity_inter_selection['priority'].max()
                            ].drop_duplicates(subset=['priority']).index.values

                            period_inter_selection = quantity_inter_selection.loc[
                                quantity_inter_selection['year'] == period]

                            if len(period_inter_selection) > 0:
                                # full match: keyword, value, dimension, quantity and period all agree
                                word_inter.append(''.join(period_inter_selection[
                                    period_inter_selection['priority'] == period_inter_selection['priority'].max()
                                ].drop_duplicates(subset=['id'])['keyword'].unique()))
                                found_on_page.append(''.join(period_inter_selection[
                                    period_inter_selection['priority'] == period_inter_selection['priority'].max()
                                ].drop_duplicates(subset=['id'])['page'].astype(str).unique()))
                                period_inter.append(''.join(period_inter_selection['year'].unique()))
                                found_id.append(''.join(period_inter_selection['id'].astype(str).unique()))
                                priorities.append(period_inter_selection['priority'].max())
                                full_inter_max_prior_indexes = period_inter_selection[
                                    period_inter_selection['priority'] == period_inter_selection['priority'].max()
                                ].drop_duplicates(subset=['priority']).index.values
                                for index in full_inter_max_prior_indexes:
                                    big_df.at[index, 'word_match'] = 1
                                    big_df.at[index, 'value_match'] = 1
                                    big_df.at[index, 'dimension_match'] = 1
                                    big_df.at[index, 'quantity_match'] = 1
                                    big_df.at[index, 'period_match'] = 1
                                    res_current_selection.at[index, 'word_match'] = 1
                                    res_current_selection.at[index, 'value_match'] = 1
                                    res_current_selection.at[index, 'dimension_match'] = 1
                                    res_current_selection.at[index, 'quantity_match'] = 1
                                    res_current_selection.at[index, 'period_match'] = 1
                            else:
                                # matched down to quantity, but the period differs
                                found_id.append(None)
                                found_on_page.append(''.join(quantity_inter_selection[
                                    quantity_inter_selection['priority'] == quantity_inter_selection['priority'].max()
                                ].drop_duplicates(subset=['id'])['page'].astype(str).unique()))
                                word_inter.append(''.join(quantity_inter_selection[
                                    quantity_inter_selection['priority'] == quantity_inter_selection['priority'].max()
                                ].drop_duplicates(subset=['id'])['keyword'].unique()))
                                period_inter.append(None)
                                priorities.append(quantity_inter_selection['priority'].max())
                                for index in quan_inter_max_prior_indexes:
                                    big_df.at[index, 'word_match'] = 1
                                    big_df.at[index, 'value_match'] = 1
                                    big_df.at[index, 'dimension_match'] = 1
                                    big_df.at[index, 'quantity_match'] = 1
                                    res_current_selection.at[index, 'word_match'] = 1
                                    res_current_selection.at[index, 'value_match'] = 1
                                    res_current_selection.at[index, 'dimension_match'] = 1
                                    res_current_selection.at[index, 'quantity_match'] = 1
                        else:
                            # matched down to dimension, but the quantity differs
                            found_id.append(None)
                            found_on_page.append(''.join(dimension_inter_selection[
                                dimension_inter_selection['priority'] == dimension_inter_selection['priority'].max()
                            ].drop_duplicates(subset=['id'])['page'].astype(str).unique()))
                            word_inter.append(''.join(dimension_inter_selection[
                                dimension_inter_selection['priority'] == dimension_inter_selection['priority'].max()
                            ].drop_duplicates(subset=['id'])['keyword'].unique()))
                            quantity_inter.append(None)
                            period_inter.append(None)
                            priorities.append(dimension_inter_selection['priority'].max())
                            for index in dim_inter_max_prior_indexes:
                                big_df.at[index, 'word_match'] = 1
                                big_df.at[index, 'value_match'] = 1
                                big_df.at[index, 'dimension_match'] = 1
                                res_current_selection.at[index, 'word_match'] = 1
                                res_current_selection.at[index, 'value_match'] = 1
                                res_current_selection.at[index, 'dimension_match'] = 1
                    else:
                        # matched keyword/id and value, but the dimension differs
                        found_id.append(None)
                        found_on_page.append(''.join(value_inter_selection[
                            value_inter_selection['priority'] == value_inter_selection['priority'].max()
                        ].drop_duplicates(subset=['id'])['page'].astype(str).unique()))
                        word_inter.append(''.join(value_inter_selection[
                            value_inter_selection['priority'] == value_inter_selection['priority'].max()
                        ].drop_duplicates(subset=['id'])['keyword'].unique()))
                        dimension_inter.append(None)
                        quantity_inter.append(None)
                        period_inter.append(None)
                        priorities.append(value_inter_selection['priority'].max())
                        for index in value_inter_max_prior_indexes:
                            big_df.at[index, 'word_match'] = 1
                            big_df.at[index, 'value_match'] = 1
                            res_current_selection.at[index, 'word_match'] = 1
                            res_current_selection.at[index, 'value_match'] = 1
                else:
                    # only the keyword/id matched; the value differs
                    found_id.append(None)
                    found_on_page.append(''.join(word_inter_selection[
                        word_inter_selection['priority'] == word_inter_selection['priority'].max()
                    ].drop_duplicates(subset=['id'])['page'].astype(str).unique()))
                    word_inter.append(''.join(word_inter_selection[
                        word_inter_selection['priority'] == word_inter_selection['priority'].max()
                    ].drop_duplicates(subset=['id'])['keyword'].unique()))
                    value_inter.append(None)
                    dimension_inter.append(None)
                    quantity_inter.append(None)
                    period_inter.append(None)
                    priorities.append(word_inter_selection['priority'].max())
                    for index in word_inter_max_prior_indexes:
                        big_df.at[index, 'word_match'] = 1
                        res_current_selection.at[index, 'word_match'] = 1
            else:
                # nothing found for this target row
                word_inter.append(None)
                found_on_page.append(None)
                found_id.append(None)
                priorities.append(None)
                value_inter.append(None)
                dimension_inter.append(None)
                quantity_inter.append(None)
                period_inter.append(None)

    stat_res = target.copy()
    stat_res['priority'] = priorities
    stat_res['found_on_page'] = found_on_page
    stat_res['found_keyword'] = word_inter
    stat_res['found_id'] = found_id
    stat_res['found_value'] = value_inter
    stat_res['found_dimension'] = dimension_inter
    stat_res['found_quantity'] = quantity_inter
    stat_res['found_period'] = period_inter
    stat_res['all_keywords'] = all_keywords
    stat_res['all_values'] = all_values
    stat_res['all_dimensions'] = all_dimensions
    stat_res['all_quantities'] = all_quantities
    stat_res['all_periods'] = all_periods
    stat_res['all_tables'] = all_tables
    stat_res['all_chapters'] = all_chapters

    # keep only extracted rows whose year was validated
    big_df = big_df[big_df['valid_year'] == 1].copy()

    return stat_res, big_df


# This block will have to be moved out into main
# # Validation
# print('match by target')
# print(len(stat_res[~pd.isna(stat_res['found_keyword'])]))
# print(len(stat_res[~pd.isna(stat_res['found_value'])]))
# print(len(stat_res[~pd.isna(stat_res['found_dimension'])]))
# print(len(stat_res[~pd.isna(stat_res['found_quantity'])]))
# print(len(stat_res[~pd.isna(stat_res['found_period'])]))
# print('match by result')
# print(len(big_df[big_df['word_match'] == 1]))
# print(len(big_df[big_df['value_match'] == 1]))
# print(len(big_df[big_df['dimension_match'] == 1]))
# print(len(big_df[big_df['quantity_match'] == 1]))
# print(len(big_df[big_df['period_match'] == 1]))
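

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module).
# A minimal, hedged example of how result_metrics might be called, assuming
# `target` and `big_df` are pandas DataFrames with the column names referenced
# above; the document name, id, values and units below are made up.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # one target fact: keyword "revenue", value 100 mln usd, year 2020,
    # expected under id 7 on page 3 of report_a.pdf
    target_example = pd.DataFrame({
        'doc_name': ['report_a.pdf'],
        'industry': ['oil'],
        'page': [3],
        'preprocessed_keyword_primary': ['revenue'],
        'preprocessed_value': [100],
        'standartized_dimension': [['usd']],
        'standartized_quantity': ['mln'],
        'only_year': ['2020'],
        'standart_id': [7],
    })

    # two extracted candidates: a full match on page 3 and one unrelated row
    big_df_example = pd.DataFrame({
        'doc_name': ['report_a.pdf', 'report_a.pdf'],
        'industry': ['oil', 'oil'],
        'id': [7, 9],
        'page': [3, 5],
        'keyword': ['revenue', 'costs'],
        'value': [100, 55],
        'dimension': ['usd', 'usd'],
        'standartized_dimension': [['usd'], ['usd']],
        'quantity': ['mln', 'mln'],
        'standartized_quantity': ['mln', 'mln'],
        'year': ['2020', '2019'],
        'table_title_raw': ['Key figures', 'Cost breakdown'],
        'chapter_name': ['Overview', 'Financials'],
        'priority': [1, 1],
    })

    stat_res_example, matched_example = result_metrics(target_example, big_df_example)
    print(stat_res_example[['found_keyword', 'found_value', 'found_period']])
    print(matched_example[['id', 'word_match', 'value_match', 'period_match']])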