# Untitled
#
# mail@pastecode.io avatar
# unknown
# python
# a year ago
# 16 kB
# 11
# Indexable
# Never
import pandas as pd
import numpy as np

# Flag columns written to ``big_df``, ordered from the shallowest to the
# deepest matching stage.
_MATCH_FLAGS = ('word_match', 'value_match', 'dimension_match', 'quantity_match', 'period_match')

# Per-target-row columns describing the best match that was found.
_FOUND_COLUMNS = ('priority', 'found_on_page', 'found_keyword', 'found_id',
                  'found_value', 'found_dimension', 'found_quantity', 'found_period')

# Per-target-row columns listing everything extracted on the target's page.
_PAGE_COLUMNS = ('all_keywords', 'all_values', 'all_dimensions', 'all_quantities',
                 'all_periods', 'all_tables', 'all_chapters')

# A target value also matches when the extracted value is scaled by one of
# these factors (or is the negated absolute value scaled by it).
_VALUE_SCALES = (1, 10, 100, 1000)

# Temporary column carrying each target row's original position through groupby.
_ROW_POSITION = '_result_metrics_row_position'


def _summarise_match(found, selection, flags):
    """Fill page/keyword/priority from the top-priority row of ``selection``.

    Returns ``(found, index_labels, flags)`` where ``index_labels`` are the
    index labels of the first row holding the maximum ``priority``.
    """
    top_priority = selection['priority'].max()
    top_rows = selection[selection['priority'] == top_priority]
    # The caller filters candidates on a single id, so this keeps the first row.
    first_row = top_rows.drop_duplicates(subset=['id'])
    found['found_on_page'] = ''.join(first_row['page'].astype(str).unique())
    found['found_keyword'] = ''.join(first_row['keyword'].unique())
    found['priority'] = top_priority
    index_labels = top_rows.drop_duplicates(subset=['priority']).index.values
    return found, index_labels, flags


def _match_target_row(candidates, value, dimension, quantity, period):
    """Narrow ``candidates`` stage by stage: value, dimension, quantity, period.

    Matching stops at the first stage that leaves no rows; the previous
    (non-empty) stage is then summarised.

    Returns ``(found, index_labels, flags)``: ``found`` maps every name in
    ``_FOUND_COLUMNS`` to its value (``None`` for stages that were not
    reached), ``index_labels`` are the rows to mark and ``flags`` the match
    columns to set on them.
    """
    found = dict.fromkeys(_FOUND_COLUMNS)
    if len(candidates) == 0:
        return found, (), ()

    # Stage 1: value, allowing sign and order-of-magnitude differences.
    value_column = candidates['value']
    value_mask = value_column == value
    for scale in _VALUE_SCALES:
        value_mask = (value_mask
                      | (value_column == value * scale)
                      | (value_column == -abs(value) * scale))
    value_rows = candidates.loc[value_mask].copy()
    if len(value_rows) == 0:
        return _summarise_match(found, candidates, _MATCH_FLAGS[:1])
    value_rows['value'] = value_rows['value'].astype(str)
    found['found_value'] = ''.join(value_rows['value'].unique())

    # Stage 2: any of the row's standardised dimensions is in the target's.
    dimension_rows = value_rows.explode('standartized_dimension')
    dimension_rows = dimension_rows.loc[
        dimension_rows['standartized_dimension'].isin(dimension)
    ].copy()  # copy so the fillna assignment below does not hit a view
    if len(dimension_rows) == 0:
        return _summarise_match(found, value_rows, _MATCH_FLAGS[:2])
    dimension_rows['dimension'] = dimension_rows['dimension'].fillna('empty')
    found['found_dimension'] = ''.join(dimension_rows['dimension'].unique())

    # Stage 3: standardised quantity.
    quantity_rows = dimension_rows.loc[dimension_rows['standartized_quantity'] == quantity]
    if len(quantity_rows) == 0:
        return _summarise_match(found, dimension_rows, _MATCH_FLAGS[:3])
    found['found_quantity'] = ''.join(quantity_rows['quantity'].unique())

    # Stage 4: period. Only a full match reports ``found_id``.
    period_rows = quantity_rows.loc[quantity_rows['year'] == period]
    if len(period_rows) == 0:
        return _summarise_match(found, quantity_rows, _MATCH_FLAGS[:4])
    found['found_period'] = ''.join(period_rows['year'].unique())
    found['found_id'] = ''.join(period_rows['id'].astype(str).unique())
    return _summarise_match(found, period_rows, _MATCH_FLAGS)


def result_metrics(target, big_df):
    """Compare extracted entities in ``big_df`` against the expected ``target`` rows.

    For every ``(doc_name, industry)`` group of ``target`` the function:

    * sets ``valid_id`` on ``big_df`` rows whose ``id`` is one of the group's
      ``standart_id`` values;
    * sets ``valid_year``: for rows without a valid id, when ``year`` contains
      the group's largest target year; for rows with a valid id, when ``year``
      equals the target period recorded for that id;
    * matches each target row against the not-yet-matched ``big_df`` rows with
      the same id, narrowing by value, dimension, quantity and period, and
      sets the corresponding ``*_match`` flags on the best (highest
      ``priority``) row of the deepest stage reached.

    Args:
        target: DataFrame with columns ``doc_name``, ``industry``, ``page``,
            ``preprocessed_value``, ``standartized_dimension`` (list-like per
            row), ``standartized_quantity``, ``only_year`` (strings such as
            ``'2020'`` or ``'2019-2020'``) and ``standart_id``.
        big_df: DataFrame with columns ``doc_name``, ``industry``, ``id``,
            ``year``, ``page``, ``keyword``, ``value``, ``dimension``,
            ``quantity``, ``table_title_raw``, ``chapter_name``, ``priority``,
            ``standartized_dimension`` and ``standartized_quantity``.
            NOTE: it is modified in place - the ``*_match``, ``valid_id`` and
            ``valid_year`` columns are (re)created on the caller's frame.

    Returns:
        ``(stat_res, valid_df)``: ``stat_res`` is a copy of ``target`` with the
        ``_FOUND_COLUMNS`` and ``_PAGE_COLUMNS`` columns added, each value
        placed on the target row it was computed for; ``valid_df`` is a copy
        of the ``big_df`` rows whose ``valid_year`` is 1.
    """
    row_count = len(target)
    # Results are stored by original row position. Groups are processed in
    # first-appearance order, which differs from row order whenever a group's
    # rows are not contiguous in ``target``. Rows that groupby drops (NaN
    # group key) simply keep ``None``.
    results = {column: [None] * row_count for column in _FOUND_COLUMNS + _PAGE_COLUMNS}

    positioned = target.assign(**{_ROW_POSITION: np.arange(row_count)})
    target_grouped = positioned.groupby(['doc_name', 'industry'], as_index=False, sort=False).agg(list)

    for flag in _MATCH_FLAGS:
        big_df[flag] = 0
    big_df['valid_id'] = 0
    big_df['valid_year'] = 0

    group_columns = ('doc_name', 'industry', 'page', 'preprocessed_value',
                     'standartized_dimension', 'standartized_quantity',
                     'only_year', 'standart_id', _ROW_POSITION)

    for (docname, industry, pages, values, dimensions, quantities,
         periods, ids, positions) in zip(*(target_grouped[name] for name in group_columns)):

        group_mask = (big_df['doc_name'] == docname) & (big_df['industry'] == industry)
        # Independent copy: match flags are tracked on it while rows are consumed.
        selection = big_df.loc[group_mask].copy()

        # Rows whose id is expected for this document.
        big_df.loc[group_mask & big_df['id'].isin(ids), 'valid_id'] = 1

        # Rows with an unexpected id are kept only for the latest target year.
        valid_years = pd.Series(
            [item.split('-') if '-' in item else item for item in set(periods)]
        ).explode().astype(int).to_list()
        max_valid_year = max(valid_years)
        big_df.loc[
            group_mask
            & (big_df['valid_id'] == 0)
            & big_df['year'].str.contains(str(max_valid_year), na=False),
            'valid_year',
        ] = 1

        # Rows with an expected id must carry exactly the period of that id.
        valid_rows = group_mask & (big_df['valid_id'] == 1)
        if valid_rows.any():
            expected_year = big_df.loc[valid_rows, 'id'].map(dict(zip(ids, periods)))
            same_year = expected_year == big_df.loc[valid_rows, 'year']
            big_df.loc[valid_rows, 'valid_year'] = same_year.astype(int).to_numpy()

        for page, value, dimension, quantity, period, standart_id, position in zip(
                pages, values, dimensions, quantities, periods, ids, positions):

            page_rows = selection[selection['page'] == page]
            results['all_keywords'][position] = page_rows['keyword'].to_list()
            results['all_values'][position] = page_rows['value'].to_list()
            results['all_dimensions'][position] = page_rows['dimension'].to_list()
            results['all_quantities'][position] = page_rows['quantity'].to_list()
            results['all_periods'][position] = page_rows['year'].to_list()
            results['all_tables'][position] = page_rows['table_title_raw'].unique()
            results['all_chapters'][position] = page_rows['chapter_name'].unique()

            # Candidates: same id (on any page) and not claimed by an earlier target row.
            candidate_mask = selection['id'] == standart_id
            for flag in _MATCH_FLAGS:
                candidate_mask = candidate_mask & (selection[flag] != 1)

            found, index_labels, flags = _match_target_row(
                selection.loc[candidate_mask], value, dimension, quantity, period)

            for column, item in found.items():
                results[column][position] = item

            for label in index_labels:
                for flag in flags:
                    big_df.at[label, flag] = 1
                    selection.at[label, flag] = 1

    stat_res = target.copy()
    for column in _FOUND_COLUMNS + _PAGE_COLUMNS:
        stat_res[column] = results[column]

    return stat_res, big_df[big_df['valid_year'] == 1].copy()






    # This will have to be moved into main

    # #Validation
    # print('мэтч по таргету')
    # print(len(stat_res[~pd.isna(stat_res['found_keyword'])]))
    # print(len(stat_res[~pd.isna(stat_res['found_value'])]))
    # print(len(stat_res[~pd.isna(stat_res['found_dimension'])]))
    # print(len(stat_res[~pd.isna(stat_res['found_quantity'])]))
    # print(len(stat_res[~pd.isna(stat_res['found_period'])]))

    # print('мэтч по результату')
    # print(len(big_df[big_df['word_match'] == 1]))
    # print(len(big_df[big_df['value_match'] == 1]))
    # print(len(big_df[big_df['dimension_match'] == 1]))
    # print(len(big_df[big_df['quantity_match'] == 1]))
    # print(len(big_df[big_df['period_match'] == 1]))