Untitled

 avatar
unknown
python
a year ago
12 kB
1
Indexable
from string import punctuation
import pandas as pd
import numpy as np
import re
from collections import defaultdict

from data_preprocessing import data_initialization_preprocessing

# Punctuation characters that are stripped out of keywords; every ASCII
# punctuation mark except ',' and '/' is listed here.
not_allowed_keyword = [symbol for symbol in punctuation if symbol not in (',', '/')]

def remove_comma_alpha(s):
    """Reduce *s* to a numeric-looking string.

    The input is converted with ``str()``. Alphabetic characters and all
    punctuation other than ``,``, ``.``, ``(`` and ``-`` are dropped (other
    characters, including whitespace and digits, are kept). Every ``(`` is
    then turned into ``-``. Finally each comma is resolved by the number of
    characters that follow it: with three or more it is removed, with two or
    fewer it is replaced by ``.``.

    Returns the resulting string; no numeric conversion is attempted here.
    """
    retained_marks = {',', '.', '(', '-'}
    discarded_marks = set(punctuation) - retained_marks

    text = "".join(
        ch for ch in str(s)
        if not ch.isalpha() and ch not in discarded_marks
    )
    text = text.replace("(", "-")

    # The decision for a comma depends only on how many characters follow it,
    # and rewriting earlier commas never changes that suffix, so one
    # left-to-right pass is enough.
    total = len(text)
    pieces = []
    for index, ch in enumerate(text):
        if ch != ",":
            pieces.append(ch)
            continue
        trailing = total - index - 1
        if trailing < 3:
            pieces.append(".")
        # otherwise the comma is simply dropped
    return "".join(pieces)

# Punctuation that marks a raw value as needing clean-up before float();
# '.' and '-' are excluded because they appear in plain numeric literals.
_VALUE_NOISE_CHARS = frozenset(punctuation) - {'.', '-'}


def _append_once(items, item):
    """Append *item* to the list *items* unless it is already present."""
    if item not in items:
        items.append(item)


def _normalize_keyword(text):
    """Lower-case *text* and drop spaces and characters in ``not_allowed_keyword``."""
    return ''.join(ch for ch in text.lower().replace(' ', '') if ch not in not_allowed_keyword)


def _is_blank(cell):
    """Return True when *cell* is NA, an empty string, or a single space."""
    return pd.isna(cell) or cell == '' or cell == ' '


def _parse_value(raw):
    """Convert a raw cell to ``float``, or return ``None`` when that is impossible."""
    if pd.isna(raw):
        return None

    text = str(raw)
    if any(ch.isalpha() or ch in _VALUE_NOISE_CHARS for ch in text):
        # Letters or stray punctuation present: strip them first.
        raw = remove_comma_alpha(raw)
    elif not any(ch.isdigit() for ch in text):
        return None

    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def _resolve_value(known_values, value):
    """Look *value* up among *known_values*, allowing sign and x10 variants.

    Returns ``(matched, key)`` where *key* is the entry of *known_values* that
    matched (or the unchanged *value* when nothing matched). For a non-zero
    value the variants ``-abs(value)``, ``-abs(value) * 10`` and ``value * 10``
    take precedence, in that order, over an exact match.
    """
    if value is None:
        return None in known_values, None
    if value:
        for candidate in (-abs(value), -abs(value) * 10, value * 10):
            if candidate in known_values:
                return True, candidate
    return value in known_values, value


def _match_row(doc_entities, standart_dimensions, standart_quantities,
               keyword, value, dimension, quantity, period):
    """Match one row against the entity index of a single document.

    *doc_entities* is nested as keyword -> value -> dimension -> quantity ->
    list of periods. Matching proceeds level by level and stops at the first
    level that fails.

    Returns a 5-item list ``[keyword, value, dimension, quantity, period]``.
    A level that did not match (and every level below it) holds ``None``; a
    level that matched on an empty/falsy key holds the string ``'empty'``.
    """
    found = [None] * 5
    if keyword not in doc_entities:
        return found
    found[0] = keyword

    by_value = doc_entities[keyword]
    matched, value = _resolve_value(by_value, value)
    if not matched:
        return found
    found[1] = 'empty' if value is None else value

    # Swap the row's dimension for an extracted one that shares an alias with
    # it. The comparison uses the current (possibly already swapped)
    # dimension, so the last overlapping candidate wins.
    # NOTE(review): both lookups assume the key exists in standart_dimensions
    # (or that it is a defaultdict) - confirm against the caller.
    by_dimension = by_value[value]
    for candidate in by_dimension:
        if set(standart_dimensions[dimension]) & set(standart_dimensions[candidate]):
            dimension = candidate
    if dimension not in by_dimension:
        return found
    found[2] = dimension if dimension else 'empty'

    by_quantity = by_dimension[dimension]
    for candidate in by_quantity:
        try:
            same_quantity = standart_quantities[quantity] == standart_quantities[candidate]
        except KeyError:
            # A quantity missing from the reference table cannot be
            # canonicalised, so it is treated as a non-match.
            continue
        if same_quantity:
            quantity = candidate
    if quantity not in by_quantity:
        return found
    found[3] = quantity if quantity else 'empty'

    if period in by_quantity[quantity]:
        # A matched empty period is reported as 'empty' so it can be told
        # apart from "period not found" (None).
        found[4] = 'empty' if period is None else period
    return found


def _find_nth(haystack, needle, n):
    """Return the index of the *n*-th occurrence of *needle*, or -1 if absent."""
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start + len(needle))
        n -= 1
    return start


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place, standart_dimensions, standart_quantities):
    """Compare target rows with extracted entities and record what matched.

    ``data_initialization_preprocessing(targetPath, predJsonsPath, block, place)``
    supplies the target table (``result_df``), the extracted ``sequences`` and
    a mapping from ``doc_name + industry`` to a JSON file path. Entities with
    ``priority == 2`` are indexed per document, and every row of ``result_df``
    is matched against that index level by level: keyword, value, dimension,
    quantity, period.

    Args:
        targetPath, predJsonsPath, block, place: forwarded unchanged to
            ``data_initialization_preprocessing``.
        standart_dimensions: mapping from a dimension name to a list of its
            aliases. Modified in place: the ``None`` and ``'not_exist'`` keys
            are set and extra aliases are added to three existing entries.
        standart_quantities: mapping from a quantity name to its canonical
            form. Modified in place: three keys are set.

    Returns:
        ``result_df`` with the columns ``keyword_inter``, ``found_value``,
        ``found_dimension``, ``found_quantity``, ``found_period``,
        ``file_path_json``, ``file_path_pdf`` and ``company_name`` added, and
        ``doc_name`` replaced by the file name taken from ``file_path_pdf``.
        In the ``found_*`` columns ``None`` means "no match" and ``'empty'``
        means "matched on an empty value".
    """
    standart_dimensions[None] = [None]
    standart_dimensions['not_exist'] = [None]
    # Idempotent, so repeated calls do not keep growing the caller's lists.
    _append_once(standart_dimensions['s'], 'usd')
    _append_once(standart_dimensions['azerbajjani manats'], 'azn')
    _append_once(standart_dimensions['azerbajani manats'], 'azn')
    standart_quantities[None] = None
    standart_quantities['milion'] = 'миллион'
    standart_quantities['1000'] = 'тысячи'

    result_df, sequences, filePathDictJSON = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)

    # document -> keyword -> value -> dimension -> quantity -> [periods]
    entities_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))))
    for seq_key, sequence in sequences.items():
        for page in sequence['pages'].values():
            for table in page['tables']['id'].values():
                for entity in table['entities']:
                    if entity['priority'] != 2:
                        continue
                    keyword_key = _normalize_keyword(entity['iclass_txt'])
                    entities_dict[seq_key][keyword_key][entity['ivalue_txt']][entity['unit1_txt']][entity['scaler1_txt']].append(entity['iperiod_txt'])

    wordInter = []
    wordNumInter = []
    wordNumDimInter = []
    wordNumDimQuantityInter = []
    wordNumDimQuantityPeriodInter = []
    found_columns = (wordInter, wordNumInter, wordNumDimInter, wordNumDimQuantityInter, wordNumDimQuantityPeriodInter)

    rows = zip(
        result_df['doc_name'],
        result_df['industry'],
        result_df['keyword_primary'],
        result_df['value'],
        result_df['only_dimension'],
        result_df['only_quantity'],
        result_df['normalized_period'],
    )
    for docname, industry, keyword, value, dimension, quantity, period in rows:
        keyword = _normalize_keyword(str(keyword))
        value = _parse_value(value)
        dimension = 'empty' if _is_blank(dimension) else dimension.lower().strip()
        quantity = None if _is_blank(quantity) else quantity.lower().strip()
        period = None if _is_blank(period) else str(period)

        matches = _match_row(
            entities_dict[docname + industry],
            standart_dimensions,
            standart_quantities,
            keyword,
            value,
            dimension,
            quantity,
            period,
        )
        for column, match in zip(found_columns, matches):
            column.append(match)

    result_df["keyword_inter"] = wordInter
    result_df["found_value"] = wordNumInter
    result_df["found_dimension"] = wordNumDimInter
    result_df["found_quantity"] = wordNumDimQuantityInter
    result_df["found_period"] = wordNumDimQuantityPeriodInter
    result_df['file_path_json'] = (result_df['doc_name'] + result_df['industry']).map(filePathDictJSON)
    # The PDF path keeps everything from the third '/' of the JSON path onward.
    result_df['file_path_pdf'] = result_df['file_path_json'].apply(lambda x: '1_data/reports/package_3_reports_fin_table' + x[_find_nth(x, '/', 3):].replace('.json', '.pdf'))
    result_df['doc_name'] = result_df['file_path_pdf'].apply(lambda x: x[x.rfind('/') + 1:])
    result_df['company_name'] = result_df['file_path_pdf'].apply(lambda x: x.split('/')[4])
    return result_df