# Paste metadata (pastecode.io): Untitled | mail@pastecode.io avatar | unknown
# | python | a year ago | 5.6 kB | 1 | Indexable | Never
from string import punctuation
import pandas as pd
import numpy as np
import re

from data_preprocessing import data_initialization_preprocessing
from generating_pairs import generating_pairs_from_grouped_df

def remove_comma_alpha(s):
    """Reduce *s* to a numeric-looking string.

    The input is converted with ``str()`` and then:

    * every alphabetic character is dropped, together with every ASCII
      punctuation character other than ``,``, ``.``, ``(`` and ``-``;
    * each ``(`` becomes ``-`` (presumably so that a parenthesised amount
      such as ``(123)`` reads as negative -- confirm with the data owner);
    * commas are resolved left to right: a comma followed by three or more
      characters is removed, any other comma is turned into ``.``.

    Whitespace and digits are left untouched. Returns the resulting string;
    no conversion to a number is attempted here.
    """
    dropped = set(punctuation) - {',', '.', '(', '-'}
    kept = [ch for ch in str(s) if not ch.isalpha() and ch not in dropped]
    text = "".join(kept).replace("(", "-")

    comma_at = text.find(",")
    while comma_at != -1:
        tail = text[comma_at + 1:]
        # Three or more trailing characters: the comma is dropped;
        # otherwise it is rewritten as a decimal point.
        replacement = "" if len(tail) >= 3 else "."
        text = text[:comma_at] + replacement + tail
        comma_at = text.find(",")
    return text

def _parse_target_value(value, cleanup_chars, found_for_keyword):
    """Return *value* as a ``float``, or ``None`` if it holds no usable number.

    Values containing letters or any character from *cleanup_chars* are first
    normalised with ``remove_comma_alpha``. Values without a single digit are
    treated as missing. When the conversion fails, a diagnostic line is
    printed that also shows *found_for_keyword* (what the model extracted for
    this keyword, or ``None`` if nothing was extracted).
    """
    text = str(value)
    if any(ch.isalpha() or ch in cleanup_chars for ch in text):
        candidate = remove_comma_alpha(value)
    elif not any(ch.isdigit() for ch in text):
        return None
    else:
        candidate = value

    try:
        return float(candidate)
    except (TypeError, ValueError, OverflowError):
        # Best effort: report the odd value and carry on with "no value".
        print(f"странное значение: {candidate}, то что смогла найти модель: {found_for_keyword}")
        return None


def _is_mentioned(token, padded_text):
    """Tell whether *token* occurs in *padded_text* delimited by non-letters.

    *padded_text* is expected to be wrapped in spaces by the caller so that a
    token at the very start or end still has a delimiter on both sides.
    """
    if token is None:
        return False
    pattern = r'[^a-z]' + re.escape(token) + r'[^a-z]'
    return re.search(pattern, padded_text, flags=re.IGNORECASE) is not None


def _match_year(years, search_year):
    """Return the year label to record given the extracted *years*.

    ``None`` when *search_year* is not among *years*; otherwise the year
    itself, or ``'empty'`` when the expected year is missing and the model
    also recorded a missing year.
    """
    if search_year not in years:
        return None
    return 'empty' if pd.isna(search_year) else search_year


def _match_row(keyword, value, dimension, year, sequences, cleanup_chars):
    """Match one target row against the structure extracted for that row.

    Returns a 5-tuple ``(keyword_inter, found_value, found_dimension,
    found_quantity, found_year)``. Each level is only checked when the
    previous one matched; unmatched levels are ``None``.
    """
    search_keyword = str(keyword).lower().replace(' ', '')
    search_year = None if pd.isna(year) else str(int(year))

    # Resolve the keyword lookup once, *before* parsing the value: the
    # diagnostic printed on a parse failure must not index a keyword that the
    # model never produced (or a missing ``sequences``).
    keyword_found = sequences is not None and search_keyword in sequences
    found_values = sequences[search_keyword] if keyword_found else None

    value = _parse_target_value(value, cleanup_chars, found_values)

    if not keyword_found:
        return None, None, None, None, None
    if value not in found_values:
        return keyword, None, None, None, None

    found_value = 'empty' if pd.isna(value) else value
    by_dim_quantity = found_values[value]

    if pd.isna(dimension):
        # No expected dimension: only an extracted (None, None) entry matches.
        if (None, None) not in by_dim_quantity:
            return keyword, found_value, None, None, None
        found_year = _match_year(by_dim_quantity[(None, None)], search_year)
        return keyword, found_value, 'empty', 'empty', found_year

    padded_dimension = ' ' + dimension + ' '
    found_dimension = None
    found_quantity = None
    for dimquan in by_dim_quantity:
        if _is_mentioned(dimquan[0], padded_dimension):
            found_dimension = dimquan[0]
        if _is_mentioned(dimquan[1], padded_dimension):
            found_quantity = dimquan[1]

    # The two parts may have been picked from different extracted pairs, so
    # the combined pair is not guaranteed to exist as a key.
    pair = (found_dimension, found_quantity)
    if found_dimension is not None and found_quantity is not None and pair in by_dim_quantity:
        found_year = _match_year(by_dim_quantity[pair], search_year)
    else:
        found_year = None

    return keyword, found_value, found_dimension, found_quantity, found_year


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place):
    """Record, per target row, which expected fields the model also extracted.

    The merged frame is produced by ``data_initialization_preprocessing`` and
    one lookup structure per row by ``generating_pairs_from_grouped_df``.
    From the way it is indexed here, each structure is used as
    ``sequences[keyword][value][(dimension, quantity)] -> years`` where the
    keyword is lower-cased with spaces removed; an entry may also be ``None``.

    Args:
        targetPath, predJsonsPath, block, place: forwarded unchanged to
            ``data_initialization_preprocessing``; their meaning is defined
            there.

    Returns:
        The merged DataFrame with five added columns: ``keyword_inter``,
        ``found_value``, ``found_dimension``, ``found_quantity`` and
        ``found_year``. A cell holds the matched item, ``'empty'`` where the
        expected item is missing and the model recorded it as missing too,
        or ``None`` where nothing matched.
    """
    merged_df = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)
    allSequences = generating_pairs_from_grouped_df(merged_df)

    # Punctuation that forces a value through remove_comma_alpha before
    # parsing; '.' and '-' are legitimate parts of a plain number.
    cleanup_chars = frozenset(punctuation) - {'.', '-'}

    rows = zip(
        merged_df['keyword_primary'],
        merged_df['value'],
        merged_df['dimension'],
        merged_df['year'],
        allSequences,
    )
    results = [
        _match_row(keyword, value, dimension, year, sequences, cleanup_chars)
        for keyword, value, dimension, year, sequences in rows
    ]

    merged_df["keyword_inter"] = [row[0] for row in results]
    merged_df["found_value"] = [row[1] for row in results]
    merged_df["found_dimension"] = [row[2] for row in results]
    merged_df["found_quantity"] = [row[3] for row in results]
    merged_df["found_year"] = [row[4] for row in results]

    return merged_df