Untitled
unknown
python
2 years ago
5.6 kB
8
Indexable
from string import punctuation
import pandas as pd
import numpy as np
import re
from data_preprocessing import data_initialization_preprocessing
from generating_pairs import generating_pairs_from_grouped_df
def remove_comma_alpha(s):
    """Strip letters and stray punctuation from *s* and normalise its commas.

    The input is converted with ``str()``. Alphabetic characters and every
    punctuation character except ``,``, ``.``, ``(`` and ``-`` are dropped.
    An opening parenthesis is turned into a minus sign (the closing one is
    already removed as punctuation), so ``"(1,234)"`` becomes ``"-1234"``.

    Each comma is then resolved from left to right:

    * followed directly by three or more digits -> treated as a thousands
      separator and removed;
    * otherwise -> treated as a decimal comma and replaced with ``"."``.

    Only the run of digits immediately after the comma is counted. Counting
    every remaining character (as was done previously) let leftover
    whitespace tip the decision, e.g. ``"1,25 kg"`` was turned into
    ``"125 "`` instead of ``"1.25 "``.

    Returns the cleaned string; it is not guaranteed to be parseable by
    ``float()`` (whitespace and repeated dots/minus signs are kept).
    """
    dropped = set(punctuation) - {",", ".", "(", "-"}
    cleaned = "".join(
        ch for ch in str(s) if not ch.isalpha() and ch not in dropped
    )
    cleaned = cleaned.replace("(", "-")

    while "," in cleaned:
        pos = cleaned.find(",")
        tail = cleaned[pos + 1:]
        # Length of the leading digit run of the tail.
        digits_after = len(tail) - len(tail.lstrip("0123456789"))
        separator = "" if digits_after >= 3 else "."
        cleaned = cleaned[:pos] + separator + tail
    return cleaned
def _to_float_or_none(raw_value, model_matches):
    """Return ``float(raw_value)``, or log the value and return None on failure."""
    try:
        return float(raw_value)
    except (TypeError, ValueError, OverflowError):
        print(f"странное значение: {raw_value}, то что смогла найти модель: {model_matches}")
        return None


def _normalize_value(value, special_chars, model_matches):
    """Turn a raw target value into a float, or None when it holds no number."""
    text = str(value)
    if any(ch.isalpha() or ch in special_chars for ch in text):
        # Letters / punctuation present: clean the text before parsing.
        return _to_float_or_none(remove_comma_alpha(value), model_matches)
    if not any(ch.isdigit() for ch in text):
        return None
    return _to_float_or_none(value, model_matches)


def _mentions(token, haystack):
    """Tell whether *token* occurs in *haystack* delimited by non ``a-z`` characters."""
    if token is None:
        return False
    pattern = r'[^a-z]' + re.escape(token) + r'[^a-z]'
    return re.search(pattern, haystack, flags=re.IGNORECASE) is not None


def _match_dimension_quantity(dimension, pairs):
    """Return the last dimension and last quantity from *pairs* found in *dimension*."""
    # Pad with spaces so a token at either end of the text still has a
    # delimiter on both sides; str() keeps non-string cells from raising.
    haystack = f" {dimension} "
    matched_dimension = None
    matched_quantity = None
    for pair in pairs:
        if _mentions(pair[0], haystack):
            matched_dimension = pair[0]
        if _mentions(pair[1], haystack):
            matched_quantity = pair[1]
    return matched_dimension, matched_quantity


def _year_marker(search_year, years_by_pair, pair):
    """Return the year marker for *pair*: the year, 'empty' for a matched None, else None."""
    # The dimension and the quantity may have been matched from two different
    # pairs, so the combined key is not guaranteed to exist.
    if pair not in years_by_pair or search_year not in years_by_pair[pair]:
        return None
    return 'empty' if search_year is None else search_year


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place):
    """Compare target rows with the model's extracted sequences, level by level.

    Builds the merged frame with ``data_initialization_preprocessing`` and the
    per-row sequences with ``generating_pairs_from_grouped_df``, then walks the
    ``keyword_primary``, ``value``, ``dimension`` and ``year`` columns in
    lockstep with those sequences. For every row it records how deep the
    target could be followed in the sequences:
    keyword -> value -> (dimension, quantity) -> year.

    NOTE(review): each ``sequences`` item is used as a nested mapping
    ``keyword -> value -> (dimension, quantity) -> container of years``;
    confirm against ``generating_pairs_from_grouped_df``.

    Five columns are added to the merged frame:

    * ``keyword_inter``   - the keyword when it was found, else None;
    * ``found_value``     - the parsed value, ``'empty'`` when a missing value
      was matched, else None;
    * ``found_dimension`` / ``found_quantity`` - the matched dimension and
      quantity, ``'empty'`` when the target has no dimension and a
      ``(None, None)`` pair exists, else None;
    * ``found_year``      - the year as a string, ``'empty'`` when a missing
      year was matched, else None.

    Values that cannot be converted to float are printed and treated as
    missing. Returns the merged frame with the new columns.
    """
    merged_df = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)
    allSequences = generating_pairs_from_grouped_df(merged_df)

    special_chars_exception = set(punctuation) - {'.', '-'}
    # One (keyword, value, dimension, quantity, year) tuple per input row.
    results = []

    for keyword, value, dimension, year, sequences in zip(
        merged_df['keyword_primary'],
        merged_df['value'],
        merged_df['dimension'],
        merged_df['year'],
        allSequences,
    ):
        searchKeyword = str(keyword).lower().replace(' ', '')
        searchYear = None if pd.isna(year) else str(int(year))

        keyword_found = sequences is not None and searchKeyword in sequences
        # Resolved up front so that the diagnostic print in the value parser
        # cannot itself fail on a missing keyword or missing sequences.
        by_value = sequences[searchKeyword] if keyword_found else None

        value = _normalize_value(value, special_chars_exception, by_value)

        if not keyword_found:
            results.append((None, None, None, None, None))
            continue
        if value not in by_value:
            results.append((keyword, None, None, None, None))
            continue

        found_value = 'empty' if pd.isna(value) else value
        years_by_pair = by_value[value]

        if pd.isna(dimension):
            # No target dimension: only a (None, None) pair counts as a match.
            if (None, None) in years_by_pair:
                found_dimension = 'empty'
                found_quantity = 'empty'
                found_year = _year_marker(searchYear, years_by_pair, (None, None))
            else:
                found_dimension = None
                found_quantity = None
                found_year = None
        else:
            found_dimension, found_quantity = _match_dimension_quantity(
                dimension, years_by_pair
            )
            if found_dimension is not None and found_quantity is not None:
                found_year = _year_marker(
                    searchYear, years_by_pair, (found_dimension, found_quantity)
                )
            else:
                found_year = None

        results.append(
            (keyword, found_value, found_dimension, found_quantity, found_year)
        )

    merged_df["keyword_inter"] = [row[0] for row in results]
    merged_df["found_value"] = [row[1] for row in results]
    merged_df["found_dimension"] = [row[2] for row in results]
    merged_df["found_quantity"] = [row[3] for row in results]
    merged_df["found_year"] = [row[4] for row in results]
    return merged_df