Untitled
unknown
python
2 years ago
5.6 kB
3
Indexable
from string import punctuation
import re

import numpy as np
import pandas as pd

from data_preprocessing import data_initialization_preprocessing
from generating_pairs import generating_pairs_from_grouped_df

# Punctuation that remove_comma_alpha strips; ',', '.', '(' and '-' are kept
# because they carry numeric meaning (separators, sign, "(" as a minus marker).
_STRIPPED_PUNCTUATION = frozenset(punctuation) - frozenset(",.(-")

# Punctuation whose presence means a raw value needs cleaning before float().
_NON_NUMERIC_PUNCTUATION = frozenset(punctuation) - frozenset(".-")


def remove_comma_alpha(s):
    """Strip letters and most punctuation from *s* and normalise commas.

    Letters and every punctuation character except ``,`` ``.`` ``(`` ``-``
    are dropped, then ``(`` is replaced by ``-``.  Each comma (left to right)
    is removed when at least three characters follow it, otherwise it is
    turned into a dot.

    Args:
        s: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string (not guaranteed to be parseable by ``float``).
    """
    cleaned = "".join(
        ch
        for ch in str(s)
        if not ch.isalpha() and ch not in _STRIPPED_PUNCTUATION
    )
    cleaned = cleaned.replace("(", "-")
    while "," in cleaned:
        head, _, tail = cleaned.partition(",")
        # >= 3 trailing characters: drop the comma; fewer: make it a dot.
        cleaned = head + tail if len(tail) >= 3 else head + "." + tail
    return cleaned


def _parse_value(value, found):
    """Convert a raw cell value to ``float``, or return ``None``.

    *found* is only used in the diagnostic message printed when the
    conversion fails; it may be ``None``.
    """
    text = str(value)
    if any(ch.isalpha() or ch in _NON_NUMERIC_PUNCTUATION for ch in text):
        candidate = remove_comma_alpha(value)
    elif not any(ch.isdigit() for ch in text):
        # Nothing numeric at all: treat as missing without logging.
        return None
    else:
        candidate = value
    try:
        return float(candidate)
    except (TypeError, ValueError, OverflowError):
        print(f"странное значение: {candidate}, то что смогла найти модель: {found}")
        return None


def _mentions(token, padded_text):
    """Return True if *token* occurs in *padded_text* not adjacent to a letter."""
    if token is None:
        return False
    pattern = r"[^a-z]" + re.escape(token) + r"[^a-z]"
    return re.search(pattern, padded_text, flags=re.IGNORECASE) is not None


def _year_label(search_year, years):
    """Return the matched year, ``'empty'`` for a matched missing year, else None."""
    if search_year not in years:
        return None
    return "empty" if search_year is None else search_year


def checking_word_inter_and_value_inter(targetPath, predJsonsPath, block, place):
    """Compare target rows with the predicted sequences, level by level.

    The frame returned by ``data_initialization_preprocessing`` is walked row
    by row together with the per-row result of
    ``generating_pairs_from_grouped_df``.  Each per-row result is indexed as
    ``sequences[keyword][value][(dimension, quantity)]`` and the innermost
    object is tested for membership of the year string.

    Five columns are added to the frame:

    * ``keyword_inter``   - the keyword if it was found, else ``None``.
    * ``found_value``     - the parsed value (``'empty'`` for a matched
      missing value) if found under the keyword, else ``None``.
    * ``found_dimension`` / ``found_quantity`` - the matched dimension and
      quantity tokens, ``'empty'`` when the row has no dimension and a
      ``(None, None)`` entry exists, else ``None``.
    * ``found_year``      - the matched year string, ``'empty'`` for a
      matched missing year, else ``None``.

    Args:
        targetPath, predJsonsPath, block, place: forwarded unchanged to
            ``data_initialization_preprocessing``.

    Returns:
        The merged frame with the five columns above added.
    """
    merged_df = data_initialization_preprocessing(targetPath, predJsonsPath, block, place)
    all_sequences = generating_pairs_from_grouped_df(merged_df)

    found_keywords = []
    found_values = []
    found_dimensions = []
    found_quantities = []
    found_years = []

    rows = zip(
        merged_df["keyword_primary"],
        merged_df["value"],
        merged_df["dimension"],
        merged_df["year"],
        all_sequences,
    )
    for keyword, value, dimension, year, sequences in rows:
        search_keyword = str(keyword).lower().replace(" ", "")
        search_year = None if pd.isna(year) else str(int(year))

        keyword_found = sequences is not None and search_keyword in sequences
        by_value = sequences[search_keyword] if keyword_found else None

        # Pass the (possibly missing) lookup result for the log message only;
        # indexing sequences unconditionally here would raise for rows whose
        # keyword was not found.
        value = _parse_value(value, by_value)

        found_value = found_dim = found_qty = found_year = None
        if keyword_found and value in by_value:
            found_value = "empty" if pd.isna(value) else value
            pairs = by_value[value]
            if not pd.isna(dimension):
                padded = f" {dimension} "
                for dimquan in pairs:
                    if _mentions(dimquan[0], padded):
                        found_dim = dimquan[0]
                    if _mentions(dimquan[1], padded):
                        found_qty = dimquan[1]
                matched_pair = (found_dim, found_qty)
                # The two tokens may come from different keys, so the
                # combined pair is not guaranteed to exist.
                if (
                    found_dim is not None
                    and found_qty is not None
                    and matched_pair in pairs
                ):
                    found_year = _year_label(search_year, pairs[matched_pair])
            elif (None, None) in pairs:
                found_dim = "empty"
                found_qty = "empty"
                found_year = _year_label(search_year, pairs[(None, None)])

        found_keywords.append(keyword if keyword_found else None)
        found_values.append(found_value)
        found_dimensions.append(found_dim)
        found_quantities.append(found_qty)
        found_years.append(found_year)

    merged_df["keyword_inter"] = found_keywords
    merged_df["found_value"] = found_values
    merged_df["found_dimension"] = found_dimensions
    merged_df["found_quantity"] = found_quantities
    merged_df["found_year"] = found_years
    return merged_df
Editor is loading...