# Paste-site metadata (not part of the program):
# Untitled · unknown · plain_text · 2 years ago · 7.8 kB · 12 · Indexable
import sys
sys.path.insert(0, '/Users/yakovlev/tg_proj/WB/')
from objects.objects import Product, Review, ConfigReviewsFilter, ConfigProductsFilter
from parsers.reviews_parser import ReviewParser, ReviewActions
from parsers.products_parser import ProductFilter
from photo_handlers.PhDownloader import PhDownloader
from photo_handlers.PhDeletor import PhDeletor
from photo_handlers.ml.nudeNetDetector import NudeNetDetector
from consumers.Telegram import TgIntegration
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
import multiprocessing as mp
from typing import List
import time as t
import logging
import pickle
from pathlib import Path
from shutil import rmtree
logging.basicConfig(level=logging.CRITICAL)
from tqdm.contrib.concurrent import process_map
# products = [
# Product(article=166462256),
# Product(article=162259738),
# Product(article=146295378),
# Product(article=164914471),
# Product(article=155096368),
# Product(article=120466540),
# Product(article=155781684),
# Product(article=147088557),
# # Product(article=send nude),
# Product(article=153810479),
# Product(article=125611383),
# Product(article=143758866),
# Product(article=145664801),
# Product(article=155888162),
# Product(article=154638410),
# Product(article=143758388),
# Product(article=155785013),
# # Product(article=hello kit),
# Product(article=87248210),
# Product(article=143142431),
# Product(article=151985798),
# Product(article=148048918),
# Product(article=163265985),
# # Product(article=SHINRA TE),
# Product(article=151138774),
# Product(article=170589332),
# Product(article=150857627),
# Product(article=153810479),
# Product(article=166766028)
# ]
def _load_products_from_file(filepath):
    """Unpickle and return whatever object is stored at ``filepath``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading, so
    this must only be pointed at files written by this script itself.
    """
    with open(filepath, 'rb') as fp:
        return pickle.load(fp)


def _drop_duplicates(products: "List[Product]") -> "List[Product]":
    """Return ``products`` without repeats, keeping first occurrences in order.

    A product is dropped when its ``article`` was already seen, or when its
    ``(name, num_reviews)`` pair matches an earlier kept product.

    Assumes ``article``, ``name`` and ``num_reviews`` are hashable
    (TODO confirm against the ``Product`` definition).
    """
    seen_articles = set()
    seen_keys = set()
    unique = []
    for product in products:
        if product.article in seen_articles:
            continue
        seen_articles.add(product.article)
        key = (product.name, product.num_reviews)
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique.append(product)
    return unique


def _save_articles(articles: "List[Product]") -> None:
    """Pickle ``articles`` into the ``articles`` file in the working directory."""
    with open('articles', 'wb') as fp:
        pickle.dump(articles, fp)


def parse_products_queries():
    """Return the product list to process.

    The live search-based collection is currently disabled (see the
    commented-out block below); products are instead loaded from the pickled
    ``articles`` file in the current working directory. When more than 100
    products are loaded, the list is written back to that same file.

    Returns:
        The object unpickled from ``articles`` (presumably a list of
        ``Product`` -- verify against whatever wrote the file).

    Raises:
        OSError: if the ``articles`` file cannot be opened.
    """
    # Disabled live collection. The search queries are runtime data for the
    # target site and are therefore kept verbatim.
    # chrome_options = Options()
    # chrome_options.add_experimental_option("detach", True)
    # chrome_options.add_argument("--headless")
    # driver = webdriver.Chrome(
    # options=chrome_options
    # )
    # pf = ProductFilter(driver=driver, config=ConfigProductsFilter(min_num_reviews=50))
    # products = pf.get_products(search_query='прозрачные женские трусы', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='прозрачное белье женское', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='юбка женская аниме', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='чулки женские аниме', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='чулки женские', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='кружевные чулки', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='топик женский аниме', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='трусы женский аниме', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='кельвин клейн комплект женский', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='пирсинг соски', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='прозрачный топик', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='колготки чулки в сетку', num_pages=11, sort_condition='popular')
    # products += pf.get_products(search_query='белье kitty klaw', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='белье send nudes', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='женское нижнее белье с доступом', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='с доступом', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='сексуальное', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='боди кружевное', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='кружевной комплект женский', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='колготки женские сексуальные', num_pages=25, sort_condition='popular')
    # products += pf.get_products(search_query='портупея', num_pages=25, sort_condition='popular')
    # products += pf.get_products(search_query='стринги', num_pages=15, sort_condition='popular')
    products = _load_products_from_file('articles')

    print('parsed products with duplicates: ', len(products))
    # De-duplication is currently switched off, so both counts printed here
    # are the same number.
    # products = _drop_duplicates(products)
    # Presumably a guard against overwriting the cache with a tiny result
    # set -- TODO confirm the intent of the 100 threshold.
    if len(products) > 100:
        _save_articles(products)
    print('products w/o duplicates', len(products))
    return products
def per_product_logic(product: Product):
    """Run the whole per-product pipeline in a dedicated Chrome session.

    In order, this opens the product via ``ReviewActions``, switches on the
    "only with photos" filter, runs ``ReviewParser.get_reviews``, then calls
    ``PhDownloader.download_reviews``, ``NudeNetDetector.evaluate``,
    ``PhDeletor.delete_by_prob`` and ``TgIntegration.send`` for the product.

    Any exception raised along the way is logged and swallowed so that a
    single failing product does not abort the surrounding worker pool. The
    browser is always shut down afterwards; previously every call left a
    Chrome/chromedriver pair running.

    Args:
        product: The product to process; ``product.article`` is used in the
            failure log message.
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_experimental_option("detach", True)
        # chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(
            options=chrome_options
        )
        driver.implicitly_wait(15)
        ReviewActions().open(driver, product)
        # ReviewActions().click_this_product_variant(driver)
        ReviewActions().click_only_with_photos(driver)
        # The sleep gives the page time to reload with the photo-only filter
        # applied before the reviews are read.
        t.sleep(2)
        ReviewParser(driver=driver, product=product).get_reviews()
        PhDownloader().download_reviews(product)
        NudeNetDetector(product).evaluate()
        PhDeletor(product).delete_by_prob()
        TgIntegration().send(product)
    except Exception as e:
        logging.warning(f'CANT DOWNLOAD {product.article}')
        logging.warning(f"ERROR: {e}")
    finally:
        # Release the browser even on failure; a cleanup error must not mask
        # the outcome of the pipeline itself.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                logging.warning(f"ERROR: {quit_error}")
    # NOTE(review): the original indentation was lost, so it is unclear whether
    # these prints ran for every product or only on failure -- confirm.
    print(product)
    print('---------')
def run(products, processes: int = 4):
    """Apply ``per_product_logic`` to every product using a process pool.

    Blocks until all products have been handled; results are discarded.

    Args:
        products: Iterable of products, each passed to ``per_product_logic``.
        processes: Number of worker processes. Defaults to 4, the value that
            used to be hard-coded.
    """
    with mp.Pool(processes) as pool:
        pool.map(per_product_logic, products)
    # process_map(per_product_logic, products, max_workers=4)
def _clear_directory(directory: Path) -> None:
    """Delete every entry inside ``directory`` while keeping the directory.

    Only direct children are visited: files are unlinked and sub-directories
    are removed with ``rmtree``. The listing is snapshotted first so nothing is
    deleted underneath a live directory walk. A missing ``directory`` is a
    no-op.
    """
    if not directory.is_dir():
        return
    for entry in list(directory.iterdir()):
        if entry.is_file():
            entry.unlink()
        elif entry.is_dir():
            rmtree(entry)


def main():
    """Empty the photo directory, load the products, process them, print timing.

    The elapsed time printed at the end covers product loading and the
    per-product processing, not the initial directory cleanup.
    """
    _clear_directory(Path("/Users/yakovlev/tg_proj/WB/photo"))

    start_time = time.time()
    logging.info('starting products parsing')
    products = parse_products_queries()
    logging.info('finished product parsing, starting each product parse')
    run(products)
    print(f'--- {time.time() - start_time} seconds ---')
# Guarded entry point: under the "spawn" start method, multiprocessing workers
# re-import this module, and an unguarded call would start the whole pipeline
# again inside every worker.
if __name__ == "__main__":
    main()
# for product in products:
# main(product)
# (paste-site residue, not part of the program: "Leave a Comment")