Untitled
unknown
plain_text
a year ago
7.8 kB
6
Indexable
# Batch runner: loads a pickled list of products and, for each one, drives
# Chrome through the project's review helpers, downloads the review photos,
# runs the detector / deleter steps and hands the product to the Telegram
# consumer. Products are processed in parallel by a process pool.
import sys

# Must run before the project-local imports below so those packages resolve.
sys.path.insert(0, '/Users/yakovlev/tg_proj/WB/')

import logging
import multiprocessing as mp
import pickle
import time
import time as t
from pathlib import Path
from shutil import rmtree
from typing import List

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm.contrib.concurrent import process_map

from objects.objects import Product, Review, ConfigReviewsFilter, ConfigProductsFilter
from parsers.reviews_parser import ReviewParser, ReviewActions
from parsers.products_parser import ProductFilter
from photo_handlers.PhDownloader import PhDownloader
from photo_handlers.PhDeletor import PhDeletor
from photo_handlers.ml.nudeNetDetector import NudeNetDetector
from consumers.Telegram import TgIntegration

# The root logger stays at CRITICAL, which keeps third-party libraries quiet.
# This module's own logger is raised to INFO so that progress messages and
# per-product failures are actually emitted instead of being filtered out.
logging.basicConfig(level=logging.CRITICAL)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Pickle file (relative to the working directory) holding the product list.
ARTICLES_FILE = 'articles'
# Directory whose contents are wiped at the start of every run.
PHOTO_DIR = Path("/Users/yakovlev/tg_proj/WB/photo")
# The product list is written back to ARTICLES_FILE only above this size.
MIN_PRODUCTS_TO_SAVE = 100
# Default number of worker processes used by run().
DEFAULT_POOL_SIZE = 4
# Selenium implicit wait, in seconds.
IMPLICIT_WAIT_SECONDS = 15
# Pause after switching the review list to "only with photos", in seconds.
PHOTO_FILTER_DELAY_SECONDS = 2


def load_products_from_file(filepath):
    """Return the object unpickled from *filepath*.

    NOTE(review): unpickling can execute arbitrary code. Only load files that
    were written by ``save_articles`` on a trusted machine.
    """
    with open(filepath, 'rb') as fp:
        return pickle.load(fp)


def drop_duplicates(products: List[Product]):
    """Return *products* without repeats, keeping first occurrences in order.

    A product is dropped when its ``article`` has already been seen, or when
    an earlier kept product has the same ``(name, num_reviews)`` pair.
    """
    # Sets give O(1) membership checks.
    # Assumes article, name and num_reviews are hashable - TODO confirm.
    seen_articles = set()
    seen_name_counts = set()
    unique = []
    for product in products:
        if product.article in seen_articles:
            continue
        seen_articles.add(product.article)
        name_count = (product.name, product.num_reviews)
        if name_count in seen_name_counts:
            continue
        seen_name_counts.add(name_count)
        unique.append(product)
    return unique


def save_articles(articles: List[Product]) -> None:
    """Pickle *articles* into ``ARTICLES_FILE``, replacing its contents."""
    with open(ARTICLES_FILE, 'wb') as fp:
        pickle.dump(articles, fp)


def parse_products_queries():
    """Load the product list from ``ARTICLES_FILE`` and return it.

    An earlier revision built this list by scraping search results through
    ``ProductFilter.get_products``; that code was disabled, so the pickled
    list is the only source now. Deduplication through ``drop_duplicates`` is
    disabled as well, which means both counts printed below are identical.
    """
    products = load_products_from_file(ARTICLES_FILE)
    print('parsed products with duplicates: ', len(products))
    # Lists at or below the threshold never overwrite the stored file.
    if len(products) > MIN_PRODUCTS_TO_SAVE:
        save_articles(products)
    print('products w/o duplicates', len(products))
    return products


def per_product_logic(product: Product):
    """Run the full review-photo pipeline for a single *product*.

    Any exception raised by a step is logged with its traceback and
    swallowed, so one failing product does not abort the whole pool. The
    Chrome instance started here is always shut down before returning.
    """
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_experimental_option("detach", True)
        driver = webdriver.Chrome(options=chrome_options)
        driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

        ReviewActions().open(driver, product)
        ReviewActions().click_only_with_photos(driver)
        # The pause lets the page reload with photo-only reviews before
        # parsing starts.
        t.sleep(PHOTO_FILTER_DELAY_SECONDS)

        ReviewParser(driver=driver, product=product).get_reviews()
        PhDownloader().download_reviews(product)
        NudeNetDetector(product).evaluate()
        PhDeletor(product).delete_by_prob()
        TgIntegration().send(product)
    except Exception:
        # Deliberate best-effort boundary: report the failure and move on.
        logger.exception('CANT DOWNLOAD %s', product.article)
    finally:
        # Without an explicit quit every product would leave a browser
        # (and its chromedriver process) running.
        if driver is not None:
            driver.quit()
    print(product)
    print('---------')


def run(products, processes: int = DEFAULT_POOL_SIZE):
    """Apply ``per_product_logic`` to every product using a process pool.

    Args:
        products: Iterable of products to process.
        processes: Number of worker processes; defaults to 4.
    """
    with mp.Pool(processes) as pool:
        pool.map(per_product_logic, products)


def _clear_photo_dir(photo_dir: Path) -> None:
    """Delete everything inside *photo_dir*, keeping the directory itself."""
    if not photo_dir.is_dir():
        return
    # Snapshot the top-level entries first so that deleting directories
    # cannot disturb an in-progress directory walk.
    for entry in list(photo_dir.iterdir()):
        if entry.is_dir() and not entry.is_symlink():
            rmtree(entry)
        else:
            entry.unlink()


def main():
    """Wipe the photo directory, load the products and process them all."""
    _clear_photo_dir(PHOTO_DIR)

    start_time = time.time()
    logger.info('starting products parsing')
    products = parse_products_queries()
    logger.info('finished product parsing, starting each product parse')
    run(products)
    print(f'--- {time.time() - start_time} seconds ---')


# The guard is required with multiprocessing: under the "spawn" start method
# each worker re-imports this module, and an unguarded main() call would make
# every worker try to start the whole pipeline again.
if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment