Untitled

 avatar
unknown
plain_text
a year ago
7.8 kB
6
Indexable
import sys
sys.path.insert(0, '/Users/yakovlev/tg_proj/WB/')


from objects.objects import Product, Review, ConfigReviewsFilter, ConfigProductsFilter
from parsers.reviews_parser import ReviewParser, ReviewActions
from parsers.products_parser import ProductFilter
from photo_handlers.PhDownloader import PhDownloader
from photo_handlers.PhDeletor import PhDeletor
from photo_handlers.ml.nudeNetDetector import NudeNetDetector
from consumers.Telegram import TgIntegration
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
import multiprocessing as mp
from typing import List
import time as t
import logging
import pickle
from pathlib import Path
from shutil import rmtree
logging.basicConfig(level=logging.CRITICAL)
from tqdm.contrib.concurrent import process_map

# products = [
#     Product(article=166462256),
#     Product(article=162259738),
#     Product(article=146295378),
#     Product(article=164914471),
#     Product(article=155096368),
#     Product(article=120466540),
#     Product(article=155781684),
#     Product(article=147088557),
    
#     # Product(article=send nude),
#     Product(article=153810479),
#     Product(article=125611383),
#     Product(article=143758866),
#     Product(article=145664801),
#     Product(article=155888162),
#     Product(article=154638410),
#     Product(article=143758388),
#     Product(article=155785013),
#     # Product(article=hello kit),
#     Product(article=87248210),
#     Product(article=143142431),
#     Product(article=151985798),
#     Product(article=148048918),
#     Product(article=163265985),
#     # Product(article=SHINRA TE),
#     Product(article=151138774),
#     Product(article=170589332),

#     Product(article=150857627),
#     Product(article=153810479),
#     Product(article=166766028)
#     ]


def parse_products_queries():
    """Load the cached product list from the ``articles`` pickle file.

    The live search step (driving ``ProductFilter`` through a Chrome session)
    is disabled and preserved below as commented-out code, so this function
    only reads the list stored in ``articles`` (a path relative to the current
    working directory).

    Deduplication through ``drop_duplicates`` is disabled as well, which means
    both printed counts describe the same, unfiltered list.

    Returns:
        The object unpickled from ``articles`` -- presumably a list of
        ``Product`` instances (TODO confirm against the code that wrote it).

    Raises:
        FileNotFoundError: if ``articles`` does not exist.
    """
    # --- disabled: live product search -------------------------------------
    # chrome_options = Options()
    # chrome_options.add_experimental_option("detach", True)
    # chrome_options.add_argument("--headless")

    # driver = webdriver.Chrome(
    #                             options=chrome_options
    #                             )

    # pf = ProductFilter(driver=driver, config=ConfigProductsFilter(min_num_reviews=50))


    # products = pf.get_products(search_query='прозрачные женские трусы', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='прозрачное белье женское', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='юбка женская аниме', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='чулки женские аниме', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='чулки женские', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='кружевные чулки', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='топик женский аниме', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='трусы женский аниме', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='кельвин клейн комплект женский', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='пирсинг соски', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='прозрачный топик', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='колготки чулки в сетку', num_pages=11, sort_condition='popular')
    # products += pf.get_products(search_query='белье kitty klaw', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='белье send nudes', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='женское нижнее белье с доступом', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='с доступом', num_pages=8, sort_condition='popular')
    # products += pf.get_products(search_query='сексуальное', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='боди кружевное', num_pages=14, sort_condition='popular')
    # products += pf.get_products(search_query='кружевной комплект женский', num_pages=20, sort_condition='popular')
    # products += pf.get_products(search_query='колготки женские сексуальные', num_pages=25, sort_condition='popular')
    # products += pf.get_products(search_query='портупея', num_pages=25, sort_condition='popular')
    # products += pf.get_products(search_query='стринги', num_pages=15, sort_condition='popular')
    # ------------------------------------------------------------------------

    def load_products_from_file(filepath):
        """Return the object pickled in *filepath*."""
        # NOTE(review): pickle.load executes arbitrary code embedded in the
        # file; only ever point this at a file this project wrote itself.
        with open(filepath, 'rb') as fp:
            return pickle.load(fp)

    def drop_duplicates(products: List[Product]):
        """Return *products* without repeats, keeping first occurrences.

        A product is dropped when its ``article`` was already seen, or when
        an earlier kept product has the same ``(name, num_reviews)`` pair.
        Uses sets for the membership tests, so those three attributes must
        be hashable.
        """
        seen_articles = set()
        seen_listings = set()
        unique = []
        for product in products:
            if product.article in seen_articles:
                continue
            seen_articles.add(product.article)

            listing_key = (product.name, product.num_reviews)
            if listing_key in seen_listings:
                continue
            seen_listings.add(listing_key)
            unique.append(product)
        return unique

    def save_articles(articles: List[Product]) -> None:
        """Pickle *articles* into the ``articles`` file, overwriting it."""
        with open('articles', 'wb') as fp:
            pickle.dump(articles, fp)

    products = load_products_from_file('articles')

    print('parsed products with duplicates:  ', len(products))
    # Deduplication is switched off; re-enable the next line to apply it.
    # products = drop_duplicates(products)
    if len(products) > 100:
        # Only persist lists with more than 100 entries.
        save_articles(products)
    print('products w/o duplicates', len(products))

    return products



def per_product_logic(product: Product):
    """Run the review pipeline for a single product in its own Chrome session.

    Steps, in order: open the product via ``ReviewActions``, switch to the
    photo-only filter, parse the reviews with ``ReviewParser``, then pass the
    product to ``PhDownloader``, ``NudeNetDetector``, ``PhDeletor`` and
    ``TgIntegration``.

    Exceptions raised by any step are caught and reported (warning log plus a
    printout of the product) instead of being propagated, so the function
    always returns ``None``. The browser is closed when the function finishes,
    whether or not a step failed.

    Args:
        product: the product to process; its ``article`` is used in the
            failure log message.
    """
    driver = None
    try:

        chrome_options = Options()
        chrome_options.add_experimental_option("detach", True)
        # chrome_options.add_argument("--headless")

        driver = webdriver.Chrome(
                                  options=chrome_options
                                  )

        driver.implicitly_wait(15)

        ReviewActions().open(driver, product)
        # ReviewActions().click_this_product_variant(driver)
        ReviewActions().click_only_with_photos(driver)

        # The pause is needed so that only the reviews with photos get loaded.
        t.sleep(2)
        ReviewParser(driver=driver, product=product).get_reviews()
        PhDownloader().download_reviews(product)
        NudeNetDetector(product).evaluate()
        PhDeletor(product).delete_by_prob()
        TgIntegration().send(product)
    except Exception as e:
        logging.warning(f'CANT DOWNLOAD {product.article}')
        logging.warning(f"ERROR: {e}")
        print(product)
        print('---------')
    finally:
        # Without an explicit quit every processed product leaves a Chrome
        # instance behind (the "detach" option keeps it alive on its own).
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Closing is best effort; a failure here must not escape the
                # worker and abort the surrounding pool map.
                logging.warning(f"ERROR: {e}")


def run(products, num_workers=4):
    """Process all products in a pool of worker processes.

    Each element of *products* is passed to ``per_product_logic`` in one of
    the workers; the call blocks until every element has been handled.

    Args:
        products: iterable of products to process.
        num_workers: number of worker processes in the pool. Defaults to 4,
            the value that used to be hard-coded.
    """
    with mp.Pool(num_workers) as p:
        p.map(per_product_logic, products)

    # process_map(per_product_logic, products, max_workers=4)


def main():
    """Empty the photo directory, load the products and process them all.

    Prints the elapsed wall-clock time of the load and processing steps once
    they finish.
    """
    photo_dir = Path("/Users/yakovlev/tg_proj/WB/photo")

    # Iterate over a snapshot of the top-level entries only: rmtree already
    # removes whole subtrees, and deleting directories while a recursive glob
    # over the same tree is still being consumed depends on traversal details.
    # A missing directory simply means there is nothing to clean.
    if photo_dir.is_dir():
        for path in list(photo_dir.iterdir()):
            if path.is_file():
                path.unlink()
            elif path.is_dir():
                rmtree(path)

    start_time = time.time()
    logging.info('starting products parsing')
    products = parse_products_queries()

    logging.info('finished product parsing, starting each product parse')
    run(products)

    print(f'--- {time.time() - start_time} seconds ---')
    
# Guard the entry point: with multiprocessing's "spawn" start method (the
# default on macOS and Windows) every worker process re-imports this module,
# and an unguarded call would re-run main() inside each worker.
if __name__ == "__main__":
    main()
# for product in products:
#     main(product)
Editor is loading...
Leave a Comment