Untitled

 avatar
unknown
plain_text
3 years ago
3.4 kB
1
Indexable
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import csv


def write_csv(info, filename='parse_result.csv'):
    """Write the scraped product rows to a colon-delimited CSV file.

    Args:
        info: Iterable of dicts, each providing the keys ``category``,
            ``subcategory_1``, ``subcategory_2``, ``product`` and ``price``.
        filename: Destination path. Defaults to ``parse_result.csv`` in the
            current working directory. An existing file is overwritten.

    Raises:
        KeyError: If a row lacks one of the expected keys.
        OSError: If the file cannot be opened for writing.
    """
    columns = ('category', 'subcategory_1', 'subcategory_2', 'product', 'price')
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=':')
        writer.writerow(columns)
        writer.writerows(
            tuple(result[column] for column in columns) for result in info
        )
    # Report success only after the file has actually been written.
    print(f'Created file "{filename}"')


def get_links():
    """Collect subcategory links from the page currently loaded in ``driver``.

    Parses ``driver.page_source`` and, for every category header
    (``div.category-filter-item__header``), records each link found in the
    element that immediately follows the header.

    Side effects:
        Fills the module-level ``subcategories_dict`` with
        ``{subcategory name: absolute URL}``. Existing entries are kept, so
        the first link seen for a given name wins.

    Returns:
        dict: ``{subcategory name: parent category name}``.
    """
    from urllib.parse import urljoin

    parse_url = 'https://www.perekrestok.ru'
    category_dict = {}
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for category in soup.find_all('div', {'class': 'category-filter-item__header'}):
        category_name = category.text
        subcategory_block = category.find_next_sibling()
        if subcategory_block is None:
            # A header with nothing after it has no subcategories to record.
            continue
        # href=True skips anchors without a link target, which would
        # otherwise produce an unusable URL.
        for subcategory in subcategory_block.find_all('a', href=True):
            # urljoin copes with both site-relative and absolute hrefs.
            subcategory_url = urljoin(parse_url, subcategory['href'])
            subcategories_dict.setdefault(subcategory.text, subcategory_url)
            category_dict.setdefault(subcategory.text, category_name)
    return category_dict


def get_products():
    """Visit every subcategory page and collect product names with prices.

    Reads module-level state: ``subcategories_dict`` (subcategory name ->
    URL), ``get_category`` (subcategory name -> category name) and the
    Selenium ``driver``. The browser session is always shut down before
    returning, even when scraping fails part-way.

    Returns:
        list[dict]: One dict per product with the keys ``category``,
        ``subcategory_1``, ``subcategory_2``, ``product`` and ``price``
        (price text with the rouble sign replaced by ``' руб'``).
    """
    info = []
    try:
        for subcategory_key, subcategory_url in subcategories_dict.items():
            driver.get(subcategory_url)
            soup_sub = BeautifulSoup(driver.page_source, 'lxml')
            for sub in soup_sub.find_all('h2', {'class': 'catalog-content-group__title'}):
                product_group = sub.find_next_sibling(
                    'div', {'class': 'catalog-content-group__list'})
                if product_group is None:
                    # Group title without a product list: nothing to collect.
                    continue
                # Only direct child tags are visited; text nodes between
                # them have no find_all() and must be skipped.
                for products in product_group.find_all(True, recursive=False):
                    names = products.find_all('span', {'class': 'product-card__link-text'})
                    prices = products.find_all('div', {'class': 'price-new'})
                    # Pair names with prices inside the same child element,
                    # so a card lacking a price cannot shift later rows.
                    for product, price in zip(names, prices):
                        replace_price = price.text.replace('\xa0₽', '')
                        info.append({'category': get_category[subcategory_key],
                                     'subcategory_1': subcategory_key,
                                     'subcategory_2': sub.text,
                                     'product': product.text,
                                     'price': replace_price + ' руб'})

            # Progress output: everything collected so far.
            print(info)
            # Random 10-14 s pause between page loads (presumably to avoid
            # hammering the site -- confirm the intended rate).
            time.sleep(random.randrange(10, 15))
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()
    return info






# --- Browser set-up -------------------------------------------------------
options = Options()
# Optional flags, left disabled; enable them to run without a visible window.
# options.add_argument('--headless')
# options.add_argument('--no-sandbox') # for linux only
options.add_argument('--lang=ru')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--incognito')
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
driver.get('https://www.perekrestok.ru/cat')
# Random 3-4 s pause after loading the catalogue page, before its source
# is read by get_links().
time.sleep(random.randrange(3, 5))

# Subcategory name -> URL; filled by get_links() and read by get_products().
subcategories_dict = {}


# --- Scrape and export ----------------------------------------------------
# Subcategory name -> category name; read by get_products().
get_category = get_links()
full_info = get_products()
write_csv(full_info)
Editor is loading...