Untitled
unknown
plain_text
3 years ago
3.4 kB
1
Indexable
"""Scrape product names and prices from the perekrestok.ru catalogue.

Flow when run as a script:

1. open the catalogue page in Chrome,
2. collect every sub-category link found on it (``get_links``),
3. visit each sub-category page and collect product/price rows
   (``get_products``),
4. write the rows to ``parse_result.csv`` (``write_csv``).
"""
import csv
import random
import time
from urllib.parse import urljoin

# NOTE: selenium and bs4 are imported inside the helpers that need them so
# that importing this module (for example to reuse ``write_csv``) neither
# requires the browser stack nor starts Chrome.

PARSE_URL = 'https://www.perekrestok.ru'
CATALOG_URL = PARSE_URL + '/cat'
CSV_FIELDS = ('category', 'subcategory_1', 'subcategory_2', 'product', 'price')

# Shared state. get_links() and get_products() take no arguments and work on
# these module-level names; main() binds ``driver`` and ``get_category``.
driver = None
subcategories_dict = {}  # sub-category name -> absolute URL
get_category = {}        # sub-category name -> parent category name


def write_csv(info, path='parse_result.csv'):
    """Write scraped rows to *path* as a ':'-delimited UTF-8 CSV file.

    Args:
        info: iterable of dicts, each holding every key in ``CSV_FIELDS``.
        path: output file name; an existing file is overwritten.

    Raises:
        KeyError: if a row lacks one of the ``CSV_FIELDS`` keys.
    """
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=':')
        writer.writerow(CSV_FIELDS)
        writer.writerows([row[field] for field in CSV_FIELDS] for row in info)
    # Announce the file only once it has really been written and closed.
    print('Created file "{}"'.format(path))


def _make_soup(html):
    """Parse *html* into a BeautifulSoup tree using the lxml parser."""
    from bs4 import BeautifulSoup

    return BeautifulSoup(html, 'lxml')


def get_links():
    """Collect sub-category links from the page currently loaded in ``driver``.

    Fills the module-level ``subcategories_dict`` (name -> absolute URL) as a
    side effect. When the same sub-category name occurs more than once, the
    first occurrence wins in both mappings.

    Returns:
        dict mapping each sub-category name to its parent category name.
    """
    category_dict = {}
    soup = _make_soup(driver.page_source)
    for header in soup.find_all('div', {'class': 'category-filter-item__header'}):
        links_container = header.find_next_sibling()
        if links_container is None:
            # A header with nothing after it has no sub-categories to collect.
            continue
        category_name = header.text
        # href=True skips anchors without a target instead of failing on None.
        for link in links_container.find_all('a', href=True):
            # urljoin copes with both site-relative and absolute hrefs.
            subcategories_dict.setdefault(link.text, urljoin(PARSE_URL, link['href']))
            category_dict.setdefault(link.text, category_name)
    return category_dict


def _product_price_pairs(group_list):
    """Yield ``(product name, raw price text)`` pairs from one group container."""
    # Direct child *tags* only: iterating the tag itself would also yield
    # text nodes, which have no find_all().
    for card in group_list.find_all(True, recursive=False):
        names = card.find_all('span', {'class': 'product-card__link-text'})
        prices = card.find_all('div', {'class': 'price-new'})
        # Pair names and prices inside the same child element. A child that
        # lacks a price is skipped instead of shifting every later row, which
        # is what a single running index across the whole crawl would do.
        for name, price in zip(names, prices):
            yield name.text, price.text


def get_products():
    """Visit every URL in ``subcategories_dict`` and collect product rows.

    Reads the module-level ``driver``, ``subcategories_dict`` and
    ``get_category``. The driver is always shut down before returning, even
    when scraping fails part-way, so no browser process is left behind.

    Returns:
        list of dicts with the keys in ``CSV_FIELDS``.
    """
    info = []
    try:
        for subcategory_name, subcategory_url in subcategories_dict.items():
            driver.get(subcategory_url)
            soup = _make_soup(driver.page_source)
            for group_title in soup.find_all('h2', {'class': 'catalog-content-group__title'}):
                group_list = group_title.find_next_sibling(
                    'div', {'class': 'catalog-content-group__list'})
                if group_list is None:
                    # Title without a product list: nothing to record.
                    continue
                for product, raw_price in _product_price_pairs(group_list):
                    info.append({
                        'category': get_category[subcategory_name],
                        'subcategory_1': subcategory_name,
                        'subcategory_2': group_title.text,
                        'product': product,
                        'price': raw_price.replace('\xa0₽', '') + ' руб',
                    })
            # Progress dump of everything collected so far.
            print(info)
            # Random pause between page loads.
            time.sleep(random.randrange(10, 15))
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()
    return info


def _build_driver():
    """Create the Chrome driver with the options this scraper uses."""
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    # To run without a visible window, additionally pass '--headless'
    # (and '--no-sandbox' on Linux).
    for argument in (
        '--lang=ru',
        '--disable-blink-features=AutomationControlled',
        '--incognito',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-dev-shm-usage',
    ):
        options.add_argument(argument)
    return webdriver.Chrome(options=options)


def main():
    """Run the whole scrape and write the result to ``parse_result.csv``."""
    global driver, get_category
    driver = _build_driver()
    try:
        driver.get(CATALOG_URL)
        time.sleep(random.randrange(3, 5))
        get_category = get_links()
    except BaseException:
        # get_products() shuts the driver down itself; cover failures that
        # happen before it is reached, then let the error propagate.
        driver.quit()
        raise
    full_info = get_products()
    write_csv(full_info)


if __name__ == '__main__':
    main()
Editor is loading...