# Paste-site metadata left over from extraction (not part of the script):
# Untitled / unknown / plain_text / 3 years ago / 3.4 kB / 4 / Indexable
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import csv
def write_csv(info):
    """Dump the scraped rows to ``parse_result.csv`` in the working directory.

    ``info`` is an iterable of mappings; every mapping must provide the keys
    'category', 'subcategory_1', 'subcategory_2', 'product' and 'price'
    (a missing key raises ``KeyError``). The file is overwritten, encoded as
    UTF-8 and uses ':' as the field delimiter. A header row with the column
    names is written before the data rows.
    """
    columns = ('category', 'subcategory_1', 'subcategory_2', 'product', 'price')
    # The announcement is emitted before the file is actually opened.
    print('Created file "parse_result.csv"')
    with open('parse_result.csv', 'w', newline='', encoding="utf-8") as out_file:
        csv_out = csv.writer(out_file, delimiter=':')
        csv_out.writerow(columns)
        # One output row per mapping, fields taken in header order.
        csv_out.writerows(tuple(row[name] for name in columns) for row in info)
def get_links():
    """Collect subcategory links from the page currently loaded in ``driver``.

    Parses ``driver.page_source`` and, for every category header
    (``div.category-filter-item__header``), reads the ``<a href>`` links found
    in the element that immediately follows that header.

    Side effects:
        Fills the module-level ``subcategories_dict`` with
        ``{subcategory text: absolute URL}``.

    Returns:
        dict mapping each subcategory text to the text of the category header
        it was found under.

    In both mappings the first occurrence of a subcategory text wins
    (``setdefault``), so a repeated name does not overwrite an earlier entry.
    """
    # Local import: the block needs urljoin but the file does not import it.
    from urllib.parse import urljoin

    parse_url = 'https://www.perekrestok.ru'
    category_dict = {}
    soup = BeautifulSoup(driver.page_source, 'lxml')
    for category in soup.find_all('div', {'class': 'category-filter-item__header'}):
        category_name = category.text
        links_container = category.find_next_sibling()
        if links_container is None:
            # Header with no following element: there is nothing to collect.
            continue
        # href=True skips anchors without a target instead of failing on None.
        for subcategory in links_container.find_all('a', href=True):
            # urljoin handles root-relative, relative and already-absolute
            # hrefs; plain concatenation is only right for the first kind.
            subcategory_url = urljoin(parse_url, subcategory['href'])
            subcategories_dict.setdefault(subcategory.text, subcategory_url)
            category_dict.setdefault(subcategory.text, category_name)
    return category_dict
def get_products():
    """Visit every subcategory page and collect product names with prices.

    Reads module-level state: ``driver`` (the Selenium WebDriver),
    ``subcategories_dict`` (``{subcategory text: URL}``) and ``get_category``
    (``{subcategory text: category text}``).

    For each page, every ``h2.catalog-content-group__title`` heading is paired
    with the ``div.catalog-content-group__list`` that follows it. Inside each
    direct child of that list, product names
    (``span.product-card__link-text``) are matched with prices
    (``div.price-new``) in document order.

    Returns:
        list of dicts with the keys 'category', 'subcategory_1',
        'subcategory_2', 'product' and 'price'. The price has the
        non-breaking-space + rouble-sign suffix replaced by ' руб'.

    The browser session is shut down before returning, also when an error
    interrupts the crawl.
    """
    info = []
    try:
        for subcategory_key, subcategory_url in subcategories_dict.items():
            driver.get(subcategory_url)
            soup_sub = BeautifulSoup(driver.page_source, 'lxml')
            for sub in soup_sub.find_all('h2', {'class': 'catalog-content-group__title'}):
                group_list = sub.find_next_sibling(
                    'div', {'class': 'catalog-content-group__list'})
                if group_list is None:
                    # Heading without a product list after it.
                    continue
                # Only direct child *tags*: iterating the Tag itself would
                # also yield text nodes, which have no find_all().
                for item in group_list.find_all(True, recursive=False):
                    names = item.find_all('span', {'class': 'product-card__link-text'})
                    prices = item.find_all('div', {'class': 'price-new'})
                    # Pair names and prices per child element so that one
                    # element lacking a price cannot shift every later row.
                    for name, price in zip(names, prices):
                        price_text = price.text.replace('\xa0₽', '')
                        info.append({
                            'category': get_category[subcategory_key],
                            'subcategory_1': subcategory_key,
                            'subcategory_2': sub.text,
                            'product': name.text,
                            'price': price_text + ' руб',
                        })
            # Progress dump of everything collected so far.
            print(info)
            # Randomised pause between page loads.
            time.sleep(random.randrange(10, 15))
    finally:
        # quit() also stops the driver process; close() only closes the window.
        driver.quit()
    return info
# --- Script body: configure Chrome, load the catalogue, scrape, export. ---
options = Options()
# Optional switches, left disabled:
# options.add_argument('--headless')
# options.add_argument('--no-sandbox') # for linux only
options.add_argument('--lang=ru')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--incognito')
options.add_argument('--disable-gpu')
options.add_argument('--disable-extensions')
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)

# Filled by get_links() and read by get_products().
subcategories_dict = {}
try:
    driver.get('https://www.perekrestok.ru/cat')
    # Randomised pause before reading the page source
    # (presumably to let the page finish rendering).
    time.sleep(random.randrange(3, 5))
    get_category = get_links()
except Exception:
    # get_products() is what normally shuts the browser down; if we never get
    # that far, release the browser here before propagating the error.
    driver.quit()
    raise

full_info = get_products()
write_csv(full_info)