Untitled
unknown
plain_text
2 years ago
1.9 kB
7
Indexable
# Build a pandas DataFrame of product listings scraped from saved HTML pages.
#
# Every ``*.html`` file below the current working directory is parsed with
# Scrapy selectors; one row is recorded per product box found on a page.
#
# NOTE(review): the path handling assumes the on-disk layout
# ``<scrape_date>/<category>/<page>/<file>.html`` (as in the sample path
# ``Cannazon_2021-03-20/CONCENTRATES/2/file.html``) -- confirm against the
# real data directory.
import glob
import os
import re
from datetime import datetime

import pandas as pd
from lxml import html
from scrapy.http import HtmlResponse

# XPath of one product box; the final div is div[@class="single-products"].
LISTING_XPATH = '//div/div/div[@class="col-xs-6 col-sm-3 col-md-3 clearfix"]/div/div'

# NOTE(review): this single-file read is superseded by the loop below (which
# rebinds `page` and `response` for every file) and fails if the sample file
# is absent. It is kept so that `page` / `response` still exist at module
# level even when the glob matches nothing -- consider removing it.
with open('Cannazon_2021-03-20/CONCENTRATES/2/file.html', mode='r', encoding="utf8") as f:
    page = f.read()
response = HtmlResponse(url="", body=page, encoding='utf-8')

# Column accumulators: each list receives exactly one entry per listing.
products = []
vendors = []
scrape_dates = []
cat_1 = []
origin_country = []
shipping_countries = []

# Process files in order of their last-modification time (oldest first).
for html_path in sorted(glob.glob('**/*.html', recursive=True), key=os.path.getmtime):
    print(html_path)
    print('-----')

    # Index path components from the END so the script does not depend on how
    # deep the project directory sits, nor on Windows-style '\\' separators.
    abs_path = os.path.abspath(html_path)
    path_parts = abs_path.split(os.sep)
    folder_category = path_parts[-3]
    print(folder_category)

    with open(html_path, mode='r', encoding="utf8") as f:
        page = f.read()
    response = HtmlResponse(url="", body=page, encoding='utf-8')

    scrape_date = path_parts[-4]
    print(scrape_date)

    for listing in response.xpath(LISTING_XPATH):
        # listing title
        products.append(listing.xpath('.//div/a[2]/p/text()').get())
        # vendor
        vendors.append(listing.xpath('.//div/a[3]/p/text()').get())
        # ships from
        origin_country.append(
            listing.xpath('.//p[@class="product-box-shipping"]/text()').get()
        )
        # ships to: .get() returns None when the second text node is missing,
        # so only strip real strings; a missing value is stored as None, the
        # same way the other columns record a missing value.
        ships_to = listing.xpath(
            './/p[@class="product-box-shipping"]/text()[2]'
        ).get()
        shipping_countries.append(ships_to.strip() if ships_to is not None else None)
        # category and scrape date are per-file values repeated for each row
        cat_1.append(folder_category)
        scrape_dates.append(scrape_date)

df = pd.DataFrame({
    'scrape_dates': scrape_dates,
    'product': products,
    'vendor': vendors,
    'category1': cat_1,
    'origin_country': origin_country,
    'countries_to': shipping_countries,
})
Editor is loading...