# Paste-site header (not part of the script): Untitled / unknown / plain_text / 2 years ago / 1.9 kB / 8 / Indexable
import glob
import os
import re
from datetime import datetime

import pandas as pd
from lxml import html
from scrapy.http import HtmlResponse
# Load one hard-coded sample page and wrap it in an HtmlResponse so that
# XPath queries can be tried against it interactively.
with open('Cannazon_2021-03-20/CONCENTRATES/2/file.html', encoding="utf8") as f:
    page = f.read()

response = HtmlResponse(url="", body=page, encoding='utf-8')
# Parallel column accumulators: the scraping loop appends exactly one value
# to each list per product card, so equal indices describe the same listing.
scrape_dates, products, vendors = [], [], []
cat_1, origin_country, shipping_countries = [], [], []
# Walk every saved .html page below the working directory, ordered by each
# file's last-modification time (oldest first), and collect one row of
# values per product card into the parallel column lists.
for file in sorted(glob.glob('**/*.html', recursive=True), key=os.path.getmtime):
    print(file)
    print('-----')

    # NOTE(review): assumes the on-disk layout
    #     .../<scrape_date>/<category>/<page>/<file>.html
    # Both fields are taken relative to the END of the absolute path, so the
    # result does not depend on how deep the working directory is nested or
    # on the platform's path separator (os.path.abspath normalises it).
    abs_path = os.path.abspath(file)
    path_parts = abs_path.split(os.sep)
    folder_category = path_parts[-3]
    print(folder_category)

    with open(file, mode='r', encoding="utf8") as f:
        page = f.read()
    response = HtmlResponse(url="", body=page, encoding='utf-8')

    scrape_date = path_parts[-4]
    print(scrape_date)

    for i in response.xpath('//div/div/div[@class="col-xs-6 col-sm-3 col-md-3 clearfix"]/div/div'):
        # The final div matched above is div[@class="single-products"].
        # listing title
        products.append(i.xpath('.//div/a[2]/p/text()').get())
        # vendor
        vendors.append(i.xpath('.//div/a[3]/p/text()').get())
        # ships from (first text node of the shipping paragraph)
        origin_country.append(i.xpath('.//p[@class="product-box-shipping"]/text()').get())
        # ships to (second text node); .get() returns None when the node is
        # missing, so only strip real strings and still append a placeholder
        # to keep every column list the same length.
        ships_to = i.xpath('.//p[@class="product-box-shipping"]/text()[2]').get()
        shipping_countries.append(ships_to.strip() if ships_to is not None else None)
        # category and scrape date are repeated for every card on the page
        cat_1.append(folder_category)
        scrape_dates.append(scrape_date)
# Assemble the parallel column lists into a single table with one row per
# scraped listing. All lists must have the same length for this to succeed.
df = pd.DataFrame({
    'scrape_dates': scrape_dates,
    'product': products,
    'vendor': vendors,
    'category1': cat_1,
    'origin_country': origin_country,
    'countries_to': shipping_countries,
})