# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# 9 months ago
# 1.9 kB
# 5
# Indexable
# Never
import glob
import os
import re
from datetime import datetime

import pandas as pd
from lxml import html
from scrapy.http import HtmlResponse

# Build a table of product listings from saved marketplace HTML pages.
#
# Every ``*.html`` file below the current working directory is parsed, and one
# row per product card is collected into ``df``.

# Column accumulators: exactly one entry is appended to each list per product
# card, so the lists stay the same length and can be combined into a DataFrame.
products = []
vendors = []
scrape_dates = []
cat_1 = []
origin_country = []
shipping_countries = []


# Process the saved pages in order of file modification time (oldest first).
for file in sorted(glob.glob('**/*.html', recursive=True), key=os.path.getmtime):
    print(file)
    print('-----')

    # NOTE(review): pages are assumed to be stored as
    #   <scrape_folder>/<CATEGORY>/<page>/<name>.html
    # (e.g. Cannazon_2021-03-20/CONCENTRATES/2/file.html), so both fields are
    # read relative to the end of the path. Splitting on os.sep keeps this
    # working regardless of platform or how deep the working directory is.
    path_parts = os.path.abspath(file).split(os.sep)
    folder_category = path_parts[-3]
    print(folder_category)

    with open(file, mode='r', encoding="utf8") as f:
        page = f.read()

    response = HtmlResponse(url="", body=page, encoding='utf-8')

    # The scrape folder name (e.g. "Cannazon_2021-03-20") identifies the scrape.
    scrape_date = path_parts[-4]
    print(scrape_date)

    # One node per product card; the final div is div[@class="single-products"].
    for listing in response.xpath(
            '//div/div/div[@class="col-xs-6 col-sm-3 col-md-3 clearfix"]/div/div'):

        # listing title
        products.append(listing.xpath('.//div/a[2]/p/text()').get())

        # vendor
        vendors.append(listing.xpath('.//div/a[3]/p/text()').get())

        # ships from (first text node of the shipping paragraph)
        origin_country.append(
            listing.xpath('.//p[@class="product-box-shipping"]/text()').get())

        # ships to (second text node); .get() returns None when the node is
        # missing, so only strip when there is something to strip. Appending
        # None keeps this column aligned with the others.
        ships_to = listing.xpath(
            './/p[@class="product-box-shipping"]/text()[2]').get()
        shipping_countries.append(
            ships_to.strip() if ships_to is not None else None)

        # category and scrape folder are the same for every card on the page
        cat_1.append(folder_category)
        scrape_dates.append(scrape_date)


# Assemble the collected columns into the final table.
df = pd.DataFrame({
    'scrape_dates': scrape_dates,
    'product': products,
    'vendor': vendors,
    'category1': cat_1,
    'origin_country': origin_country,
    'countries_to': shipping_countries,
})