Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
2.1 kB
1
Indexable
Never
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
import re
import csv
import time
from datetime import datetime, timedelta

class Scraper:
    """Selenium-driven scraper for the merolagani.com floorsheet page.

    A Chrome window is opened on the floorsheet page when the object is
    created. ``scrape`` walks through every result page and ``extract``
    appends the table rows of each page to a CSV-formatted text file named
    after the "As of" date shown on the page (``YYYY-MM-DD.txt``).

    The object can be used as a context manager so the browser is always
    shut down::

        with Scraper() as scraper:
            scraper.scrape()
    """

    FLOORSHEET_URL = "https://merolagani.com/Floorsheet.aspx"

    # Seconds to wait after a click for the page to reload. Kept as a class
    # attribute so callers (and tests) can tune it without editing the code.
    page_load_wait = 5

    def __init__(self):
        self.reset()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def reset(self):
        """Start a fresh Chrome session and load the floorsheet page.

        Any browser left over from a previous call is quit first so that
        repeated resets do not leak Chrome processes.
        """
        previous = getattr(self, "driver", None)
        if previous is not None:
            previous.quit()

        chrome_options = Options()
        # Uncomment to run without a visible browser window:
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--no-sandbox')
        # chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.maximize_window()
        self.driver.get(self.FLOORSHEET_URL)

    def close(self):
        """Quit the browser if one is running; safe to call more than once."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    @staticmethod
    def _parse_total_pages(page_source):
        """Return the page count announced by the "Total pages: N" marker.

        Raises:
            ValueError: if the marker is not present in ``page_source``.
        """
        match = re.search(r'Total pages:\s*(\d+)', page_source)
        if match is None:
            raise ValueError("could not find 'Total pages:' in the page source")
        return int(match.group(1))

    def scrape(self, date=None):
        """Extract every result page currently reachable in the browser.

        Args:
            date: optional text typed into the date filter before searching
                (the script in this file passes ``MM/DD/YYYY``). When
                ``None`` the page is scraped as currently displayed.

        Raises:
            ValueError: if the page does not report its total page count.
        """
        if date is not None:
            # find_element("xpath", ...) is used instead of the
            # find_element_by_xpath shortcut, which newer Selenium releases
            # no longer provide.
            date_input = self.driver.find_element(
                "xpath",
                '//input[@id="ctl00_ContentPlaceHolder1_txtFloorsheetDateFilter"]',
            )
            # Clear first: send_keys appends to whatever is already typed,
            # which would corrupt the filter on a second dated search.
            date_input.clear()
            date_input.send_keys(date)
            self.driver.find_element(
                "xpath",
                '//a[@id="ctl00_ContentPlaceHolder1_lbtnSearchFloorsheet"]',
            ).click()
            time.sleep(self.page_load_wait)

        total_pages = self._parse_total_pages(self.driver.page_source)
        for page_number in range(1, total_pages + 1):
            self.extract(self.driver.page_source)
            # Only advance while another page exists; the final page must
            # still be extracted, it just has no successor to click to.
            if page_number < total_pages:
                self.driver.find_element(
                    "xpath",
                    '//div[@class="pagging"]/div/a[last()-1]',
                ).click()
                time.sleep(self.page_load_wait)

    def extract(self, page):
        """Append the table rows found in ``page`` to the per-date file.

        The first ``<tr>`` of the document and the first ``<td>`` of every
        row are skipped, commas are stripped from cell text, and rows
        without any remaining cells are dropped so no blank lines are
        written.

        Args:
            page: HTML source of one floorsheet result page.

        Raises:
            ValueError: if the "As of YYYY/MM/DD" marker is missing.
        """
        match = re.search(r'As of (\d{4}\/\d{2}\/\d{2})', page)
        if match is None:
            raise ValueError("could not find the 'As of <date>' marker in the page")
        date = match.group(1)

        soup = BeautifulSoup(page, 'lxml')
        rows = []
        for row in soup.find_all('tr')[1:]:
            cells = [
                td.text.strip().replace(',', '')
                for td in row.find_all('td')[1:]
            ]
            if cells:
                rows.append(cells)

        # Append mode: each result page of the same date adds to one file.
        out_name = date.replace('/', '-') + '.txt'
        with open(out_name, 'a', newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(rows)


if __name__ == "__main__":
    # Script entry point: scrape the floorsheet as currently published,
    # then the floorsheet of each of the previous `days_back` days.
    days_back = 1

    x = Scraper()
    try:
        delta = timedelta(days=1)
        today = datetime.now()
        x.scrape()
        for i in range(days_back):
            # Step one further day back on every iteration; without the
            # multiplier every pass would request the same (yesterday's) date.
            date = today - delta * (i + 1)
            x.scrape(date.strftime("%m/%d/%Y"))
    finally:
        # Always shut the browser down, even if scraping failed midway.
        x.driver.quit()