Untitled
unknown
plain_text
3 years ago
2.1 kB
4
Indexable
"""Scrape the Merolagani floorsheet into per-date CSV text files via Selenium."""

from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_binary  # noqa: F401 -- import side effect puts chromedriver on PATH
from bs4 import BeautifulSoup
import re
import csv
import time
from datetime import datetime, timedelta


class Scraper:
    """Drives a Chrome session against https://merolagani.com/Floorsheet.aspx."""

    def __init__(self):
        self.reset()

    def reset(self):
        """(Re)create the Chrome driver and load the floorsheet page."""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--no-sandbox')
        # chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.maximize_window()
        self.driver.get("https://merolagani.com/Floorsheet.aspx")

    def scrape(self, date=None):
        """Page through the floorsheet, extracting every page.

        :param date: optional MM/DD/YYYY string typed into the date filter
                     before the search is submitted; ``None`` scrapes the
                     currently displayed (today's) floorsheet.
        """
        if date is not None:
            # FIX: find_element_by_xpath was removed in Selenium 4;
            # use find_element(By.XPATH, ...) instead.
            date_input = self.driver.find_element(
                By.XPATH,
                '//input[@id="ctl00_ContentPlaceHolder1_txtFloorsheetDateFilter"]')
            # FIX: send_keys appends -- clear any pre-filled value first.
            date_input.clear()
            date_input.send_keys(date)
            self.driver.find_element(
                By.XPATH,
                '//a[@id="ctl00_ContentPlaceHolder1_lbtnSearchFloorsheet"]').click()
            time.sleep(5)  # crude wait for the postback; WebDriverWait would be more robust
        total_pages = int(
            re.search(r'Total pages:\s*(\d+)', self.driver.page_source).group(1))
        # FIX: the original looped range(1, total_pages), which extracted only
        # total_pages - 1 pages and silently dropped the last one.
        for page_no in range(total_pages):
            self.extract(self.driver.page_source)
            if page_no < total_pages - 1:  # no "next" click after the final page
                self.driver.find_element(
                    By.XPATH, '//div[@class="pagging"]/div/a[last()-1]').click()
                time.sleep(5)  # crude wait for the next page to render

    def extract(self, page):
        """Parse one floorsheet page and append its rows to <YYYY-MM-DD>.txt as CSV.

        :param page: full HTML source of a floorsheet results page.
        """
        date = re.search(r'As of (\d{4}\/\d{2}\/\d{2})', page).group(1)
        soup = BeautifulSoup(page, 'lxml')
        rows = []
        for row in soup.find_all('tr')[1:]:  # skip the header row
            # Drop the first cell (serial number); strip thousands separators
            # so numeric columns parse cleanly downstream.
            rows.append([td.text.strip().replace(',', '')
                         for td in row.find_all('td')[1:]])
        # FIX: pin the encoding so output is identical across platforms.
        with open(date.replace('/', '-') + '.txt', 'a',
                  newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(rows)


if __name__ == "__main__":
    x = Scraper()
    delta = timedelta(days=1)
    today = datetime.now()
    try:
        x.scrape()
        for i in range(1):
            # FIX/generalize: step back (i + 1) days so widening range(1) to
            # range(n) scrapes n distinct past dates. Identical behavior for
            # range(1): i=0 -> today - 1 day, as in the original.
            date = today - delta * (i + 1)
            x.scrape(date.strftime("%m/%d/%Y"))
    finally:
        x.driver.quit()  # FIX: always release the Chrome process, even on error
Editor is loading...