Untitled
unknown
plain_text
4 years ago
2.1 kB
8
Indexable
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import chromedriver_binary
from bs4 import BeautifulSoup
import re
import csv
import time
from datetime import datetime, timedelta
class Scraper:
    """Scrape the floorsheet table from merolagani.com into per-day CSV files.

    One ``<YYYY-MM-DD>.txt`` file is appended to per trading date, with one
    CSV row per floorsheet transaction.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """(Re)create the Chrome driver and open the floorsheet page."""
        chrome_options = Options()
        # Uncomment to run without a visible browser window (e.g. on a server):
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--no-sandbox')
        # chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.maximize_window()
        self.driver.get("https://merolagani.com/Floorsheet.aspx")

    def scrape(self, date=None):
        """Scrape every page of the floorsheet into the day's CSV file.

        Parameters
        ----------
        date : str or None
            Optional "MM/DD/YYYY" string. When given, the date filter is
            filled in and the search button clicked before scraping; when
            None, the currently displayed floorsheet is scraped.
        """
        if date is not None:
            self.driver.find_element_by_xpath('//input[@id="ctl00_ContentPlaceHolder1_txtFloorsheetDateFilter"]').send_keys(date)
            self.driver.find_element_by_xpath('//a[@id="ctl00_ContentPlaceHolder1_lbtnSearchFloorsheet"]').click()
            time.sleep(5)  # crude wait for the postback to render
        total_pages = int(re.search(r'Total pages:\s*(\d+)', self.driver.page_source).group(1))
        # BUGFIX: the original looped range(1, total_pages), which extracted
        # only total_pages - 1 pages and never saved the last one. Extract
        # every page, and only click "next" while a next page actually exists.
        for page in range(total_pages):
            self.extract(self.driver.page_source)
            if page < total_pages - 1:
                # The second-to-last link in the pager is the "next" arrow.
                self.driver.find_element_by_xpath('//div[@class="pagging"]/div/a[last()-1]').click()
                time.sleep(5)  # crude wait for the next page to render

    def extract(self, page):
        """Parse one floorsheet page and append its rows to ``<date>.txt``.

        Parameters
        ----------
        page : str
            Full HTML source of a floorsheet results page. Must contain an
            "As of YYYY/MM/DD" marker, which names the output file.
        """
        date = re.search(r'As of (\d{4}\/\d{2}\/\d{2})', page).group(1)
        soup = BeautifulSoup(page, 'lxml')
        # Skip the header <tr>; in each row drop the first <td> (the running
        # row number) and strip thousands separators from numeric cells.
        rows = [
            [td.text.strip().replace(',', '') for td in tr.find_all('td')[1:]]
            for tr in soup.find_all('tr')[1:]
        ]
        # newline="" prevents csv.writer from emitting blank lines on Windows.
        with open(date.replace('/', '-') + '.txt', 'a', newline="") as f:
            csv.writer(f).writerows(rows)
if __name__ == "__main__":
x = Scraper()
delta = timedelta(days=1)
today = datetime.now()
x.scrape()
for i in range(1):
date = today - delta
x.scrape(date.strftime("%m/%d/%Y"))
Editor is loading...