Untitled
unknown
python
a year ago
4.9 kB
7
Indexable
"""Scrape Indeed reviews for Fortune 500 companies into per-company Excel files.

Reads company names from the last column of ``Fortune500.xlsx``, locates each
company's Indeed reviews page via a Google search, then pages through the
reviews 20 at a time, appending rows to ``<Company_Name>.xlsx``.  Scraping is
resumable: the highest page number already stored in column H of an existing
workbook decides where to restart.
"""

import os
import time

from bs4 import BeautifulSoup
from loguru import logger
from openpyxl import load_workbook
from openpyxl.workbook import Workbook
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm, trange

# Company names live in the last column of the input sheet.
sheet = load_workbook("Fortune500.xlsx").active
fortune_500 = [row[-1] for row in sheet.iter_rows(values_only=True)]

driver = webdriver.Chrome()

# Column headers written once into each new output workbook.
HEADER_ROW = [
    "Star Rating",
    "Review Title",
    "Review Author",
    "Review Location",
    "Review Date",
    "Review Text",
    "Featured",
    "Pagination",
    "Pros",
    "Cons",
]


def _parse_review(review, page):
    """Extract one review element into a flat row matching HEADER_ROW.

    Raises whatever BeautifulSoup raises on an unexpected page layout; the
    caller handles that by dumping the page HTML for inspection.
    """
    star_rating = (
        review.find("div", {"itemprop": "reviewRating"}).select_one("button").text
    )
    # The author string looks like "<name> - <location> - <date>".  The name
    # itself may contain " - ", so pop date and location off the end first and
    # rejoin whatever is left as the author.
    author_loc_date = review.find("span", {"itemprop": "author"}).text.split(" - ")
    review_date = author_loc_date.pop()
    review_location = author_loc_date.pop()
    review_author = " - ".join(author_loc_date)
    review_text = review.find("span", {"itemprop": "reviewBody"}).text
    review_title = review.find("h2", {"data-testid": "title"}).text
    is_featured = bool(review.find("h2", string="Indeed Featured review"))
    pros = review.find("h2", string="Pros")
    cons = review.find("h2", string="Cons")
    pros_text = cons_text = "NA"
    if pros and pros.find_next_sibling():
        pros_text = pros.find_next_sibling().text
    if cons and cons.find_next_sibling():
        cons_text = cons.find_next_sibling().text
    return [
        star_rating,
        review_title,
        review_author,
        review_location,
        review_date,
        review_text,
        is_featured,
        page // 20 + 1,  # 1-based page number; Indeed pages hold 20 reviews
        pros_text,
        cons_text,
    ]


def save_to_excel(markup, page):
    """Parse one reviews page and append its rows to OUTPUT_EXCEL.

    On a parse failure the offending page's HTML is dumped under ``bugs/``
    for later inspection; rows parsed before the failure are still saved.
    """
    soup = BeautifulSoup(markup, "lxml")
    review_list = soup.find_all("div", {"data-tn-entitytype": "reviewId"})
    wb = load_workbook(OUTPUT_EXCEL)
    sheet = wb.active
    try:
        for review in review_list:
            sheet.append(_parse_review(review, page))
    except Exception:
        # Was a bare `except:` that printed a possibly-unbound local.  Log the
        # traceback and keep the raw HTML so the layout change can be debugged.
        logger.exception(f"failed to parse page {page}; dumping HTML to bugs/")
        os.makedirs("bugs", exist_ok=True)  # original crashed if bugs/ was missing
        with open(f"bugs/page_{page}.html", "w", encoding="utf-8") as file:
            file.write(markup)
    finally:
        # Save once per page (the original saved once per review) while still
        # persisting any rows appended before an error.
        wb.save(OUTPUT_EXCEL)


def get_company_details(company):
    """Find the company's Indeed reviews URL and total review count via Google.

    Returns ``(review_count, url)``.  Returns ``(0, None)`` when no matching
    Indeed reviews link appears in the search results, so the caller can skip
    the company instead of hitting a NameError on an unbound link variable
    (the original's behavior).
    """
    stop = 0
    cmp_link = None
    driver.get("https://www.google.com")
    time.sleep(3)
    driver.find_element(By.CSS_SELECTOR, "textarea").send_keys(
        company + " review indeed.com"
    )
    driver.find_element(By.CSS_SELECTOR, "form").submit()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "lxml")
    for href in [a.get("href") for a in soup.select("#search a")]:
        if href.startswith("https://www.indeed.com/cmp/") and href.endswith("/reviews"):
            cmp_link = href + "?fcountry=ALL"  # include reviews from all countries
            break
    if cmp_link is None:
        return stop, None
    driver.get(cmp_link)
    time.sleep(3)
    tab = BeautifulSoup(driver.page_source, "lxml").find(
        "li", {"data-tn-element": "reviews-tab"}
    )
    if tab:
        # The reviews tab shows the total review count; it bounds pagination.
        stop = int(tab.select_one("div").text)
    return stop, cmp_link


for company in tqdm(iterable=fortune_500, desc="Total Progress", position=0, ncols=80):
    OUTPUT_EXCEL = f"{company.replace(' ', '_')}.xlsx"
    START = 0
    STOP, cmp_link = get_company_details(company)
    if cmp_link is None:
        logger.warning(f"no Indeed reviews link found for {company}; skipping")
        continue
    if not os.path.exists(OUTPUT_EXCEL):
        wb = Workbook()
        page = wb.active
        page.title = "reviews"
        page.append(HEADER_ROW)
        wb.save(OUTPUT_EXCEL)
    sheet = load_workbook(OUTPUT_EXCEL)["reviews"]
    # Resume from the highest page number already written (column H).
    existing_pages = [cell.value for cell in sheet["H"][1:]]
    if existing_pages:
        START = max(existing_pages) * 20
    logger.info(f"scraping started for {company} from page {START}")
    for page in trange(
        START, STOP, 20, desc=f"Progress for {company}", ncols=80, position=1
    ):
        driver.get(f"{cmp_link}&start={page}")
        save_to_excel(driver.page_source, page)

print("Done✅")
Editor is loading...
Leave a Comment