Untitled

 avatar
unknown
python
a year ago
4.9 kB
7
Indexable
import os
import time

from bs4 import BeautifulSoup
from loguru import logger
from openpyxl import load_workbook
from openpyxl.workbook import Workbook
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm, trange

sheet = load_workbook("Fortune500.xlsx").active
fortune_500 = [row[-1] for row in sheet.iter_rows(values_only=True)]

driver = webdriver.Chrome()


def save_to_excel(markup, page):
    soup = BeautifulSoup(markup, "lxml")
    review_list = soup.find_all("div", {"data-tn-entitytype": "reviewId"})
    data = []
    wb = load_workbook(OUTPUT_EXCEL)
    sheet = wb.active
    try:
        for reviewOne in review_list:
            starRating = (
                reviewOne.find("div", {"itemprop": "reviewRating"})
                .select_one("button")
                .text
            )
            author_loc_date = reviewOne.find("span", {"itemprop": "author"}).text.split(
                " - "
            )

            author_loc_date = reviewOne.find("span", {"itemprop": "author"}).text.split(
                " - "
            )

            reviewDate = author_loc_date.pop()
            reviewLocation = author_loc_date.pop()
            reviewAuthor = (" - ").join(author_loc_date)

            reviewText = reviewOne.find("span", {"itemprop": "reviewBody"}).text
            reviewTitle = reviewOne.find("h2", {"data-testid": "title"}).text
            whetherFeatured = bool(
                reviewOne.find("h2", string="Indeed Featured review")
            )
            pros = reviewOne.find("h2", string="Pros")
            cons = reviewOne.find("h2", string="Cons")
            prosText = consText = "NA"
            if pros:
                prosText = (
                    pros.find_next_sibling().text if pros.find_next_sibling() else "NA"
                )
            if cons:
                consText = (
                    cons.find_next_sibling().text if cons.find_next_sibling() else "NA"
                )

            sheet.append(
                [
                    starRating,
                    reviewTitle,
                    reviewAuthor,
                    reviewLocation,
                    reviewDate,
                    reviewText,
                    whetherFeatured,
                    int((page / 20) + 1),
                    prosText,
                    consText,
                ]
            )

        wb.save(OUTPUT_EXCEL)
    except:
        print(author_loc_date)
        with open(f"bugs/page_{page}.html", "w", encoding="utf-8") as file:
            file.write(markup)


def get_company_details(company):
    STOP = 0
    google = driver.get("https://www.google.com")
    time.sleep(3)
    driver.find_element(By.CSS_SELECTOR, "textarea").send_keys(
        company + " review indeed.com"
    )
    driver.find_element(By.CSS_SELECTOR, "form").submit()
    time.sleep(3)
    google = driver.page_source
    soup = BeautifulSoup(google, "lxml")
    links = soup.select("#search a")
    for link in [link.get("href") for link in links]:
        if link.startswith("https://www.indeed.com/cmp/") and link.endswith("/reviews"):
            cpm_link = link + "?fcountry=ALL"
            break
    driver.get(cpm_link)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "lxml").find(
        "li", {"data-tn-element": "reviews-tab"}
    )
    if soup:
        STOP = int(soup.select_one("div").text)
    return STOP, cpm_link


for company in tqdm(iterable=fortune_500, desc="Total Progress", position=0, ncols=80):
    OUTPUT_EXCEL = f"{company.replace(' ', '_')}.xlsx"
    START = 0
    STOP, cml_link = get_company_details(company)
    # get_company_details(company)
    if not os.path.exists(OUTPUT_EXCEL):
        wb = Workbook()
        page = wb.active
        page.title = "reviews"
        page.append(
            [
                "Star Rating",
                "Review Title",
                "Review Author",
                "Review Location",
                "Review Date",
                "Review Text",
                "Featured",
                "Pagination",
                "Pros",
                "Cons",
            ]
        )
        wb.save(OUTPUT_EXCEL)

    sheet = load_workbook(OUTPUT_EXCEL)["reviews"]
    existingPage = [x.value for x in sheet["H"][1:]]
    if existingPage:
        START = max(existingPage) * 20

    logger.info(f"scraping started for {company} from page {START}")

    for page in trange(
        START, STOP, 20, desc=f"Progress for {company}", ncols=80, position=1
    ):
        driver.get(f"{cml_link}&start={page}")
        markup = driver.page_source
        save_to_excel(markup, page)


print("Done✅")
Editor is loading...
Leave a Comment