Untitled
unknown
python
2 years ago
4.9 kB
9
Indexable
import os
import time
from bs4 import BeautifulSoup
from loguru import logger
from openpyxl import load_workbook
from openpyxl.workbook import Workbook
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm, trange
# Companies to scrape: the last cell of every row in the Fortune 500 sheet.
sheet = load_workbook("Fortune500.xlsx").active
fortune_500 = []
for _row in sheet.iter_rows(values_only=True):
    fortune_500.append(_row[-1])

# One Chrome session shared by every lookup and page fetch below.
driver = webdriver.Chrome()
def save_to_excel(markup, page):
    """Parse one page of Indeed reviews from *markup* and append them to OUTPUT_EXCEL.

    Args:
        markup: Raw HTML of a reviews page (as captured by Selenium).
        page:   Zero-based result offset of the page (a multiple of 20);
                stored as a 1-based page number in the "Pagination" column.

    On any parsing failure the raw markup is dumped to bugs/page_<page>.html
    so the failing selectors can be inspected offline.
    """
    soup = BeautifulSoup(markup, "lxml")
    review_list = soup.find_all("div", {"data-tn-entitytype": "reviewId"})
    # OUTPUT_EXCEL is a module global set per-company by the main loop.
    wb = load_workbook(OUTPUT_EXCEL)
    sheet = wb.active
    try:
        for review in review_list:
            star_rating = (
                review.find("div", {"itemprop": "reviewRating"})
                .select_one("button")
                .text
            )
            # The author cell reads "Author - Location - Date". The author
            # name itself may contain " - ", so pop date and location off the
            # end and rejoin whatever remains as the author.
            author_loc_date = review.find("span", {"itemprop": "author"}).text.split(
                " - "
            )
            review_date = author_loc_date.pop()
            review_location = author_loc_date.pop()
            review_author = " - ".join(author_loc_date)
            review_text = review.find("span", {"itemprop": "reviewBody"}).text
            review_title = review.find("h2", {"data-testid": "title"}).text
            is_featured = bool(review.find("h2", string="Indeed Featured review"))
            pros = review.find("h2", string="Pros")
            cons = review.find("h2", string="Cons")
            pros_text = cons_text = "NA"
            if pros:
                pros_text = (
                    pros.find_next_sibling().text if pros.find_next_sibling() else "NA"
                )
            if cons:
                cons_text = (
                    cons.find_next_sibling().text if cons.find_next_sibling() else "NA"
                )
            sheet.append(
                [
                    star_rating,
                    review_title,
                    review_author,
                    review_location,
                    review_date,
                    review_text,
                    is_featured,
                    int((page / 20) + 1),
                    pros_text,
                    cons_text,
                ]
            )
        wb.save(OUTPUT_EXCEL)
    except Exception:
        # Don't abort the whole scrape over one malformed page: log the full
        # traceback and keep the raw HTML for debugging the selectors.
        logger.exception(f"failed to parse page {page}")
        os.makedirs("bugs", exist_ok=True)
        with open(f"bugs/page_{page}.html", "w", encoding="utf-8") as file:
            file.write(markup)
def get_company_details(company):
    """Google for *company*'s Indeed reviews page and return (review_count, url).

    Args:
        company: Company name to search for.

    Returns:
        tuple[int, str]: total review count (0 when the reviews-tab element is
        missing) and the reviews URL with "?fcountry=ALL" appended.

    Raises:
        LookupError: if no indeed.com ".../reviews" link appears in the
        Google results (previously this surfaced as an opaque NameError).
    """
    driver.get("https://www.google.com")
    time.sleep(3)  # crude fixed wait; the script uses no explicit WebDriverWaits
    driver.find_element(By.CSS_SELECTOR, "textarea").send_keys(
        company + " review indeed.com"
    )
    driver.find_element(By.CSS_SELECTOR, "form").submit()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "lxml")
    cpm_link = None
    for link in soup.select("#search a"):
        href = link.get("href")
        # href can be None for anchor tags without an href attribute.
        if href and href.startswith("https://www.indeed.com/cmp/") and href.endswith(
            "/reviews"
        ):
            cpm_link = href + "?fcountry=ALL"
            break
    if cpm_link is None:
        raise LookupError(f"no Indeed reviews link found for {company!r}")
    driver.get(cpm_link)
    time.sleep(3)
    tab = BeautifulSoup(driver.page_source, "lxml").find(
        "li", {"data-tn-element": "reviews-tab"}
    )
    # The review count lives in the first <div> of the reviews tab; default to
    # 0 so the caller's page range is simply empty when the tab is absent.
    stop = int(tab.select_one("div").text) if tab else 0
    return stop, cpm_link
# Scrape every company's reviews, 20 per page, resuming from where a previous
# run left off (the "Pagination" column records the last page saved).
for company in tqdm(iterable=fortune_500, desc="Total Progress", position=0, ncols=80):
    OUTPUT_EXCEL = f"{company.replace(' ', '_')}.xlsx"
    START = 0
    STOP, cml_link = get_company_details(company)
    # First run for this company: create the workbook with a header row.
    if not os.path.exists(OUTPUT_EXCEL):
        wb = Workbook()
        page = wb.active
        page.title = "reviews"
        page.append(
            [
                "Star Rating",
                "Review Title",
                "Review Author",
                "Review Location",
                "Review Date",
                "Review Text",
                "Featured",
                "Pagination",
                "Pros",
                "Cons",
            ]
        )
        wb.save(OUTPUT_EXCEL)
    # Resume support: column H holds each saved review's 1-based page number,
    # so max(H) * 20 is the result offset of the first unscraped page.
    sheet = load_workbook(OUTPUT_EXCEL)["reviews"]
    existing_pages = [cell.value for cell in sheet["H"][1:]]
    if existing_pages:
        START = max(existing_pages) * 20
    logger.info(f"scraping started for {company} from offset {START}")
    for page in trange(
        START, STOP, 20, desc=f"Progress for {company}", ncols=80, position=1
    ):
        driver.get(f"{cml_link}&start={page}")
        save_to_excel(driver.page_source, page)
# Release the Chrome session instead of leaking the browser process.
driver.quit()
print("Done✅")
Editor is loading...
Leave a Comment