from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import requests

# Path to your WebDriver executable
webdriver_path = '/Users/matej/Downloads/chromedriver-mac-x64/chromedriver'

# Initialize the Selenium WebDriver (Selenium 4 takes the driver path via a
# Service object; the old executable_path keyword argument has been removed)
driver = webdriver.Chrome(service=Service(webdriver_path))

# Open the website
driver.get('https://dogtime.com/dog-breeds')

# Wait for the cookie banner and accept cookies
try:
    wait = WebDriverWait(driver, 10)
    accept_cookies_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept all"]'))
    )
    accept_cookies_button.click()
except (NoSuchElementException, TimeoutException):
    print('Could not find the "Accept all" button, or the page took too long to load.')

# Click the "Load More" button once; returns False when the button is gone
# (all breeds loaded) or the page takes too long to respond
def click_load_more():
    try:
        wait = WebDriverWait(driver, 10)
        load_more_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]'))
        )
        # Click via JavaScript so overlays cannot intercept the click
        driver.execute_script("arguments[0].click();", load_more_button)
        return True
    except (NoSuchElementException, TimeoutException):
        print('No more "Load More" button, or the page took too long to load.')
        return False

# Click the "Load More" button until all breeds are loaded
while click_load_more():
    pass
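
# Defensive variant (illustrative sketch): cap the number of clicks so a page
# that keeps offering "Load More" cannot loop forever. The bound of 200 is an
# assumed upper limit, not a value taken from the site.
# for _ in range(200):
#     if not click_load_more():
#         break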

# Get the HTML of the fully loaded page
page_html = driver.page_source

# Close the WebDriver
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(page_html, 'html.parser')

# Base URL of the website, used to resolve relative links
base_url = 'https://dogtime.com'

# Find and extract all breed names and URLs from the card titles
breed_links = []
breed_names = []
for breed in soup.find_all('h3', class_='wp-block-xwp-curated-content__card-title wp-block-xwp-curated-content__card-title--unclamped'):
    anchor = breed.find('a')
    if anchor is None:
        continue  # skip cards without a link
    breed_links.append(urljoin(base_url, anchor['href']))  # handles absolute and relative hrefs
    breed_names.append(breed.text.strip())

# Fetch an individual breed page and return the text of its first paragraph
def get_breed_info(url):
    r = requests.get(url)
    r.raise_for_status()  # ensure the request was successful
    soup = BeautifulSoup(r.text, 'html.parser')
    breed_info_p = soup.find('p')  # first paragraph on the breed page
    return breed_info_p.text if breed_info_p else None
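
# Optional helper (illustrative sketch): a politer fetch that pauses between
# requests and retries once on a network error. The one-second delay is an
# assumption about an acceptable request rate, not a documented limit of
# dogtime.com. Substitute it for get_breed_info() in the loop below if needed.
import time

def get_breed_info_polite(url, delay=1.0):
    time.sleep(delay)  # pause before each request to avoid hammering the site
    try:
        return get_breed_info(url)
    except requests.RequestException:
        time.sleep(delay * 2)  # back off briefly, then retry once
        return get_breed_info(url)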

# Loop through the breed URLs, fetch each breed's info, and write the rows to CSV
with open('breeds.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Breed Name', 'Breed Info'])
    for name, url in zip(breed_names, breed_links):
        breed_info = get_breed_info(url)
        # Write one data row per breed
        writer.writerow([name, breed_info])

print('Data written to breeds.csv')
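
# Quick sanity check (illustrative extra, safe to delete): read the CSV back
# and print the first few rows to confirm the output looks as expected.
with open('breeds.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 5:
            break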