from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv
import requests

# Path to your WebDriver executable
webdriver_path = '/Users/matej/Downloads/chromedriver-mac-x64/chromedriver'

# Initialize Selenium WebDriver (Selenium 4 expects a Service object instead of executable_path)
driver = webdriver.Chrome(service=Service(webdriver_path))

# Open the website
driver.get('https://dogtime.com/dog-breeds')

# Wait for the cookies banner and accept cookies
try:
    wait = WebDriverWait(driver, 10)
    accept_cookies_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept all"]')))
    accept_cookies_button.click()
except (NoSuchElementException, TimeoutException):
    print('Could not find the "Accept all" button, or page took too long to load.')
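    # The script continues either way; the banner may simply not appear in every session.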

# Function to click the "Load More" button
def click_load_more():
    try:
        wait = WebDriverWait(driver, 10)
        load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]')))
        
        # Execute a JavaScript click event
        driver.execute_script("arguments[0].click();", load_more_button)
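        # Assumption: clicking via JavaScript avoids the click being intercepted
        # by ads or sticky overlays that can cover the button.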
        
        return True
    except (NoSuchElementException, TimeoutException):
        print('No more "Load More" button or page took too long to load.')
        return False


# Click the "Load More" button until all breeds are loaded
while click_load_more():
    pass
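# Note: the loop above exits only once the "Load More" button can no longer be found,
# so the final check waits out the full 10-second timeout before giving up.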

# Get the HTML of the page
page_html = driver.page_source

# Close the WebDriver
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(page_html, 'html.parser')

# Base URL of the website, used to resolve any relative breed links
base_url = 'https://dogtime.com'

# Find and extract all breed names and URLs
breed_links = []
breed_names = []
for breed in soup.find_all('h3', class_='wp-block-xwp-curated-content__card-title wp-block-xwp-curated-content__card-title--unclamped'):
    anchor = breed.find('a')
    if anchor is None or not anchor.get('href'):
        continue  # Skip cards without a link
    breed_links.append(urljoin(base_url, anchor['href']))  # Resolve relative URLs against the site root
    breed_names.append(breed.get_text(strip=True))

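# Note: requests fetches raw HTML and does not execute JavaScript, so the helper
# below assumes each breed page includes its description in the static markup.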
# Define a function to fetch and parse an individual breed page
def get_breed_info(url):
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # Ensure the request was successful
    soup = BeautifulSoup(r.text, 'html.parser')
    first_paragraph = soup.find('p')  # Assumes the breed description is the first <p> on the page
    return first_paragraph.get_text(strip=True) if first_paragraph else None

# Loop through the breed URLs and get the breed info, and write to CSV
with open('breeds.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write header row
    writer.writerow(['Breed Name', 'Breed Info'])
    for name, url in zip(breed_names, breed_links):
        try:
            breed_info = get_breed_info(url)
        except requests.RequestException as e:
            print(f'Failed to fetch {url}: {e}')
            breed_info = None
        # Write data row
        writer.writerow([name, breed_info])

print('Data written to breeds.csv')