from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import requests

# Path to your WebDriver executable
webdriver_path = '/Users/matej/Downloads/chromedriver-mac-x64/chromedriver'

# Initialize the Selenium WebDriver (Selenium 4 takes the driver path via Service,
# not the removed executable_path keyword)
driver = webdriver.Chrome(service=Service(webdriver_path))

# Open the website
driver.get('https://dogtime.com/dog-breeds')

# Wait for the cookie banner and accept cookies
try:
    wait = WebDriverWait(driver, 10)
    accept_cookies_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept all"]'))
    )
    accept_cookies_button.click()
except (NoSuchElementException, TimeoutException):
    print('Could not find the "Accept all" button, or the page took too long to load.')

# Click the "Load More" button once; return False when the button is gone
def click_load_more():
    try:
        wait = WebDriverWait(driver, 10)
        load_more_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[text()="Load More"]'))
        )
        # Click via JavaScript so overlapping elements cannot intercept the click
        driver.execute_script("arguments[0].click();", load_more_button)
        return True
    except (NoSuchElementException, TimeoutException):
        print('No more "Load More" button, or the page took too long to load.')
        return False

# Keep clicking "Load More" until all breeds are loaded
while click_load_more():
    pass

# Grab the fully loaded page HTML, then close the browser
page_html = driver.page_source
driver.quit()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(page_html, 'html.parser')

# Find and extract all breed names and URLs from the breed cards
breed_links = []
breed_names = []
for breed in soup.find_all(
    'h3',
    class_='wp-block-xwp-curated-content__card-title wp-block-xwp-curated-content__card-title--unclamped',
):
    link = breed.find('a')['href']
    name = breed.text
    breed_links.append(link)
    breed_names.append(name)

# Fetch an individual breed page and return its first paragraph of text
def get_breed_info(url):
    r = requests.get(url)
    r.raise_for_status()  # Ensure the request was successful
    breed_soup = BeautifulSoup(r.text, 'html.parser')
    first_paragraph = breed_soup.find('p')
    return first_paragraph.text if first_paragraph else None

# Loop through the breed URLs, fetch each breed's info, and write to CSV
with open('breeds.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Breed Name', 'Breed Info'])  # Header row
    for name, url in zip(breed_names, breed_links):
        writer.writerow([name, get_breed_info(url)])

print('Data written to breeds.csv')
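
# --- Optional hardening (a sketch, not part of the original script) ---
# The loop above fires one request per breed with no timeout or pacing.
# Assuming the site tolerates roughly one request per second, this
# hypothetical variant adds a request timeout and a polite delay; the
# delay and timeout values below are assumptions, not measured limits.
# Swap it in for get_breed_info in the CSV loop if needed.
import time

def get_breed_info_polite(url, delay=1.0, timeout=10):
    time.sleep(delay)                       # pace requests politely
    r = requests.get(url, timeout=timeout)  # fail fast instead of hanging
    r.raise_for_status()
    page = BeautifulSoup(r.text, 'html.parser')
    first_paragraph = page.find('p')
    return first_paragraph.text if first_paragraph else None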