Untitled
plain_text
2 months ago
2.4 kB
4
Indexable
Never
"""Scrape publication times and titles of "Article"-type entries from a journal page.

The script opens the journal landing page in headless Chrome, follows every
``article > a`` link found there, and for each linked page whose
``span.content-type`` text is exactly ``Article`` records the text of the
first ``time[datetime]`` element and the first ``h1`` element. The collected
rows are written to ``article_data.xlsx``.
"""

import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

try:
    from selenium.webdriver.chrome.service import Service
except ImportError:  # Selenium 3 does not ship a Chrome ``Service`` class.
    Service = None

# Define the URL
BASE_URL = 'https://www.embopress.org/journal/17444292'

# Location of the chromedriver binary, relative to the working directory.
CHROMEDRIVER_PATH = './chromedriver'

# User agent string sent by the browser.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'

# Seconds to wait after each navigation so the page can finish loading.
PAGE_LOAD_DELAY = 1

OUTPUT_FILE = 'article_data.xlsx'
COLUMNS = ['Publication Time', 'Article Title']


def _build_driver():
    """Return a headless Chrome WebDriver configured with ``USER_AGENT``."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # Assigning ``chrome_options.user_agent`` has no effect on the browser;
    # the user agent must be passed to Chrome as a command-line switch.
    chrome_options.add_argument('--user-agent=' + USER_AGENT)

    if Service is not None:
        # Selenium 4: the driver path is supplied through a Service object.
        return webdriver.Chrome(
            service=Service(executable_path=CHROMEDRIVER_PATH),
            options=chrome_options,
        )
    # Selenium 3 fallback: the driver path is a direct keyword argument.
    return webdriver.Chrome(
        executable_path=CHROMEDRIVER_PATH,
        options=chrome_options,
    )


def _first_text(driver, css_selector):
    """Return the stripped text of the first element matching *css_selector*.

    Returns ``None`` when the current page has no matching element.
    """
    matches = driver.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text.strip() if matches else None


def _collect_article_urls(driver):
    """Load the landing page and return the href of every ``article > a`` link.

    The URLs are captured up front because element handles become stale as
    soon as the browser navigates away from the page they were found on.
    """
    driver.get(BASE_URL)
    time.sleep(PAGE_LOAD_DELAY)
    hrefs = (
        link.get_attribute('href')
        for link in driver.find_elements(By.CSS_SELECTOR, 'article > a')
    )
    return [href for href in hrefs if href]


def _scrape_article(driver, url):
    """Visit *url* and return ``(publication_time, title)`` for an "Article".

    Returns ``None`` when the page's content type is not exactly ``Article``.
    A missing time or title is recorded as an empty string so that both
    columns always stay aligned row by row.
    """
    driver.get(url)
    time.sleep(PAGE_LOAD_DELAY)

    if _first_text(driver, 'span.content-type') != 'Article':
        return None

    publication_time = _first_text(driver, 'time[datetime]') or ''
    title = _first_text(driver, 'h1') or ''
    return publication_time, title


def main():
    """Run the scrape and save the results to ``article_data.xlsx``."""
    rows = []
    driver = _build_driver()
    try:
        for url in _collect_article_urls(driver):
            row = _scrape_article(driver, url)
            if row is not None:
                rows.append(row)
    finally:
        # Always shut the browser down, even if a page fails mid-scrape.
        driver.quit()

    # Create a DataFrame to store the publication time and article titles
    data = pd.DataFrame(rows, columns=COLUMNS)

    # Save the data to an Excel file
    data.to_excel(OUTPUT_FILE, index=False)
    print("Data saved to 'article_data.xlsx'.")


if __name__ == '__main__':
    main()