import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Scrape the journal landing page: visit every linked article, keep those whose
# content type is "Article", and save their publication time and title to an
# Excel workbook.

# Journal landing page that lists the article links.
base_url = 'https://www.embopress.org/journal/17444292'
# Parallel lists: entry i of each list describes the same article, so both are
# always appended to together (a missing field is stored as '').
publication_times = []
article_titles = []
# Configure Chrome to run headless (without opening a visible browser window).
chrome_options = Options()
chrome_options.add_argument('--headless')
# The user agent must be passed as a Chrome command-line switch; assigning an
# arbitrary attribute on the Options object has no effect on the browser.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
)
# Initialize Chrome WebDriver. Newer Selenium releases take the driver path via
# a Service object; older ones reject the ``service`` keyword with a TypeError
# and expect ``executable_path`` instead.
try:
    driver = webdriver.Chrome(service=Service('./chromedriver'), options=chrome_options)
except TypeError:
    driver = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)

# Everything that drives the browser runs under try/finally so the Chrome
# process is shut down even if a page fails to load or a lookup raises.
try:
    # Visit the provided page and give it a second to load.
    driver.get(base_url)
    time.sleep(1)

    # Collect the target URLs up front. WebElement handles become stale as
    # soon as the browser navigates away, so the elements themselves cannot be
    # reused across iterations; plain URL strings can.
    article_urls = []
    for link in driver.find_elements(By.CSS_SELECTOR, 'article > a'):
        href = link.get_attribute('href')
        if href:
            article_urls.append(href)

    for article_url in article_urls:
        # Open the article page and give it a second to load.
        driver.get(article_url)
        time.sleep(1)

        # Only pages whose content type reads exactly "Article" are recorded.
        article_type = driver.find_elements(By.CSS_SELECTOR, 'span.content-type')
        if not article_type or article_type[0].text.strip() != 'Article':
            continue

        time_element = driver.find_elements(By.CSS_SELECTOR, 'time[datetime]')
        title_element = driver.find_elements(By.CSS_SELECTOR, 'h1')
        if not time_element and not title_element:
            # Nothing to record for this page.
            continue

        # Append both fields in the same step so the two lists stay the same
        # length and row i of the DataFrame always describes one article.
        publication_times.append(time_element[0].text.strip() if time_element else '')
        article_titles.append(title_element[0].text.strip() if title_element else '')
finally:
    # Close the WebDriver.
    driver.quit()

# Create a DataFrame to store the publication time and article titles.
data = pd.DataFrame({'Publication Time': publication_times, 'Article Title': article_titles})
# Save the data to an Excel file.
data.to_excel('article_data.xlsx', index=False)
print("Data saved to 'article_data.xlsx'.")