# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 2.4 kB
# 5
# Indexable
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

# Scrape the journal landing page: visit every linked article, keep those whose
# content type is "Article", and save their publication time and title to Excel.

# Journal landing page whose article links are collected
base_url = 'https://www.embopress.org/journal/17444292'

# Parallel lists: entry i of each list describes the same article, so they are
# always appended to together (None marks a value missing from the page).
publication_times = []
article_titles = []

# Locator strategy string (the value of selenium's By.CSS_SELECTOR). Passing it
# to find_elements() works on both Selenium 3 and 4, whereas the
# find_elements_by_* helpers no longer exist in recent Selenium 4 releases.
CSS_SELECTOR = 'css selector'

# Configure Chrome options to run headless (without opening a visible browser window)
chrome_options = Options()
chrome_options.add_argument('--headless')

# Specify the user agent. It has to be passed as a Chrome command-line switch;
# assigning an attribute on the Options object is silently ignored.
chrome_options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
)

# Initialize Chrome WebDriver
# NOTE(review): executable_path is kept to match the Selenium API this script
# was written against; newer Selenium 4 releases expect a Service object
# instead -- confirm against the installed version.
driver = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)

try:
    # Visit the provided page
    driver.get(base_url)

    # Add a delay of 1 second to allow the page to load
    time.sleep(1)

    # Collect the target URLs up front. Element references become stale as soon
    # as the browser navigates away, so the elements themselves cannot be
    # reused across page loads.
    article_urls = []
    for link in driver.find_elements(CSS_SELECTOR, 'article > a'):
        href = link.get_attribute('href')
        if href:
            article_urls.append(href)

    # Loop through the article URLs
    for article_url in article_urls:
        # Load the article page directly (no click / history-back round trip)
        driver.get(article_url)

        # Add a delay of 1 second to allow the article page to load
        time.sleep(1)

        # Skip anything whose content type is not exactly "Article"
        article_type = driver.find_elements(CSS_SELECTOR, 'span.content-type')
        if not article_type or article_type[0].text.strip() != 'Article':
            continue

        time_element = driver.find_elements(CSS_SELECTOR, 'time[datetime]')
        title_element = driver.find_elements(CSS_SELECTOR, 'h1')

        # Append to both lists in lockstep so the DataFrame columns always have
        # equal length, even when one of the two values is missing.
        publication_times.append(
            time_element[0].text.strip() if time_element else None
        )
        article_titles.append(
            title_element[0].text.strip() if title_element else None
        )
finally:
    # Always close the WebDriver, even if scraping failed part-way through,
    # so no headless Chrome process is left running.
    driver.quit()

# Create a DataFrame to store the publication time and article titles
data = pd.DataFrame({'Publication Time': publication_times, 'Article Title': article_titles})

# Save the data to an Excel file
data.to_excel('article_data.xlsx', index=False)

print("Data saved to 'article_data.xlsx'.")