# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# a year ago
# 2.9 kB
# 3
# Indexable
# Never
import time

import pandas as pd
import requests
import undetected_chromedriver as webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By

# Smoke test: open the journal page in a visible undetected-chromedriver
# session and keep it open for 20 seconds.
# `driver` starts as None so the cleanup in `finally` is safe even when the
# browser could not be started at all.
driver = None
try:
    # The module is imported under the alias `webdriver`, and Chrome must be
    # instantiated (called) to obtain a usable driver object.
    driver = webdriver.Chrome()
    driver.get("https://www.embopress.org/journal/17444292")
    time.sleep(20)
except Exception as ex:
    # Best-effort check: report the problem and carry on with the script.
    print(ex)
finally:
    if driver is not None:
        # quit() closes every window and ends the driver session, so a
        # separate close() call beforehand is unnecessary.
        driver.quit()


# Scrape the publication time and title of every entry whose content type is
# "Article" from the journal landing page, then save them to an Excel file.

# Define the URL
base_url = 'https://www.embopress.org/journal/17444292'

# Parallel lists: entry i of each list describes the same article, so values
# are only ever appended to both lists together.
publication_times = []
article_titles = []

# Probe the URL with the requests library before starting a browser. This is
# a plain HTTP check and does not go through Chrome. The timeout keeps the
# script from hanging forever on an unresponsive server.
response = requests.get(base_url, timeout=30)
chrome_response_code = response.status_code
print(f"Chrome Response Status Code: {chrome_response_code}")

# Configure Chrome options to run headless (without opening a visible browser window)
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')

# Initialize Chrome WebDriver using undetected-chromedriver (imported as `webdriver`)
chrome_driver = webdriver.Chrome(options=chrome_options)
try:
    # Visit the URL and give the page a moment to load
    chrome_driver.get(base_url)
    time.sleep(1)

    # Collect the link targets up front. WebElement handles become stale as
    # soon as the browser navigates away, so clicking each element in turn and
    # going back would fail after the first article.
    article_urls = [
        href
        for href in (
            anchor.get_attribute('href')
            for anchor in chrome_driver.find_elements(By.CSS_SELECTOR, 'article > a')
        )
        if href
    ]

    for article_url in article_urls:
        chrome_driver.get(article_url)

        # Add a delay of 1 second to allow the article page to load
        time.sleep(1)

        try:
            article_type = chrome_driver.find_element(By.CSS_SELECTOR, 'span.content-type')
            if article_type.text.strip() != 'Article':
                continue

            # find_element raises when nothing matches (it never returns
            # None), so both values are read before either is stored.
            time_text = chrome_driver.find_element(By.CSS_SELECTOR, 'time[datetime]').text.strip()
            title_text = chrome_driver.find_element(By.CSS_SELECTOR, 'h1').text.strip()
        except NoSuchElementException:
            # Page lacks one of the expected elements: skip it entirely so the
            # two result lists stay the same length.
            continue

        publication_times.append(time_text)
        article_titles.append(title_text)
finally:
    # Always shut the browser down, even if scraping failed part-way through
    chrome_driver.quit()

# Create a DataFrame to store the publication time and article titles
data = pd.DataFrame({'Publication Time': publication_times, 'Article Title': article_titles})

# Save the data to an Excel file
data.to_excel('article_data.xlsx', index=False)

print("Data saved to 'article_data.xlsx'.")