# Paste-site page metadata captured with the script (not part of the program):
# Untitled
# unknown
# plain_text
# 2 years ago
# 2.9 kB
# 12
# Indexable
import time

import pandas as pd
import requests
import undetected_chromedriver as webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
# Preliminary visit to the journal page in a visible (non-headless) browser.
# NOTE(review): the 20-second pause presumably gives an anti-bot/interstitial
# check time to complete -- confirm whether this warm-up is still required.
driver = None
try:
    # The module is imported as ``webdriver``; the class must be instantiated
    # (the original referenced an undefined name and never called it).
    driver = webdriver.Chrome()
    driver.get("https://www.embopress.org/journal/17444292")
    time.sleep(20)
except Exception as ex:
    # Best-effort step: report the problem and carry on with the script.
    print(ex)
finally:
    # Only tear down a browser that was actually started; quit() closes every
    # window and ends the session, so a separate close() is unnecessary.
    if driver is not None:
        driver.quit()
# Journal landing page whose article list is scraped
base_url = 'https://www.embopress.org/journal/17444292'

# Parallel lists: entry i of each list describes the same article
publication_times = []
article_titles = []

# Probe the URL with a plain HTTP request and report the status code.
# This runs before the browser is launched so that a network failure cannot
# leave a Chrome process behind, and it uses a timeout so it cannot hang
# forever. Note that the code printed here comes from ``requests``, not from
# the Chrome session.
response = requests.get(base_url, timeout=30)
chrome_response_code = response.status_code
print(f"Chrome Response Status Code: {chrome_response_code}")

# Configure Chrome options to run headless (without opening a visible browser window)
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')

# Initialize Chrome WebDriver using undetected-chromedriver (imported as ``webdriver``)
chrome_driver = webdriver.Chrome(options=chrome_options)

# Visit the URL with the Chrome WebDriver
chrome_driver.get(base_url)

# Add a delay of 1 second to allow the page to load
time.sleep(1)
# Scrape every linked article, then always shut the browser down.
try:
    # Find all article links on the listing page.
    article_links = chrome_driver.find_elements(By.CSS_SELECTOR, 'article > a')

    # Capture the target URLs up front: once the browser navigates away from
    # the listing page these WebElement handles become stale, so clicking them
    # one after another (with history-back in between) is unreliable.
    article_urls = [
        href
        for href in (link.get_attribute('href') for link in article_links)
        if href
    ]

    for article_url in article_urls:
        try:
            chrome_driver.get(article_url)
            # Add a delay of 1 second to allow the article page to load
            time.sleep(1)

            # Only pages whose content type is exactly "Article" are recorded.
            article_type = chrome_driver.find_element(By.CSS_SELECTOR, 'span.content-type')
            if article_type.text.strip() != 'Article':
                continue

            # find_element raises when nothing matches, so reaching the lines
            # below means both elements exist.
            time_text = chrome_driver.find_element(By.CSS_SELECTOR, 'time[datetime]').text.strip()
            title_text = chrome_driver.find_element(By.CSS_SELECTOR, 'h1').text.strip()
        except WebDriverException as ex:
            # Best-effort per article: report and move on instead of silently
            # swallowing every error (including KeyboardInterrupt).
            print(f"Skipping {article_url}: {type(ex).__name__}")
            continue

        # Append both values together so the two lists always stay the same
        # length; otherwise building the DataFrame below raises ValueError.
        publication_times.append(time_text)
        article_titles.append(title_text)
finally:
    # Close the Chrome WebDriver even if scraping failed part-way through
    chrome_driver.quit()

# Create a DataFrame to store the publication time and article titles
data = pd.DataFrame({'Publication Time': publication_times, 'Article Title': article_titles})

# Save the data to an Excel file
data.to_excel('article_data.xlsx', index=False)
print("Data saved to 'article_data.xlsx'.")
# Editor is loading...  (paste-site UI residue, not part of the script)