Untitled
unknown
plain_text
a year ago
2.9 kB
3
Indexable
Never
"""Scrape publication dates and titles from an EMBO Press journal page.

The script opens the journal landing page in Chrome (through
undetected-chromedriver), follows every ``article > a`` link found there,
keeps the pages whose ``span.content-type`` reads "Article", and writes the
collected publication time / title pairs to ``article_data.xlsx``.
"""

import time

import pandas as pd
import requests
import undetected_chromedriver as webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By

# Journal landing page that lists the articles.
BASE_URL = 'https://www.embopress.org/journal/17444292'
# Destination spreadsheet.
OUTPUT_PATH = 'article_data.xlsx'
# Seconds to wait after each navigation so the page can render.
PAGE_LOAD_DELAY = 1
# Seconds the initial visible browser session keeps the page open.
PREVIEW_HOLD_SECONDS = 20
# Upper bound for the plain HTTP status probe so it cannot hang forever.
HTTP_TIMEOUT_SECONDS = 30


def preview_journal_page(url=BASE_URL, hold_seconds=PREVIEW_HOLD_SECONDS):
    """Open ``url`` in a visible Chrome window, hold it, then close it.

    Any failure is printed rather than raised, so a problem here never
    prevents the scrape that follows.

    NOTE(review): this session shares no state with the headless session
    used for scraping; presumably it is a manual check that the site loads.
    Confirm whether it is still needed.
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(hold_seconds)
    except Exception as ex:  # best-effort step: report and carry on
        print(ex)
    finally:
        # Only tear down a browser that was actually started; quit() also
        # closes the window, so a separate close() is unnecessary.
        if driver is not None:
            driver.quit()


def report_http_status(url=BASE_URL):
    """Print the status code a plain ``requests`` GET receives for ``url``.

    This is a diagnostic only: the request is made by ``requests``, not by
    the browser, even though the printed label says "Chrome".
    """
    try:
        response = requests.get(url, timeout=HTTP_TIMEOUT_SECONDS)
    except requests.RequestException as ex:
        # The probe is informational; do not abort the scrape because of it.
        print(ex)
        return
    chrome_response_code = response.status_code
    print(f"Chrome Response Status Code: {chrome_response_code}")


def collect_article_urls(driver, url=BASE_URL):
    """Return the ``href`` of every ``article > a`` link on ``url``.

    The URLs are read up front because element handles become stale as soon
    as the browser navigates away from the listing page.
    """
    driver.get(url)
    time.sleep(PAGE_LOAD_DELAY)

    urls = []
    for link in driver.find_elements(By.CSS_SELECTOR, 'article > a'):
        href = link.get_attribute('href')
        if href:
            urls.append(href)
    return urls


def read_article(driver, url):
    """Return ``(publication_time, title)`` for ``url``, or ``None``.

    ``None`` is returned when the page is not marked as an "Article" or when
    any of the expected elements is missing, so a record is only produced
    when both values are available.
    """
    driver.get(url)
    time.sleep(PAGE_LOAD_DELAY)

    try:
        content_type = driver.find_element(By.CSS_SELECTOR, 'span.content-type')
        if content_type.text.strip() != 'Article':
            return None
        time_text = driver.find_element(By.CSS_SELECTOR, 'time[datetime]').text.strip()
        title_text = driver.find_element(By.CSS_SELECTOR, 'h1').text.strip()
    except NoSuchElementException:
        # The page does not have the expected layout; skip it.
        return None
    return time_text, title_text


def scrape_articles(url=BASE_URL):
    """Return a list of ``(publication_time, title)`` pairs found via ``url``.

    Runs a headless Chrome session and always shuts it down, even if
    navigation fails part-way through.
    """
    chrome_options = ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_driver = webdriver.Chrome(options=chrome_options)

    records = []
    try:
        for article_url in collect_article_urls(chrome_driver, url):
            record = read_article(chrome_driver, article_url)
            if record is not None:
                records.append(record)
    finally:
        chrome_driver.quit()
    return records


def save_records(records, path=OUTPUT_PATH):
    """Write ``records`` to ``path`` as an Excel sheet with two columns."""
    data = pd.DataFrame(records, columns=['Publication Time', 'Article Title'])
    data.to_excel(path, index=False)
    print("Data saved to 'article_data.xlsx'.")


def main():
    """Run the preview, the HTTP probe, the scrape, and the export in order."""
    preview_journal_page()
    report_http_status()
    save_records(scrape_articles())


if __name__ == "__main__":
    main()