Untitled
unknown
python
a year ago
1.5 kB
4
Indexable
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Change your chromedriver path here.
webdriver_path = "/Users/erichung/Desktop/chromedriver"

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (no GUI)

# Open the browser. The chromedriver location is passed through a Service
# object: assigning it as an attribute on ChromeOptions is not how Selenium 4
# locates the driver binary, so the configured path would otherwise be ignored.
driver = webdriver.Chrome(
    service=Service(executable_path=webdriver_path),
    options=options,
)


def get_plurk_data(max_links, *, max_empty_searches=None):
    """Collect unique Plurk post links by searching for 1, 2, 3, ... in turn.

    For each integer keyword the search page is opened in the module-level
    ``driver``, scrolled to the bottom five times (one second apart) so more
    results can load, and every anchor whose ``href`` attribute starts with
    ``/p/`` is collected. Links already seen are skipped.

    The module-level ``driver`` is always quit before this function exits,
    whether it returns normally or raises, so it can only be called once per
    driver instance.

    Args:
        max_links: Number of unique links to collect. A value of zero or
            less returns an empty list without loading any page.
        max_empty_searches: Optional cap on the number of consecutive
            keywords that contribute no new link. When the cap is reached
            the links gathered so far are returned, even if fewer than
            ``max_links``. ``None`` (the default) keeps searching until
            ``max_links`` is reached.

    Returns:
        A list of ``{'link': href}`` dicts in discovery order, containing at
        most ``max_links`` entries.
    """
    plurk_links = []
    used = set()
    key_word = 1
    empty_searches = 0

    try:
        # Checking the size up front (rather than only after an append)
        # guarantees termination for max_links <= 0.
        while len(plurk_links) < max_links:
            driver.get(f"https://www.plurk.com/search?q={key_word}")

            # Scroll down to load more content (you may need to adjust the
            # scroll amount).
            for _ in range(5):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                time.sleep(1)  # Wait for the page to load

            # Find all links containing "/p/".
            links = driver.find_elements(By.CSS_SELECTOR, 'a[href^="/p/"]')

            found_new = False
            for link in links:
                plurk_link = link.get_attribute("href")
                # Skip anchors without a usable href and links already seen.
                if not plurk_link or plurk_link in used:
                    continue
                used.add(plurk_link)
                plurk_links.append({'link': plurk_link})
                found_new = True
                if len(plurk_links) >= max_links:
                    return plurk_links

            if found_new:
                empty_searches = 0
            else:
                empty_searches += 1
                if (
                    max_empty_searches is not None
                    and empty_searches >= max_empty_searches
                ):
                    break

            key_word += 1

        return plurk_links
    finally:
        # Release the browser on every exit path, including exceptions.
        driver.quit()


# Number of links to collect.
max_links = 500

# Collect the links.
result = get_plurk_data(max_links)

# Print the results.
for link in result:
    print(link['link'])
Editor is loading...
Leave a Comment