Untitled
unknown
python
2 years ago
1.6 kB
4
Indexable
import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# change your chromedriver path here
webdriver_path = "/Users/erichung/Desktop/chromedriver"

options = webdriver.ChromeOptions()
# options.add_argument("--headless") # Run in headless mode (no GUI)
# options.add_argument("--disable-gpu")

# The chromedriver location is configured through a Service object; assigning
# `executable_path` on ChromeOptions only creates an unused attribute, so the
# path above was previously ignored.
# Fall back to Selenium's default driver lookup when the configured file is
# missing, so the script still starts on machines without that exact path.
if os.path.isfile(webdriver_path):
    service = Service(executable_path=webdriver_path)
else:
    service = Service()

# Launch the browser session shared by the rest of the script.
driver = webdriver.Chrome(service=service, options=options)
def get_plurk_data(max_links):
    """Collect unique Plurk post links from numbered search pages.

    Search pages are requested for the queries "1", "2", "3", ... in order.
    Each page is scrolled to the bottom five times (one second apart) so that
    more results can load, then every anchor whose ``href`` attribute starts
    with ``/p/`` is inspected and its link recorded once.

    The module-level ``driver`` is used for all browser work and is always
    quit before this function exits, whether it returns or raises, so the
    function can only be called once per browser session.

    Args:
        max_links: Number of unique links to collect. Values less than or
            equal to zero yield an empty list without loading any page.

    Returns:
        A list of ``{'link': href}`` dicts in discovery order, containing at
        most ``max_links`` entries.

    Note:
        There is no upper bound on the number of search pages visited; if the
        pages never provide enough distinct links, the loop does not end.
    """
    plurk_links = []
    seen = set()
    key_word = 1
    try:
        # Testing "fewer than" (rather than "exactly equal") also terminates
        # for zero or negative targets instead of looping forever.
        while len(plurk_links) < max_links:
            driver.get("https://www.plurk.com/search?q=" + str(key_word))

            # Scroll down to load more content (adjust the count if needed).
            for _ in range(5):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                time.sleep(1)  # Wait for the page to load

            # Find all links whose href attribute starts with "/p/".
            anchors = driver.find_elements(By.CSS_SELECTOR, 'a[href^="/p/"]')
            for anchor in anchors:
                href = anchor.get_attribute("href")
                # Skip anchors without a usable href as well as duplicates.
                if not href or href in seen:
                    continue
                seen.add(href)
                plurk_links.append({'link': href})
                if len(plurk_links) >= max_links:
                    break

            key_word += 1
        return plurk_links
    finally:
        # Release the browser even when a Selenium call fails part-way.
        driver.quit()
# Number of links to collect.
max_links = 500

# Gather that many links from the search pages.
result = get_plurk_data(max_links=max_links)

# Show the results, one link per line.
for link in result:
    print(link["link"])
Editor is loading...
Leave a Comment