Untitled

mail@pastecode.io avatar
unknown
python
2 months ago
1.5 kB
2
Indexable
Never
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Path to the chromedriver executable; change it to match your machine.
webdriver_path = "/Users/erichung/Desktop/chromedriver"

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (no GUI)

# Assigning executable_path on the options object has no effect on which
# driver binary is launched, so the path is handed to a Service instead.
service = Service(executable_path=webdriver_path)

# Start the browser session that the rest of the script shares
driver = webdriver.Chrome(service=service, options=options)

def get_plurk_data(max_links):
    """Collect unique Plurk post links from successive search-result pages.

    Searches Plurk for the keywords "1", "2", "3", ... in turn. Each result
    page is scrolled to the bottom five times (pausing one second after each
    scroll) before the ``href`` of every anchor matching ``a[href^="/p/"]``
    is read. Links already seen on an earlier page are skipped.

    The module-level ``driver`` is used for browsing and is always shut down
    before this function returns or raises, so it cannot be reused afterwards.

    Args:
        max_links: Number of unique links to collect. A value of zero or less
            yields an empty list without loading any page.

    Returns:
        A list of ``{'link': href}`` dicts in first-seen order. Collection
        stops as soon as the list holds at least ``max_links`` entries.

    Note:
        There is no upper bound on the number of searches: the loop keeps
        moving to the next keyword until enough links have been found.
    """
    plurk_links = []
    seen = set()
    try:
        # Nothing to collect; also prevents a loop that could never finish.
        if max_links <= 0:
            return plurk_links

        key_word = 1
        while True:
            driver.get("https://www.plurk.com/search?q=" + str(key_word))

            # Scroll down to load more content (adjust the count if needed)
            for _ in range(5):
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                time.sleep(1)  # Wait for the page to load

            # Find all links whose href starts with "/p/"
            for link in driver.find_elements(By.CSS_SELECTOR, 'a[href^="/p/"]'):
                plurk_link = link.get_attribute("href")
                if plurk_link in seen:
                    continue
                seen.add(plurk_link)
                plurk_links.append({'link': plurk_link})

                # ">=" rather than "==" so a non-integer limit still stops.
                if len(plurk_links) >= max_links:
                    return plurk_links

            key_word += 1
    finally:
        # Close the browser on every exit path, including Selenium errors.
        driver.quit()


# Number of links to collect
max_links = 500

# Gather the links from the search result pages
result = get_plurk_data(max_links)

# Print each collected link on its own line
for entry in result:
    print(entry['link'])
Leave a Comment