Untitled

mail@pastecode.io avatar
unknown
python
2 years ago
2.4 kB
2
Indexable
Never
import requests
import time, sys
import urllib
from bs4 import BeautifulSoup


def get_data(search_url):
    """Fetch one Weibo search-result page and print each post's author and text.

    Parameters
    ----------
    search_url : str
        Full URL of an s.weibo.com search-result page.

    Side effects: performs one HTTP GET and prints one line per post card.
    Returns None.
    """
    headers = {
        # Desktop Chrome UA so the server serves the regular HTML page.
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        # Logged-in session cookie; anonymous requests are redirected by Weibo.
        # NOTE(review): this cookie expires — replace with a fresh one when
        # the scraper starts returning empty pages.
        'Cookie':
        'SINAGLOBAL=3625514568571.675.1640344823601; SCF=AsXjuW2rI4VE2PIypVl2t7ClRwc6JwQS7_uj3CW_vc5At-WtcSaBANQcueWOdC1tC8WpBIp4vqgpHuiF3rxYmlY.; _s_tentry=www.google.com; Apache=495808330753.0181.1640930548458; ULV=1640930548468:5:5:3:495808330753.0181.1640930548458:1640863361595; SUB=_2AkMWkjCrdcPxrAVZn_gdxWrkZIpH-jylR1ldAn7uJhMyAxh77m0xqSVutBF-XHLEQDUT5mLijjSfTMa398VIFefh; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFGB2BwOD_hvI-IGQ8Y7GCU5JpVF02RS0201hqNe0n4'
    }

    resp = requests.get(search_url, headers=headers, timeout=30)
    soup = BeautifulSoup(resp.text, 'lxml')

    # Each search result post lives in a <div class="card">.
    cards = soup.find_all("div", attrs={'class': 'card'})

    for card in cards:
        # Author link; absent on non-post cards. Original code stops at the
        # first such card — presumably post cards come first on the page.
        name = card.find("a", attrs={'class': 'name'}, href=True)
        if name is None:
            break
        name_ = name.text

        # Long posts carry the full text in 'feed_list_content_full';
        # short posts only have 'feed_list_content'.
        node = card.find("p", attrs={'node-type': 'feed_list_content_full'})
        if node is None:
            node = card.find("p", attrs={'node-type': 'feed_list_content'})
        if node is None:
            # Card has an author but no recognizable text node; skip rather
            # than crash on .text of None.
            continue
        content_ = node.text

        print(name_, content_)

    # next page


if __name__ == "__main__":

    # Search keyword (URL-encoded below) and the custom date window
    # accepted by Weibo's timescope parameter.
    keyword = urllib.parse.quote("气候变化")
    time_span = "2019-06-01:2021-12-31"

    # Walk result pages 2 through 50, pausing between requests.
    for page in range(2, 51):
        search_url = (
            f"https://s.weibo.com/weibo?q={keyword}&typeall=1&suball=1"
            f"&timescope=custom:{time_span}&Refer=g&page={page}"
        )

        time.sleep(1)
        print("第 {} 页".format(page), search_url)
        get_data(search_url)