Untitled
unknown
python
3 years ago
2.4 kB
3
Indexable
"""Scrape Weibo search result pages and print each post's author and text."""
import sys
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup

# Request headers sent with every search request.
# NOTE(review): the Cookie value is a hard-coded session captured from a
# browser; it will presumably expire and should be supplied from configuration
# rather than source control -- confirm with the owner.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Cookie': 'SINAGLOBAL=3625514568571.675.1640344823601; SCF=AsXjuW2rI4VE2PIypVl2t7ClRwc6JwQS7_uj3CW_vc5At-WtcSaBANQcueWOdC1tC8WpBIp4vqgpHuiF3rxYmlY.; _s_tentry=www.google.com; Apache=495808330753.0181.1640930548458; ULV=1640930548468:5:5:3:495808330753.0181.1640930548458:1640863361595; SUB=_2AkMWkjCrdcPxrAVZn_gdxWrkZIpH-jylR1ldAn7uJhMyAxh77m0xqSVutBF-XHLEQDUT5mLijjSfTMa398VIFefh; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFGB2BwOD_hvI-IGQ8Y7GCU5JpVF02RS0201hqNe0n4',
}


def _card_text(card):
    """Return the post text of a result card, or None if it has no text node.

    The expanded node ('feed_list_content_full') is preferred; the plain
    'feed_list_content' node is used only when the expanded one is absent.
    """
    node = card.find("p", attrs={'node-type': 'feed_list_content_full'})
    if node is None:
        node = card.find("p", attrs={'node-type': 'feed_list_content'})
    return None if node is None else node.text


def get_data(search_url):
    """Fetch one search result page and print the author and text of each post.

    Args:
        search_url: Fully formed search URL for a single result page.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    resp = requests.get(search_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(resp.text, 'lxml')

    for card in soup.find_all("div", attrs={'class': 'card'}):
        # Author link; cards without one are not posts, so skip just that card
        # instead of abandoning the rest of the page.
        name = card.find("a", attrs={'class': 'name'}, href=True)
        if name is None:
            continue

        content = _card_text(card)
        if content is None:
            # Neither content node is present; nothing to print for this card.
            continue

        print(name.text, content)


if __name__ == "__main__":
    key = urllib.parse.quote("气候变化")
    time_span = "2019-06-01:2021-12-31"

    # NOTE(review): iteration starts at page 2, so page 1 is never fetched --
    # confirm this is intended.
    for i in range(2, 51):
        search_url = "https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}&Refer=g&page={}".format(
            key, time_span, i)
        time.sleep(1)  # throttle: one request per second
        print("第 {} 页".format(i), search_url)
        get_data(search_url)
Editor is loading...