import sys
import time
import urllib.parse

import requests
from bs4 import BeautifulSoup


def get_data(search_url):
    """Fetch one Weibo search-result page and print user name + post text."""
    # Weibo search only serves full results to a logged-in session, so the
    # request carries a desktop User-Agent and a session cookie. The cookie
    # below is account-specific and expires; replace it with a fresh one
    # copied from a logged-in browser session.
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Cookie':
        'SINAGLOBAL=3625514568571.675.1640344823601; SCF=AsXjuW2rI4VE2PIypVl2t7ClRwc6JwQS7_uj3CW_vc5At-WtcSaBANQcueWOdC1tC8WpBIp4vqgpHuiF3rxYmlY.; _s_tentry=www.google.com; Apache=495808330753.0181.1640930548458; ULV=1640930548468:5:5:3:495808330753.0181.1640930548458:1640863361595; SUB=_2AkMWkjCrdcPxrAVZn_gdxWrkZIpH-jylR1ldAn7uJhMyAxh77m0xqSVutBF-XHLEQDUT5mLijjSfTMa398VIFefh; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFGB2BwOD_hvI-IGQ8Y7GCU5JpVF02RS0201hqNe0n4'
    }
    resp = requests.get(search_url, headers=headers, timeout=30)
    soup = BeautifulSoup(resp.text, 'lxml')
    # each search result lives in a <div class="card">
    cards = soup.find_all("div", attrs={'class': 'card'})
    for card in cards:
        # user name; cards without one (ads, notice banners) are skipped
        # rather than aborting the rest of the page
        name = card.find("a", attrs={'class': 'name'}, href=True)
        if name is None:
            continue
        name_ = name.text
        # full post text: long posts expose it under 'feed_list_content_full',
        # short posts only under 'feed_list_content'
        content_ = card.find("p", attrs={'node-type': 'feed_list_content_full'})
        if content_ is None:
            content_ = card.find("p", attrs={'node-type': 'feed_list_content'}).text
        else:
            content_ = content_.text
        print(name_, content_)
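

# --- Optional extension (a sketch, not part of the original script) ---
# get_data() only prints the scraped pairs; if they should be persisted,
# a minimal CSV appender could look like the following. The name `save_rows`
# and the default output path are illustrative assumptions.
def save_rows(rows, path="weibo_results.csv"):
    """Append (name, content) tuples to a CSV file."""
    import csv
    # append mode so repeated runs accumulate; utf-8-sig keeps Excel readable
    with open(path, "a", newline="", encoding="utf-8-sig") as f:
        csv.writer(f).writerows(rows)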


if __name__ == "__main__":
    key = "气候变化"  # search keyword: "climate change"
    time_span = "2019-06-01:2021-12-31"
    key = urllib.parse.quote(key)
    # Weibo search results are paginated; walk pages 2 through 50
    for i in range(2, 51):
        search_url = ("https://s.weibo.com/weibo?q={}&typeall=1&suball=1"
                      "&timescope=custom:{}&Refer=g&page={}").format(
                          key, time_span, i)
        time.sleep(1)  # throttle requests to stay under Weibo's rate limits
        print("Page {}".format(i), search_url)
        get_data(search_url)
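
# --- Optional hardening (a sketch under assumptions, not in the original) ---
# Weibo throttles aggressive clients. Instead of a bare requests.get, the
# script could share a Session with automatic retries and exponential
# backoff (standard requests/urllib3 API; the retry parameters here are
# illustrative guesses):
#
#     from requests.adapters import HTTPAdapter
#     from urllib3.util.retry import Retry
#
#     session = requests.Session()
#     retry = Retry(total=3, backoff_factor=2,
#                   status_forcelist=[429, 500, 502, 503, 504])
#     session.mount("https://", HTTPAdapter(max_retries=retry))
#     resp = session.get(search_url, headers=headers, timeout=30)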