Untitled
unknown
plain_text
a year ago
1.1 kB
3
Indexable
Never
"""Collect the words found on pages linked from https://www.deu.edu.tr.

The script downloads the homepage, gathers the ``href`` of every ``<a>``
tag, then visits up to ``MAX_LINKS`` of those links and appends every
whitespace-separated token found in each page's ``<body>`` to
``list_words``.  Links that cannot be fetched or parsed are reported with
a short message and skipped.
"""
import re
from urllib.request import HTTPCookieProcessor, Request, build_opener, urlopen

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

# Header sent with every request (a browser-like User-Agent).
hdr = {'User-Agent': 'Mozilla/5.0'}
# Upper bound on how many homepage links are visited.
MAX_LINKS = 360
# Seconds to wait on a single request before giving up.
TIMEOUT = 30

req = Request("https://www.deu.edu.tr//", headers=hdr)
# The context manager closes the HTTP response once it has been parsed.
with urlopen(req, timeout=TIMEOUT) as html_page:
    soup = BeautifulSoup(html_page, "lxml")

list_words = []
# ``href`` is None for anchors that carry no such attribute.
links = [link.get('href') for link in soup.find_all('a')]
print(links)
if links:  # guard: the page may contain no anchors at all
    print(links[0])

# An opener with a cookie processor, reused for every linked page.
opener = build_opener(HTTPCookieProcessor())
flag = True  # NOTE(review): not read anywhere in the visible code

# Slicing (rather than indexing up to a fixed count) keeps the cap on the
# number of visited links without raising IndexError on shorter lists.
for i, url in enumerate(links[:MAX_LINKS]):
    print(i)
    if url == '#':
        continue
    try:
        req = Request(url, headers=hdr)
        with opener.open(req, timeout=TIMEOUT) as response:
            content = response.read()
        soup = BeautifulSoup(content, "lxml")
        # ``soup.body`` is None when the document has no <body>; the
        # resulting AttributeError is handled below like any other failure.
        tag = soup.body
        for string in tag.strings:
            list_words.extend(string.split())
    except Exception:
        # Best-effort crawl: any failure for one link (missing or
        # non-HTTP href, network error, timeout, unparsable page) is
        # reported and the loop moves on.  ``Exception`` instead of a bare
        # ``except`` lets Ctrl-C / SystemExit stop the script.
        print("Kelime yok")