Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
1.1 kB
3
Indexable
import re
from urllib.parse import urljoin
from urllib.request import Request, urlopen
from urllib.request import build_opener, HTTPCookieProcessor

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
# Fetch the start page, gather the href of every anchor on it, then visit up
# to MAX_LINKS of those links and collect every whitespace-separated word
# found inside each linked page's <body> into ``list_words``.
START_URL = "https://www.deu.edu.tr//"
MAX_LINKS = 360  # upper bound on how many start-page links are visited
TIMEOUT = 30  # seconds, applied to every HTTP request

hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(START_URL, headers=hdr)
# ``with`` closes the connection as soon as the page has been parsed.
with urlopen(req, timeout=TIMEOUT) as html_page:
    # Final URL after any redirects; relative hrefs are resolved against it.
    base_url = html_page.geturl()
    soup = BeautifulSoup(html_page, "lxml")

list_words = []
# One entry per <a> tag; the entry is None when the tag has no href attribute.
links = [link.get('href') for link in soup.find_all('a')]
print(links)
if links:  # the page may contain no anchors at all
    print(links[0])

opener = build_opener(HTTPCookieProcessor())
flag = True  # not read in this section; kept in case later code relies on it
# Slicing (rather than range(MAX_LINKS)) avoids an IndexError when the page
# has fewer anchors than the limit.
for i, href in enumerate(links[:MAX_LINKS]):
    print(i)
    if href == '#':
        continue
    try:
        if not href:
            raise ValueError("anchor has no usable href")
        # urljoin leaves absolute URLs untouched and makes relative ones
        # (e.g. "/page" or "page.html") fetchable.
        req = Request(urljoin(base_url, href), headers=hdr)
        with opener.open(req, timeout=TIMEOUT) as response:
            content = response.read()
        soup = BeautifulSoup(content, "lxml")
        tag = soup.body
        if tag is None:
            raise ValueError("document has no <body>")
        for string in tag.strings:
            list_words.extend(string.split())
    except Exception:
        # Best-effort crawl: any fetch/parse failure just skips this link.
        # ``Exception`` (not a bare ``except``) lets Ctrl-C and SystemExit
        # propagate instead of being swallowed.
        print("Kelime yok")