Untitled
unknown
python
2 years ago
2.8 kB
10
Indexable
import logging
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class PoetryScraper:
    """Download poems linked from author listing pages into per-author folders.

    The scraper fetches a listing page, collects the ``/avtor/...`` links found
    in the first ``<ul>`` of that page, then for every author saves the text of
    each element marked with the ``poemlink`` class into its own ``.txt`` file.

    Attributes:
        base_url: Site root that relative links are resolved against.
        timeout: Seconds to wait for a response before giving up.
        delay: Seconds to pause after every request.
        headers: Extra HTTP headers sent with every request.
        session: Shared ``requests.Session`` so connections are reused.
    """

    _log = logging.getLogger(__name__)

    def __init__(self, base_url='https://stihi.ru', *, timeout=10, delay=1):
        """Create a scraper.

        Args:
            base_url: Site root used to turn relative links into absolute ones.
            timeout: Per-request timeout in seconds.
            delay: Pause in seconds after each request.
        """
        self.base_url = base_url
        self.timeout = timeout
        self.delay = delay
        # No explicit Accept-Encoding: requests advertises only the encodings
        # it can actually decode. Hard-coding 'br' yields undecodable bodies
        # when no Brotli library is installed.
        self.headers = {
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'User-Agent': UserAgent().random,
            'Accept-Language': 'en-US;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://google.com',
            'Pragma': 'no-cache',
        }
        self.session = requests.Session()

    @staticmethod
    def _safe_name(name):
        """Return *name* reduced to characters that are safe in a file name."""
        cleaned = re.sub(r'[^\w.-]+', '_', name).strip('_')
        # '.' and '..' would escape or alias the output directory.
        if cleaned in ('', '.', '..'):
            return 'unnamed'
        return cleaned

    def _absolute_url(self, href):
        """Resolve *href* against ``base_url`` unless it is already absolute."""
        if href.startswith(('http://', 'https://')):
            return href
        return f'{self.base_url}{href}'

    def _poem_name(self, poem_link):
        """Build a file stem from the whole path of *poem_link*.

        Using only the last path segment would let two links that differ
        earlier in the path overwrite each other's file.
        """
        if poem_link.startswith(self.base_url):
            path = poem_link[len(self.base_url):]
        else:
            path = poem_link.split('://', 1)[-1]
        return self._safe_name(path.strip('/').replace('/', '-'))

    def get_page(self, url):
        """Fetch *url* and return the decoded response body.

        Args:
            url: Absolute URL to request.

        Returns:
            The response body as text.

        Raises:
            requests.RequestException: On connection problems, timeouts, or a
                4xx/5xx status code.
        """
        response = self.session.get(
            url, headers=self.headers, timeout=self.timeout
        )
        # Pause before anything can raise so failed requests are throttled too.
        time.sleep(self.delay)
        response.raise_for_status()
        # Without a declared charset requests assumes ISO-8859-1 for text
        # responses, which garbles non-Latin pages; detect it instead.
        if 'charset' not in response.headers.get('Content-Type', '').lower():
            response.encoding = response.apparent_encoding
        return response.text

    def extract_author_links(self, page_content):
        """Return absolute author URLs found in the first ``<ul>`` of a page.

        Args:
            page_content: HTML of an author listing page.

        Returns:
            De-duplicated list of ``{base_url}/avtor/...`` URLs in page order;
            empty when the page has no ``<ul>``.
        """
        soup = BeautifulSoup(page_content, 'lxml')
        listing = soup.find('ul')
        if listing is None:
            return []
        prefix = '/avtor/'
        hrefs = (anchor['href'] for anchor in listing.find_all('a', href=True))
        # dict.fromkeys drops repeats while keeping first-seen order.
        unique = dict.fromkeys(
            href for href in hrefs
            if href.startswith(prefix) and len(href) > len(prefix)
        )
        return [f'{self.base_url}{href}' for href in unique]

    def extract_poems_links(self, page_content):
        """Return absolute URLs of every ``poemlink`` element on a page.

        Args:
            page_content: HTML of an author page.

        Returns:
            De-duplicated list of poem URLs in page order.
        """
        soup = BeautifulSoup(page_content, 'lxml')
        links = []
        for tag in soup.find_all(class_='poemlink'):
            # The class normally sits on the anchor itself; fall back to
            # nested elements carrying an href when it does not.
            if tag.has_attr('href'):
                carriers = [tag]
            else:
                carriers = tag.find_all(href=True)
            links.extend(
                self._absolute_url(carrier['href'])
                for carrier in carriers
                if carrier['href']
            )
        return list(dict.fromkeys(links))

    def extract_text_of_poem(self, page_content):
        """Return the text inside the first element with class ``text``.

        Text nodes are joined with a single space. They are read from the
        parsed tree rather than from serialized HTML, so entities such as
        ``&amp;`` come out as the characters they stand for.

        Args:
            page_content: HTML of a poem page.

        Returns:
            The poem text, or an empty string when no such element exists.
        """
        soup = BeautifulSoup(page_content, 'lxml')
        container = soup.find(class_='text')
        if container is None:
            return ''
        return ' '.join(container.strings)

    def download_poems(self, author_url, output_path):
        """Save every poem linked from *author_url* under *output_path*.

        A sub-directory named after the last path segment of *author_url* is
        created, and each poem is written there as a UTF-8 ``.txt`` file.
        Poems whose page cannot be fetched are logged and skipped.

        Args:
            author_url: Absolute URL of the author's page.
            output_path: Directory that receives the per-author folder.

        Raises:
            requests.RequestException: If the author page itself cannot be
                fetched.
        """
        author_page_content = self.get_page(author_url)
        # rstrip so a trailing slash does not yield an empty directory name.
        author_slug = self._safe_name(author_url.rstrip('/').split('/')[-1])
        author_dir = os.path.join(output_path, author_slug)
        os.makedirs(author_dir, exist_ok=True)
        for poem_link in self.extract_poems_links(author_page_content):
            try:
                poem_page_content = self.get_page(poem_link)
            except requests.RequestException as exc:
                self._log.warning('Skipping poem %s: %s', poem_link, exc)
                continue
            text_of_poem = self.extract_text_of_poem(poem_page_content)
            target = os.path.join(
                author_dir, f'{self._poem_name(poem_link)}.txt'
            )
            with open(target, 'w', encoding='utf-8') as f:
                f.write(text_of_poem)

    def scrape_authors(self, url, output_path):
        """Download the poems of every author listed on the page at *url*.

        Authors whose page cannot be fetched are logged and skipped so one
        failure does not abort the rest of the run.

        Args:
            url: Absolute URL of an author listing page.
            output_path: Directory that receives the per-author folders.

        Raises:
            requests.RequestException: If the listing page cannot be fetched.
        """
        authors_page_content = self.get_page(url)
        for author_url in self.extract_author_links(authors_page_content):
            try:
                self.download_poems(author_url, output_path)
            except requests.RequestException as exc:
                self._log.warning('Skipping author %s: %s', author_url, exc)
scraper = PoetryScraper()

# Scrape each author listing page into its own output directory.
for listing_url, target_dir in (
    # Authors from the first page
    ('https://stihi.ru/authors/editor.html', 'editor'),
    # Authors from the second page
    ('https://stihi.ru/authors/invitations.html', 'invitations'),
):
    scraper.scrape_authors(listing_url, target_dir)
Editor is loading...
Leave a Comment