import os
import re
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class PoetryScraper:
    """Scrapes author pages and poem texts from stihi.ru."""

    def __init__(self, base_url='https://stihi.ru'):
        self.base_url = base_url
        # Browser-like headers with a randomized User-Agent to reduce the
        # chance of the request being rejected as an obvious bot.
        self.headers = {
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'User-Agent': UserAgent().random,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://google.com',
            'Pragma': 'no-cache'
        }

    def get_page(self, url):
        # Fetch a page and pause for one second as a simple politeness delay.
        req = requests.get(url, headers=self.headers)
        time.sleep(1)
        return req.text

    def extract_author_links(self, page_content):
        # Collect /avtor/<slug> links from the first <ul> on an authors list page.
        soup = BeautifulSoup(page_content, 'lxml')
        author_links = soup.find('ul')
        author_links = re.findall(r'href="/avtor/([^<>]+)">', str(author_links))
        return [f'{self.base_url}/avtor/{author}' for author in author_links]

    def extract_poems_links(self, page_content):
        # Collect links to individual poems from an author page.
        soup = BeautifulSoup(page_content, 'lxml')
        poems_links = soup.find_all(class_='poemlink')
        poems_links = re.findall(r'href="([^<>]+)"', str(poems_links))
        return [f'{self.base_url}{link}' for link in poems_links]

    def extract_text_of_poem(self, page_content):
        # Pull the poem body out of the element with class "text" and join the
        # text fragments into a single string.
        soup = BeautifulSoup(page_content, 'lxml')
        text_of_poem = soup.find(class_='text')
        return ' '.join(re.findall(r'>([^<>]+)<', str(text_of_poem)))

    def download_poems(self, author_url, output_path):
        # Save every poem linked from an author's page as a .txt file under
        # <output_path>/<author_slug>/.
        author_page_content = self.get_page(author_url)
        author_dir = os.path.join(output_path, author_url.split('/')[-1])
        os.makedirs(author_dir, exist_ok=True)
        poems_links = self.extract_poems_links(author_page_content)
        for poem_link in poems_links:
            poem_page_content = self.get_page(poem_link)
            poem_name = poem_link.split('/')[-1]
            text_of_poem = self.extract_text_of_poem(poem_page_content)
            with open(os.path.join(author_dir, f'{poem_name}.txt'), 'w', encoding='utf-8') as f:
                f.write(text_of_poem)

    def scrape_authors(self, url, output_path):
        # Walk an authors list page and download the poems of every listed author.
        authors_page_content = self.get_page(url)
        author_urls = self.extract_author_links(authors_page_content)
        for author_url in author_urls:
            self.download_poems(author_url, output_path)


scraper = PoetryScraper()

# Scrape authors from the first list page
scraper.scrape_authors('https://stihi.ru/authors/editor.html', 'editor')

# Scrape authors from the second list page
scraper.scrape_authors('https://stihi.ru/authors/invitations.html', 'invitations')
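A minimal single-author sketch, in case the full list pages are not needed. It assumes the stihi.ru markup still matches the selectors above; the author slug and the poems output directory below are hypothetical:

scraper = PoetryScraper()
# Hypothetical author URL; writes each linked poem to ./poems/<slug>/<poem>.txt
scraper.download_poems('https://stihi.ru/avtor/someauthor', 'poems')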