Untitled

mail@pastecode.io avatar
unknown
python
a month ago
2.8 kB
5
Indexable
Never
import os
import re
import requests
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class PoetryScraper:
    """Scrape author listings and poems from stihi.ru, saving each poem to a file."""

    def __init__(self, base_url='https://stihi.ru'):
        """Create a scraper.

        :param base_url: site root; author and poem links are resolved against it.
        """
        self.base_url = base_url
        # Browser-like headers with a randomized User-Agent to reduce the
        # chance of the site blocking automated requests.
        self.headers = {
            'Accept': '*/*',
            'Connection': 'keep-alive',
            'User-Agent': UserAgent().random,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US;q=0.5,en;q=0.3',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Referer': 'https://google.com',
            'Pragma': 'no-cache'
        }

    def get_page(self, url, timeout=30):
        """Download *url* and return its HTML as text.

        Sleeps 1 second after each request as a crude rate limit.

        :param url: page to fetch.
        :param timeout: per-request timeout in seconds (new keyword, default 30).
        :raises requests.HTTPError: on 4xx/5xx responses.
        """
        # Fix: the original call had no timeout, so a stalled connection could
        # hang the scraper indefinitely.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fix: without this, HTTP error pages were silently saved as poem text.
        response.raise_for_status()
        time.sleep(1)  # be polite to the server
        return response.text

    def extract_author_links(self, page_content):
        """Return absolute author-page URLs found in the page's first <ul>.

        :param page_content: HTML of an author-listing page.
        """
        soup = BeautifulSoup(page_content, 'lxml')
        author_list = soup.find('ul')
        # str(None) == 'None' yields no regex matches, so a page without a
        # <ul> safely produces an empty list.
        authors = re.findall(r'href="/avtor/([^<>]+)">', str(author_list))
        return [f'{self.base_url}/avtor/{author}' for author in authors]

    def extract_poems_links(self, page_content):
        """Return absolute poem URLs from an author page.

        :param page_content: HTML of an author's page; poem anchors carry
            class="poemlink" with site-relative hrefs.
        """
        soup = BeautifulSoup(page_content, 'lxml')
        poem_anchors = soup.find_all(class_='poemlink')
        links = re.findall(r'href="([^<>]+)"', str(poem_anchors))
        return [f'{self.base_url}{link}' for link in links]

    def extract_text_of_poem(self, page_content):
        """Return the poem's text, with markup stripped and lines joined by spaces.

        :param page_content: HTML of a single poem page; the poem body lives
            in an element with class="text".
        """
        soup = BeautifulSoup(page_content, 'lxml')
        poem_node = soup.find(class_='text')
        # Collect all text fragments between tags; joining with spaces
        # flattens the poem into a single line, matching previous behavior.
        return ' '.join(re.findall(r'>([^<>]+)<', str(poem_node)))

    def download_poems(self, author_url, output_path):
        """Download every poem of one author into output_path/<author-slug>/.

        :param author_url: absolute URL of the author's page.
        :param output_path: directory under which the author's folder is created.
        """
        author_page_content = self.get_page(author_url)
        author_dir = os.path.join(output_path, author_url.split('/')[-1])
        os.makedirs(author_dir, exist_ok=True)

        for poem_link in self.extract_poems_links(author_page_content):
            poem_page_content = self.get_page(poem_link)
            poem_name = poem_link.split('/')[-1]
            text_of_poem = self.extract_text_of_poem(poem_page_content)

            with open(os.path.join(author_dir, f'{poem_name}.txt'), 'w', encoding='utf-8') as f:
                f.write(text_of_poem)

    def scrape_authors(self, url, output_path):
        """Scrape all authors listed on *url*, saving their poems under output_path.

        :param url: author-listing page (e.g. .../authors/editor.html).
        :param output_path: root directory for the downloaded poems.
        """
        authors_page_content = self.get_page(url)
        for author_url in self.extract_author_links(authors_page_content):
            self.download_poems(author_url, output_path)


# Fix: guard the entry point so importing this module does not immediately
# start a network scrape as a side effect.
if __name__ == '__main__':
    scraper = PoetryScraper()

    # Scrape authors from the "editor" listing page.
    scraper.scrape_authors('https://stihi.ru/authors/editor.html', 'editor')

    # Scrape authors from the "invitations" listing page.
    scraper.scrape_authors('https://stihi.ru/authors/invitations.html', 'invitations')
Leave a Comment