Calibre

 avatar
unknown
python
a year ago
1.8 kB
56
No Index
#!/usr/bin/env python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from bs4 import BeautifulSoup

class AdvancedUserRecipe1730916914(BasicNewsRecipe):
    """Calibre recipe that assembles 'Revista do Vitor' from the configured feeds.

    The recipe flattens table markup in the article HTML (the table-related
    tags are removed while their contents are kept) and optionally dumps the
    HTML it receives to a file for debugging.
    """

    title = 'Revista do Vitor'
    oldest_article = 5
    max_articles_per_feed = 50
    auto_cleanup = True
    no_stylesheets = False
    encoding = 'utf-8'
    # NOTE(review): not obviously a BasicNewsRecipe option -- confirm it has
    # any effect before relying on it.
    use_pagination = True
    # Original note: allows embedded content such as external images.
    use_embedded_content = True
    language = 'pt_BR'

    feeds = [
        ('Canal Meio', 'https://kill-the-newsletter.com/feeds/HASH.xml'),
        # other feeds
    ]

    # Where preprocess_html() dumps the HTML it receives. The file is
    # overwritten on every call. Set to None to disable the dump.
    debug_html_path = '/tmp/debug.html'

    # Tags that are unwrapped (tag removed, children kept) by preprocess_html().
    _TABLE_TAGS = ['table', 'thead', 'tbody', 'tr', 'td', 'tfoot']

    # Feed names whose articles get flattened in get_article_data().
    _FLATTENED_FEEDS = frozenset(['AIDrop', 'TechDrop', 'MoneyDrop', 'Canal Meio'])

    def _dump_debug_html(self, soup):
        """Best-effort write of *soup* to ``debug_html_path``.

        A failure to write (missing directory, read-only location, ...) is
        logged and otherwise ignored so that a debugging aid can never abort
        the processing of an article.
        """
        path = self.debug_html_path
        if not path:
            return
        try:
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent the characters found in the article.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(str(soup))
        except OSError as err:
            self.log.warning('Could not write debug HTML to %s: %s' % (path, err))

    def preprocess_html(self, soup):
        """Flatten table markup in *soup* and return the same soup object.

        The ``table``, ``thead``, ``tbody``, ``tr``, ``td`` and ``tfoot`` tags
        are removed while everything inside them is kept in place. Before any
        modification the incoming HTML is dumped to ``debug_html_path`` (when
        that attribute is set).
        """
        self._dump_debug_html(soup)
        # find_all() collects its matches up front, so unwrapping while
        # iterating is safe; unwrap() drops the tag but keeps its children.
        for tag in soup.find_all(self._TABLE_TAGS):
            tag.unwrap()
        return soup

    def get_article_data(self, article, soup):
        """Flatten tables for the known feeds, then defer to the base class.

        *article* is read with ``.get('feed_name', '')``; when that name is one
        of ``_FLATTENED_FEEDS`` the soup is run through preprocess_html()
        before being handed to the parent implementation.

        NOTE(review): confirm that BasicNewsRecipe actually defines and calls
        ``get_article_data``; if it does not, the super() call below raises
        AttributeError whenever this method is invoked.
        """
        feed_title = article.get('feed_name', '')
        if feed_title in self._FLATTENED_FEEDS:
            soup = self.preprocess_html(soup)
        return super(AdvancedUserRecipe1730916914, self).get_article_data(article, soup)

    def fetch_article(self, url, *args, **kwargs):
        """Fetch the article at *url* through the base class and return its result.

        Any extra positional or keyword arguments are forwarded unchanged to
        the parent implementation, so the override keeps working whether it is
        called with only *url* or with the additional arguments the base class
        method accepts.
        """
        article = super(AdvancedUserRecipe1730916914, self).fetch_article(url, *args, **kwargs)
        if self.use_embedded_content:
            # NOTE(review): verify these two helpers and their argument order
            # against the installed calibre version -- ``fetch_embedded_images``
            # in particular is not defined anywhere in this file.
            self.fetch_embedded_article(self.output_dir, self.log, article, len(self.feeds))
            self.fetch_embedded_images(self.output_dir, self.log, article)
        return article
Editor is loading...
Leave a Comment