# Paste-site metadata (not part of the recipe): Calibre | unknown | python | a year ago | 1.8 kB | 56 | No Index
#!/usr/bin/env python
# vim:fileencoding=utf-8
from calibre.web.feeds.news import BasicNewsRecipe
from bs4 import BeautifulSoup
class AdvancedUserRecipe1730916914(BasicNewsRecipe):
    """Calibre recipe that builds the 'Revista do Vitor' periodical.

    Articles come from the feeds listed in ``feeds``. Every page handed to
    ``preprocess_html`` has its table markup unwrapped, so the content that
    used to sit inside table cells stays in the document without the
    ``<table>`` scaffolding around it.
    """

    title = 'Revista do Vitor'
    oldest_article = 5
    max_articles_per_feed = 50
    auto_cleanup = True
    no_stylesheets = False
    encoding = 'utf-8'
    # NOTE(review): `use_pagination` does not appear to be a documented
    # BasicNewsRecipe option -- confirm it has any effect.
    use_pagination = True
    # Treat the feed entries as carrying the full article content (see the
    # calibre documentation for `use_embedded_content`).
    use_embedded_content = True
    language = 'pt_BR'

    feeds = [
        ('Canal Meio', 'https://kill-the-newsletter.com/feeds/HASH.xml'),
        # other feeds
    ]

    # Where preprocess_html() drops a copy of the page it is working on.
    _DEBUG_DUMP_PATH = '/tmp/debug.html'

    # Feed names whose articles get the table clean-up in get_article_data().
    _TABLE_CLEANUP_FEEDS = ('AIDrop', 'TechDrop', 'MoneyDrop', 'Canal Meio')

    def preprocess_html(self, soup):
        """Dump the page for debugging, then strip its table markup.

        :param soup: parsed page; it is modified in place.
        :return: the same ``soup`` object, with every ``table``, ``thead``,
            ``tbody``, ``tr``, ``td`` and ``tfoot`` tag unwrapped (the tag is
            removed, its children are kept).
        """
        # The dump is a debugging aid only, so a failure to write it must not
        # abort the article. It is written as UTF-8 explicitly because the
        # locale's default encoding may not be able to represent the page.
        try:
            with open(self._DEBUG_DUMP_PATH, 'w', encoding='utf-8') as dump:
                dump.write(str(soup))
        except OSError as err:
            self.log.warning(
                'Could not write debug dump to %s: %s'
                % (self._DEBUG_DUMP_PATH, err)
            )

        # find_all() returns the complete match list up front, so unwrapping
        # while looping over it is safe; the order of unwrapping does not
        # change the resulting tree.
        for tag in soup.find_all(
            ['table', 'thead', 'tbody', 'tr', 'td', 'tfoot']
        ):
            tag.unwrap()
        return soup

    def get_article_data(self, article, soup):
        """Apply the table clean-up for selected feeds, then defer to the base.

        NOTE(review): nothing in this file calls this method and it is not
        obviously a BasicNewsRecipe hook -- confirm that the base class
        defines ``get_article_data`` and that calibre invokes it.
        """
        feed_title = article.get('feed_name', '')
        if feed_title in self._TABLE_CLEANUP_FEEDS:
            soup = self.preprocess_html(soup)
        return super(AdvancedUserRecipe1730916914, self).get_article_data(
            article, soup
        )

    def fetch_article(self, url):
        """Fetch the article through the base class, then pull embedded parts.

        NOTE(review): the argument lists used below (and the existence of
        ``fetch_embedded_images``) could not be checked against the base
        class from this file -- verify them against the installed calibre
        version before relying on this override.
        """
        article = super(AdvancedUserRecipe1730916914, self).fetch_article(url)
        if self.use_embedded_content:
            self.fetch_embedded_article(
                self.output_dir, self.log, article, len(self.feeds)
            )
            self.fetch_embedded_images(self.output_dir, self.log, article)
        return article
# (paste-site page footer, not part of the recipe: "Leave a Comment")