# Code:
from calibre.web.feeds.news import re
class ZonaDeAjedrez(BasicNewsRecipe):
    """Calibre news recipe for Zona de Ajedrez (www.zonadeajedrez.com).

    The articles are discovered by scraping: parse_index() walks a fixed
    list of section pages and nz_parse_section() collects the article links
    found on each of them.
    """

    # --- Recipe metadata -------------------------------------------------
    title = u'Zona de Ajedrez'
    __author__ = 'Jefferson Frantz'
    # Unicode literal (like ``title``) so the non-ASCII text is a text
    # string on Python 2 as well as Python 3.
    description = u'Portal de Ajedrez en español'
    timefmt = ' [%d %b, %Y]'
    language = 'es_ES'

    # --- Clean-up options ------------------------------------------------
    no_stylesheets = True
    remove_javascript = True
    extra_css = (
        ' .txt_articulo{ font-family: sans-serif; font-size: medium;'
        ' text-align: justify }'
        ' .contentheading{font-family: serif; font-size: large;'
        ' font-weight: bold; color: #000000; text-align: center}'
    )

    # Blank out everything from the first '<iframe width' up to the next
    # '</body>' (DOTALL lets the non-greedy match span several lines).
    preprocess_regexps = [
        (re.compile(r'<iframe width.*?</body>', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    # Tag filters, expressed as keyword dicts in the form calibre's
    # BasicNewsRecipe expects for these attributes.
    keep_only_tags = [dict(name='div', attrs={'id': ['ja-current-content']})]
    remove_tags = [
        dict(name=['object', 'link', 'script', 'ul', 'iframe', 'ol']),
        dict(name='span', attrs={'class': ['article-section']}),
        dict(name='span', attrs={'class': ['content_rating']}),
        dict(name='span', attrs={'class': ['content_vote']}),
        dict(name='span', attrs={'class': ['createby']}),
        dict(name='div', attrs={'class': ['dialog']}),
        dict(name='div', attrs={'id': ['jcWrapper']}),
        dict(name='div', attrs={'class': ['buttonheading']}),
        dict(name='div', attrs={'class': ['authordetails']}),
        dict(name='table', attrs={'class': ['pagenav']}),
        dict(name='div', attrs={'id': ['jc_commentFormDiv']}),
    ]
    remove_tags_after = dict(name='div', attrs={'id': 'sidebar'})

    def preprocess_html(self, soup):
        """Strip the inline ``style`` attribute from every tag that has one.

        The soup is modified in place and returned.
        """
        for styled_tag in soup.findAll(style=True):
            del styled_tag['style']
        return soup

    def nz_parse_section(self, url):
        """Collect the articles linked from one section page.

        ``url`` is the address of the section listing. Every element whose
        class is 'jazin-title' is searched for its first link carrying an
        href; entries without such a link, or whose href or link text is
        empty, are skipped.

        Returns a list of dicts with the keys 'title', 'url', 'description'
        and 'date'; the last two are always empty strings.
        """
        # Local import with fallback so the recipe needs no extra top-level
        # import and works on both Python 3 and Python 2 interpreters.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        soup = self.index_to_soup(url)
        current_articles = []
        for heading in soup.findAll(attrs={'class': ['jazin-title']}):
            link = heading.find('a', href=True)
            if link is None:
                continue
            article_title = self.tag_to_string(link)
            href = link.get('href', False)
            if not href or not article_title:
                continue
            # Resolve against the section page: root-relative hrefs get the
            # site prefix as before, absolute hrefs pass through, and
            # protocol-relative ('//host/x') or page-relative hrefs are now
            # made absolute too instead of yielding a broken URL.
            article_url = urljoin(url, href)
            self.log('\t\tFound article:', article_title)
            self.log('\t\t\t', article_url)
            current_articles.append({
                'title': article_title,
                'url': article_url,
                'description': '',
                'date': '',
            })
        return current_articles

    def parse_index(self):
        """Build the feed list from the site's section pages.

        Returns a list of ``(section title, articles)`` tuples, where
        ``articles`` is the list produced by nz_parse_section(). Sections
        that yield no articles are left out.
        """
        sections = [
            (u'Noticias',
             'http://www.zonadeajedrez.com/noticias/noticias.html'),
            (u'Reportajes',
             'http://www.zonadeajedrez.com/articulos/reportajes.html'),
            (u'Torneos',
             'http://www.zonadeajedrez.com/noticias/torneos.html'),
            (u'Artículos',
             'http://www.zonadeajedrez.com/articulos/articulos-de-opinion.html'),
            (u'Cursos',
             'http://www.zonadeajedrez.com/aprendizaje/cursos.html'),
            (u'Problemas',
             'http://www.zonadeajedrez.com/aprendizaje/problemas.html'),
            (u'Táctica',
             'http://www.zonadeajedrez.com/aprendizaje/tactica.html'),
            (u'Partidas Comentadas',
             'http://www.zonadeajedrez.com/aprendizaje/partidas-comentadas.html'),
        ]

        feeds = []
        for section_title, section_url in sections:
            articles = self.nz_parse_section(section_url)
            if articles:
                feeds.append((section_title, articles))
        return feeds