# Code:
class AdvancedUserRecipe1289098587(BasicNewsRecipe):
    """Calibre recipe that collects Spanish TV-guide RSS feeds.

    Every attribute is assigned exactly once.  The values below are the
    ones that were actually in effect: where a setting used to be assigned
    several times in the class body, only the last assignment was ever
    visible to calibre, so the overridden values have been dropped.
    """

    # --- Publication metadata -------------------------------------------
    title = u'GUIA PROGRAMACION TV ejemplo'
    __author__ = ' KRorschachZ.'
    description = 'Tv rss'
    language = 'es_ES'
    timefmt = ' [%d %b, %Y]'

    # --- Download limits ------------------------------------------------
    oldest_article = 1
    max_articles_per_feed = 150

    # --- Feeds: (section title, RSS URL) pairs --------------------------
    feeds = [
        (u'Noticias TV', u'http://www.sincroguia.tv/rss/rss.php?types=news'),
        (u'TVE 1', u'http://www.miguiatv.com/rss/tve1.xml'),
        (u'TVE 2', u'http://www.miguiatv.com/rss/la2.xml'),
        (u'ANT 3', u'http://www.miguiatv.com/rss/antena3.xml'),
        (u'Cuatro TV', u'http://www.miguiatv.com/rss/cuatro.xml'),
        (u'Tele 5', u'http://www.miguiatv.com/rss/telecinco.xml'),
        (u'La Sexta', u'http://www.miguiatv.com/rss/la-sexta.xml'),
        (u'Peliculas', u'http://www.laguiatv.com/rss/feeds/peliculas.xml'),
    ]

    # --- Styling --------------------------------------------------------
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['width', 'height']
    extra_css = (
        '\n'
        'h2{font-family: serif; font-size: small; font-weight: bold; '
        'color: #000000; text-align: justify}\n'
    )

    # --- Page clean-up --------------------------------------------------
    remove_tags_before = dict(id='article')
    remove_tags_after = dict(name='div', attrs={'id': 'sidebar'})
    remove_tags = [
        dict(name=['object', 'link', 'script', 'ul']),
        dict(name='div', attrs={'id': [
            'scrAdSense',
            'herramientas2',
            'participacion',
            'participacion2',
            'bloque1resultados',
            'bloque2resultados',
            'cont_vinyetesAnt',
            'tinta',
            'noticiasSuperior',
            'cintillopublicidad2',
        ]}),
        dict(name='p', attrs={'class': ['masinformacion', 'hora']}),
        # NOTE(review): the class value below contains literal single
        # quotes ("'link'").  Presumably plain 'link' was intended; confirm
        # against the page markup before changing it.
        dict(name='a', attrs={'class': ["'link'"]}),
        dict(name='div', attrs={'class': [
            'addthis_toolbox addthis_default_style',
            'firma',
            'pretitularnoticia',
        ]}),
        dict(name='form', attrs={'id': ['formularioDeBusquedaAvanzada']}),
    ]

    def preprocess_html(self, soup):
        """Strip the inline ``style`` attribute from every tag that has one.

        :param soup: parsed page object exposing ``findAll``.
        :return: the same ``soup`` object, modified in place.
        """
        for tag in soup.findAll(style=True):
            del tag['style']
        return soup