Thanks, Kovid! (for everything).
Quote:
Originally Posted by kovidgoyal
Try this
preprocess_regexps= [(re.compile(r'<!DOCTYPE[^>]+>', re.I), '')]
No luck with that either. But perhaps I'm not using it the proper way...
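In case the problem is where I put it, here is how I added it to the recipe. I copied the placement from the pattern I saw in other recipes, so this is just my guess:
Code:
import re

from calibre.web.feeds.news import BasicNewsRecipe


class Pagina12(BasicNewsRecipe):
    title = 'Pagina12'

    # Class attribute, at the same level as title and description:
    # strip any <!DOCTYPE ...> declaration before the page is parsed.
    preprocess_regexps = [
        (re.compile(r'<!DOCTYPE[^>]+>', re.IGNORECASE), lambda m: ''),
    ]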
Quote:
...and note that you can also define preprocess_raw_html() in your recipe to remove the doctype programmatically if you have trouble with regexps.
Mmmmmm... not sure how to use it exactly, and unfortunately I didn't find any example in the built-in recipes.
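From reading the manual I imagine it would look roughly like this, but this is only my guess at the usage:
Code:
    def preprocess_raw_html(self, raw_html, url):
        # Goes inside the recipe class, at the same level as parse_index():
        # strip the DOCTYPE from the raw page before calibre parses it.
        return re.compile(r'<!DOCTYPE[^>]+>', re.IGNORECASE).sub('', raw_html)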
Maybe I should clarify that until today I had zero experience with recipes, and I only know something about HTML and Javascript. But I did manage to make the recipe work with a local file, by manually removing the DOCTYPE declaration from the index file.
BTW, here's the recipe:
Spoiler:
Code:
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
pagina12.com.ar
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString


class Pagina12(BasicNewsRecipe):

    title = 'Pagina12'
    __author__ = 'Pablo Marfil'
    description = 'Diario argentino'
    #INDEX = 'http://www.pagina12.com.ar/diario/secciones/index.html'
    INDEX = 'file:///C:/Archivos%20de%20programa/Calibre2/pagina12.htm'
    language = 'es'
    encoding = 'cp1252'
    remove_tags_before = dict(id='fecha')
    remove_tags_after = dict(id='fin')
    remove_tags = [dict(id=['fecha', 'fin', 'pageControls'])]
    no_stylesheets = True

    #preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    #preprocess_regexps = [(re.compile(r'<!DOCTYPE.*dtd">', re.IGNORECASE), lambda x: '<!DOCTYPE html> '),]

    #def print_version(self, url):
    #    return url.replace('/archive/', '/print/')

    def parse_index(self):
        articles = []
        numero = 1
        soup = self.index_to_soup(self.INDEX)
        ts = soup.find(id='magazineTopStories')
        #ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
        #self.timefmt = ' [%s]'%ds

        cover = soup.find('img', src=True, attrs={'class':'cover'})
        if cover is not None:
            self.cover_url = cover['src']

        feeds = []
        #feeds.append((u'ULTIMAS NOTICIAS',u'http://www.pagina12.com.ar/diario/rss/ultimas_noticias.xml'))
        seen_titles = set([])
        for section in soup.findAll('div', 'seccionx'):
            numero += 1
            print(numero)
            section_title = self.tag_to_string(section.find('div', 'desplegable_titulo on_principal right'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('h2'):
                h = post.find('a', href=True)
                title = self.tag_to_string(h)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                a = post.find('a', href=True)
                url = a['href']
                if url.startswith('/'):
                    url = 'http://pagina12.com.ar/imprimir' + url
                p = post.find('div', attrs={'h2'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                                 'date':''})
            if articles:
                feeds.append((section_title, articles))
        return feeds

    def postprocess_html(self, soup, first):
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                img.extract()
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)
        return soup
If you could tell me where and how to try your suggestions, that would be great...
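For what it's worth, this is how I have been building the recipe while testing (pagina12.recipe is just my local file name, and I'm assuming this is the intended way to run it):
Code:
ebook-convert pagina12.recipe output.epub --test -vv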
Meanwhile, I wrote to the newspaper's webmaster about the mistake. No answer as of today. ;-(