Hi,
Here's an interesting one. The following recipe produces a perfectly decent .epub, but when outputting .mobi (even when converting the good .epub to .mobi), almost all of the articles somehow disappear... Of course, I have a Kindle, so I can't use .epub. Does this make sense to anyone?
Thanks.
The recipe:
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class WV(BasicNewsRecipe):
    """Calibre recipe: download the current issue of Workers Vanguard.

    Scrapes the issue index at spartacist.org, collects the main article
    plus the table-of-contents articles, and rewrites each page's styled
    <div> blocks into semantic heading/quote tags.
    """

    title = 'Workers Vanguard'
    description = 'Current issue of Workers Vanguard'
    needs_subscription = False
    no_stylesheets = True

    def print_version(self, url):
        # Map a relative article URL to its printer-friendly counterpart.
        # (Plain concatenation replaces the Py2-only string.join, which
        # no longer exists under Python-3 calibre.)
        return 'http://www.spartacist.org/print/english/wv/' + url

    def parse_index(self):
        """Build the feed list: [(section title, [article dicts])]."""
        soup = self.index_to_soup('http://spartacist.org/english/wv/index.html')
        articles = []
        # Defaults so a page-layout change degrades gracefully instead of
        # raising NameError when the loops below find nothing.
        url = ''
        date = ''
        issuenostring = ''

        # Find the print URL of the main article on the index page: it is
        # the link whose text contains "Printable".
        for node in soup.findAll(text=re.compile('Printable')):
            a = node.findParent('a', href=True)
            if not a:
                continue
            # Strip any query string; keep only the last two path segments.
            parts = re.sub(r'\?.*', '', a['href']).split('/')
            url = parts[-2] + '/' + parts[-1]

        # Get the issue number and date from the 'folio' block.  Note we
        # could also derive the issue number from the URLs.
        for div in soup.findAll(id='folio'):
            text = div.string
            if text:
                date = text
                print('Found date: ' + date)
                self.timefmt = date
            else:
                pubname = div.i.string
                print(pubname)
                issuenostring = div.i.findNextSibling(text=True)
                print('Found issue number string: ' + issuenostring)
                self.title = pubname + issuenostring

        # Headline of the main article on the index page.
        for div in soup.findAll(id='headline'):
            headline = div.string
            print('Found article ' + headline + ' at url ' + url)
            articles.append({'title': headline, 'url': url,
                             'description': '', 'date': date})

        # Remaining articles, parsed from the table of contents at the
        # right of the index page.
        for div in soup.findAll(id='smlheadline'):
            a = div.find('a', href=True)
            if not a:
                continue
            url = re.sub(r'\?.*', '', a['href'])
            headline = a.string
            print('Found article ' + headline + ' at url ' + url)
            articles.append({'title': headline, 'url': url,
                             'description': '', 'date': date})

        return [('Workers Vanguard' + issuenostring, articles)]

    def postprocess_html(self, soup, first):
        """Rewrite the site's styled <div>s into semantic tags."""
        # The site marks structure with ids rather than tags; map each id
        # to the element it really represents.
        replacements = {
            'headline': 'h1',
            'kicker': 'h2',
            'subhead': 'h3',
            'nytimes': 'h3',
            'wvquote': 'blockquote',
            'wvcite': 'blockquote',
        }
        for div_id, tag in replacements.items():
            for div in soup.findAll(id=div_id):
                div.name = tag
                # NOTE(review): the same ids repeat in every article of the
                # issue; duplicate ids across a stitched book can confuse
                # the MOBI converter (the reported "articles disappear in
                # .mobi" symptom), so drop them once the tag is renamed.
                del div['id']
        return soup