Hi,
May I suggest the following improved version of the "Die Zeit" recipe. General improvements include the correct handling of dashes and the download of the correct cover based on the front page of the newspaper; while improvements specifically for Kindle users include the removal of the empty left margin as well as the conversion of subscript numbers to non-subscripted but smaller numbers (the Kindle does not render the subscript unicode characters). The latter is important, for example, when articles talk about CO2. Here's the new recipe:
Code:
#!/usr/bin/env python
# -*- coding: utf-8 mode: python -*-
__license__ = 'GPL v3'
__copyright__ = '2010-2011, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__ = '1.2'
"""
Die Zeit EPUB
"""
import os, urllib2, zipfile, re, string
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
class ZeitEPUBAbo(BasicNewsRecipe):
title = u'Die Zeit'
description = u'Das EPUB Abo der Zeit (needs subscription)'
language = 'de'
lang = 'de-DE'
__author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
needs_subscription = True
conversion_options = {
'no_default_epub_cover' : True,
# fixing the wrong left margin
'mobi_ignore_margins' : True,
}
preprocess_regexps = [
# filtering for correct dashes
(re.compile(r' - '), lambda match: ' – '), # regular "Gedankenstrich"
(re.compile(r' -,'), lambda match: ' –,'), # "Gedankenstrich" before a comma
(re.compile(r'(?<=\d)-(?=\d)'), lambda match: '–'), # number-number
# filtering for unicode characters that are missing on the Kindle,
# try to replace them with meaningful work-arounds
(re.compile(u'\u2080'), lambda match: '<span style="font-size: 50%;">0</span>'), # subscript-0
(re.compile(u'\u2081'), lambda match: '<span style="font-size: 50%;">1</span>'), # subscript-1
(re.compile(u'\u2082'), lambda match: '<span style="font-size: 50%;">2</span>'), # subscript-2
(re.compile(u'\u2083'), lambda match: '<span style="font-size: 50%;">3</span>'), # subscript-3
(re.compile(u'\u2084'), lambda match: '<span style="font-size: 50%;">4</span>'), # subscript-4
(re.compile(u'\u2085'), lambda match: '<span style="font-size: 50%;">5</span>'), # subscript-5
(re.compile(u'\u2086'), lambda match: '<span style="font-size: 50%;">6</span>'), # subscript-6
(re.compile(u'\u2087'), lambda match: '<span style="font-size: 50%;">7</span>'), # subscript-7
(re.compile(u'\u2088'), lambda match: '<span style="font-size: 50%;">8</span>'), # subscript-8
(re.compile(u'\u2089'), lambda match: '<span style="font-size: 50%;">9</span>'), # subscript-9
]
def build_index(self):
domain = "http://premium.zeit.de"
url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"
browser = self.get_browser()
browser.add_password("http://premium.zeit.de", self.username, self.password)
try:
browser.open(url)
except urllib2.HTTPError:
self.report_progress(0,_("Can't login to download issue"))
raise ValueError('Failed to login, check your username and password')
response = browser.follow_link(text="DIE ZEIT als E-Paper")
response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))
tmp = PersistentTemporaryFile(suffix='.epub')
self.report_progress(0,_('downloading epub'))
tmp.write(response.read())
tmp.close()
zfile = zipfile.ZipFile(tmp.name, 'r')
self.report_progress(0,_('extracting epub'))
zfile.extractall(self.output_dir)
tmp.close()
index = os.path.join(self.output_dir, 'content.opf')
self.report_progress(1,_('epub downloaded and extracted'))
# doing regular expression filtering
for path in walk('.'):
(shortname, extension) = os.path.splitext(path)
if extension.lower() in ('.html', '.htm', '.xhtml'):
with open(path, 'r+b') as f:
raw = f.read()
raw = raw.decode('utf-8')
for pat, func in self.preprocess_regexps:
raw = pat.sub(func, raw)
f.seek(0)
f.truncate()
f.write(raw.encode('utf-8'))
# adding real cover
self.report_progress(0,_('trying to download cover image (titlepage)'))
self.download_cover()
self.conversion_options["cover"] = self.cover_path
return index
# getting url of the cover
def get_cover_url(self):
try:
inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
except:
cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
return cover_url
Cheers,
Tobias