MobileRead Forums - View Single Post - "DIE ZEIT" im Online-Abo auch als ePub

tobias2 · 03-20-2011, 03:39 PM

Hallo zusammen,

Scheinbar wurden meine Verbesserungen im Zeit-Skript nicht in Calibre uebernommen. Ich habe aber nochmal etwas an dem Recipe gearbeitet. Daher hier noch einmal eine neue Version, die jetzt noch sehr viel mehr Fehler in der Originaldatei (soweit das automatisch moeglich ist) korrigiert und auch visuell ein paar Verbesserungen bringt. Ausserdem wurde der Fehler meiner vorigen Version beseitigt, der das Titelbild entfernt hatte. Im Prinzip sind nur die Filterregeln (preprocess_regexps) geaendert worden, aber trotzdem hier das gesamte Recipe:

Code:

#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.3'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk

class ZeitEPUBAbo(BasicNewsRecipe):

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
    }

    preprocess_regexps    = [
        # filtering for correct dashes
        (re.compile(' - '), lambda match: u' \u2013 '), # regular "Gedankenstrich"
        (re.compile(' -,'), lambda match: u' \u2013,'), # "Gedankenstrich" before a comma
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|sondern\b|bis\b|&amp;|&\s)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(r'(?<=\d)\.(?=Januar)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=Februar)'), lambda match: '. '),
        (re.compile(u'(?<=\d)\.(?=M\u00E4rz)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=April)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=Mai)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=Juni)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=August)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=September)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=Oktober)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=November)'), lambda match: '. '),
        (re.compile(r'(?<=\d)\.(?=Dezember)'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '),
        (re.compile(r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '),
        (re.compile(r'u\. a\.'), lambda match: 'u.a.'), # fix abbreviation that was broken by the previous line
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat))'), lambda match: ' '), # fix missing spaced before units
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2013 '), # dash between category and DIE ZEIT
    ]

    def build_index(self):
        domain = "http://premium.zeit.de"
        url = domain + "/abovorteile/cgi-bin/_er_member/p4z.fpl?ER_Do=getUserData&ER_NextTemplate=login_ok"

        browser = self.get_browser()
        browser.add_password("http://premium.zeit.de", self.username, self.password)

        try:
            browser.open(url)
        except urllib2.HTTPError:
            self.report_progress(0,_("Can't login to download issue"))
            raise ValueError('Failed to login, check your username and password')

        response = browser.follow_link(text="DIE ZEIT als E-Paper")
        response = browser.follow_link(url_regex=re.compile('^http://contentserver.hgv-online.de/nodrm/fulfillment\\?distributor=zeit-online&orderid=zeit_online.*'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        tmp.write(response.read())
        tmp.close()

        zfile = zipfile.ZipFile(tmp.name, 'r')
        self.report_progress(0,_('extracting epub'))

        zfile.extractall(self.output_dir)

        tmp.close()

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering
        for path in walk('.'):
            (shortname, extension) = os.path.splitext(path)  
            if extension.lower() in ('.html', '.htm', '.xhtml'):
                with open(path, 'r+b') as f:
                    raw = f.read()
                    raw = raw.decode('utf-8')
                    for pat, func in self.preprocess_regexps:
                        raw = pat.sub(func, raw)
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        try:
            inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
            cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
        except:
            cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url

Viel Spass damit,

Tobias