MobileRead Forums - View Single Post - "DIE ZEIT" im Online-Abo auch als ePub

tobias2 · 12-21-2011, 07:41 AM

Hallo zusammen,

Wegen einer kleineren Aenderung im Download-Bereich war wieder mal ein Update faellig, viel Spass damit:

Code:

#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.7'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):
    """Recipe that fetches the subscriber EPUB edition of "Die Zeit".

    Instead of scraping articles, ``build_index`` logs in to the subscriber
    area, downloads the publisher's ready-made EPUB, unpacks it into
    ``self.output_dir`` and then applies ``preprocess_regexps`` to every
    (X)HTML file to repair typographic glitches. ``get_cover_url`` tries to
    obtain a high-resolution cover from the first page of the e-paper PDF and
    falls back to lower-quality sources if that fails.
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal and achims)'
    needs_subscription = True

    # Subscriber landing page that carries the login form and, once logged
    # in, the download links for the current issue.
    _SUBSCRIBER_URL = "https://premium.zeit.de/abo/zeit_digital"
    # Opening tag of a form that is nested inside another form on the landing
    # page; it is stripped from the HTML before the login form is selected.
    _NESTED_LOGIN_FORM = '<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">'

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    # (pattern, replacement-callable) pairs. They are applied in list order
    # by build_index() to the decoded (unicode) text of every (X)HTML file,
    # so later rules see the output of earlier ones. Patterns containing
    # non-ASCII characters must be unicode literals for that reason.
    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'3-D'), lambda match: '3D'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|als\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b|noch\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before or after the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        (re.compile(u'(?<=Januar)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Februar)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=M\u00E4rz)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=April)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Mai)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Juni)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Juli)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=August)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=September)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Oktober)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=November)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Dezember)(?=\d)'), lambda match: ' '),
        # filtering for other missing spaces and other issues
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        # insert the missing (narrow) space; an en dash here would turn "5€" into "5–€"
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2008'), # Zahl[no space]Euro
        (re.compile(r'(?<=[:])(?=[^\d\s</])'), lambda match: ' '), # missing space after colon
        (re.compile(u'(?<=\S)\u0022 '), lambda match: u'\u00AB '), # wrong closing quotation
        (re.compile(u' \u0022(?=\S)'), lambda match: u' \u00BB'), # wrong opening quotation
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(u'(?<=<p class="absatz">)(?=[^(\u00BB|</p>)]*\u00AB)'), lambda match: u'\u00BB '), # missing opening quotation at start of paragraph
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        # look-behinds must be fixed-width, hence one rule per numeral length
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        # unicode literals: as byte strings the umlauts would be matched byte-wise and never hit
        (re.compile(u'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma
        (re.compile(u'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        (re.compile(u'\u2013 \(\)'), lambda match: ''), # fix empty opening and closing brackets with leading dash and space
        (re.compile(r' \(\)'), lambda match: ''), # fix empty opening and closing brackets with leading space
        (re.compile(r'\(\)'), lambda match: ''), # fix empty opening and closing brackets
        (re.compile(r': -(?=[\(\)\\/])'), lambda match: ':-'), # fix wrong smilies
        (re.compile(u'  (?=[\wÄÖÜäöü])'), lambda match: ' '), # fix certain double spaces
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(u'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        # each repetition groups one more block of three digits to the left
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        # NOTE(review): the superscript digits 1-3 live at U+00B9, U+00B2 and
        # U+00B3 in Unicode, not at U+2071..U+2073 -- confirm whether those
        # three rules below ever match what they are meant to.
        (re.compile(u'\u2070'), lambda match: '^0'), # superscript-0
        (re.compile(u'\u2071'), lambda match: '^1'), # superscript-1
        (re.compile(u'\u2072'), lambda match: '^2'), # superscript-2
        (re.compile(u'\u2073'), lambda match: '^3'), # superscript-3
        (re.compile(u'\u2074'), lambda match: '^4'), # superscript-4
        (re.compile(u'\u2075'), lambda match: '^5'), # superscript-5
        (re.compile(u'\u2076'), lambda match: '^6'), # superscript-6
        (re.compile(u'\u2077'), lambda match: '^7'), # superscript-7
        (re.compile(u'\u2078'), lambda match: '^8'), # superscript-8
        (re.compile(u'\u2079'), lambda match: '^9'), # superscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d\d/)(?=[01234]\d)'), lambda match: '20'), # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d/)(?=[01234]\d)'), lambda match: '20'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d\d/)(?=[56789]\d)'), lambda match: '19'), # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d/)(?=[56789]\d)'), lambda match: '19'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=ANALYSE UND MEINUNG) '), lambda match: ': '), # "ANALYSE UND MEINUNG" with following colon
        # better layout and saving space for the bottom of each article
        (re.compile(r"]</span>\n[ ]*</a>\n[ ]*<br />\n[ ]*(?=<a href='[\w]*.xhtm[#\w_]*' class='toc_link'>)"), lambda match: ']</span>\n</a>\n'), # remove line breaks for navi links
        (re.compile(r'\[zum Inhaltsverzeichnis]'), lambda match: '[Inhalt]'), # shorten Inhaltsverzeichnis
        (re.compile(r'\[zum Ressort '), lambda match: '['), # shorten Ressort
        (re.compile(u'\[zur Übersicht '), lambda match: '['), # shorten Übersicht
    ]

    def _login(self):
        """Log in to the subscriber area and return the browser.

        Opens the subscriber landing page, removes the nested form tag from
        the HTML, fills the third form on the page (``nr=2``) with
        ``self.username`` / ``self.password`` and submits it.

        Returns the browser object, positioned on the response to the login
        submission, so callers can look up download links on it.
        """
        browser = self.get_browser()
        response = browser.open(self._SUBSCRIBER_URL)
        # get rid of nested form
        response.set_data(response.get_data().replace(self._NESTED_LOGIN_FORM, ''))
        browser.set_response(response)
        # find correct form and submit
        browser.select_form(nr=2)
        browser.form['name'] = self.username
        browser.form['pass'] = self.password
        browser.submit()
        return browser

    def build_index(self):
        """Download and unpack the issue's EPUB and post-process its HTML.

        Steps: log in, follow the "Download als Datei im ePub-Format" link,
        store the download in a temporary ``.epub`` file, extract it into
        ``self.output_dir``, run ``preprocess_regexps`` over every
        ``.html``/``.htm``/``.xhtml`` file found below the current directory,
        and finally download the cover and register it in
        ``conversion_options``.

        Returns the path of ``content.opf`` inside ``self.output_dir``.
        """
        browser = self._login()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Download als Datei im ePub-Format.*'))
        response = browser.follow_link(epublink)
        self.report_progress(1, _('next step'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0, _('downloading epub'))
        try:
            tmp.write(response.read())
        finally:
            # close exactly once, also when the download fails half-way
            tmp.close()

        self.report_progress(0, _('extracting epub'))
        # closing() releases the archive handle even if extraction fails
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1, _('epub downloaded and extracted'))

        # doing regular expression filtering
        # NOTE(review): walks '.', not self.output_dir -- presumably the
        # recipe runs with the output directory as working directory; confirm.
        for path in walk('.'):
            extension = os.path.splitext(path)[1]
            if extension.lower() not in ('.html', '.htm', '.xhtml'):
                continue
            with open(path, 'r+b') as f:
                raw = f.read().decode('utf-8')
                for pat, func in self.preprocess_regexps:
                    raw = pat.sub(func, raw)
                # rewrite the file in place with the filtered text
                f.seek(0)
                f.truncate()
                f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0, _('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return the location of the cover image for the current issue.

        Three sources are tried in order:

        1. The first page of the e-paper PDF from the subscriber area; its
           embedded cover image is written to ``cover.jpg`` in
           ``self.output_dir`` and that local path is returned.
        2. The cover image shown on ``http://www.zeit.de/inhalt``.
        3. A fixed URL of an old title page.

        Any failure in one step is logged only through the progress messages
        below and leads to the next step.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            browser = self._login()
            # actual cover search
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper'))
            base = urlparse(pdflink.base_url)
            # derive the single-page PDF of page 1 from the full e-paper link
            first_page = urlparse(pdflink.url)[2].replace('ePaper_', '').replace('.pdf', '_001.pdf')
            cover_url = base[0] + '://' + base[1] + first_page
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                pdf_data = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            mi = get_metadata(cStringIO.StringIO(pdf_data))
            if not (mi.cover_data and mi.cover_data[1]):
                # be explicit instead of passing None on to the image writer
                raise ValueError('no cover image found in PDF')

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(mi.cover_data[1], cpath)
            cover_url = cpath

        except Exception:
            # best effort: any failure above falls back to the website cover
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except Exception:
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url