Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > Non-English Discussions > Deutsches Forum

Notices

Reply
 
Thread Tools Search this Thread
Old 06-04-2011, 02:17 PM   #136
kratzbaum
Junior Member
kratzbaum began at the beginning.
 
Posts: 1
Karma: 10
Join Date: Jun 2011
Device: PRS-650
Unhappy ZEIT-Recipe geht nicht mehr

Die ZEIT hat ihren Abo-Bereich umgestaltet - und nun versagt das oben gepostete Calibre-Recipe leider mit
Code:
calibre, version 0.8.3
ERROR: Konvertierungsfehler: <b>Misslungen</b>: Nachrichten abrufen von Die Zeit

Nachrichten abrufen von Die Zeit
Resolved conversion options
calibre version: 0.8.3
{'asciiize': False,
   [ ...]
  File "site-packages\mechanize-0.2.4-py2.7.egg\mechanize\_mechanize.py", line 620, in find_link
mechanize._mechanize.LinkNotFoundError
....
kratzbaum is offline   Reply With Quote
Old 06-23-2011, 05:25 AM   #137
tobias2
Member
tobias2 began at the beginning.
 
Posts: 18
Karma: 36
Join Date: Feb 2011
Device: Kindle
Neues Calibre-Skript

Hallo zusammen,

Nach ein wenig Basteln funktioniert das Skript jetzt wieder. Ich benutze weiterhin das ePub als Quelle, da ich finde, dass die so generierte Variante besser als die Mobi-Version der Zeit aussieht. Viel Spass damit!

Tobias

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.5'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):
    """Calibre recipe for the subscriber EPUB edition of Die Zeit.

    Instead of crawling articles, the recipe logs in to premium.zeit.de,
    downloads the publisher's EPUB, unpacks it into ``self.output_dir``,
    applies the typographic clean-ups in ``preprocess_regexps`` to every
    (X)HTML file and finally attaches a cover image.
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    # (pattern, replacement-callable) pairs. build_index() applies them in
    # list order to the decoded text of each extracted file, so later entries
    # see (and sometimes repair) the output of earlier ones.
    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        # NOTE(review): this sits under "missing spaces" but inserts an en dash
        # (U+2013) between the number and the euro sign - confirm whether a
        # space character was intended before changing it.
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
        (re.compile(r':(?=[^\d\s</])'), lambda match: ': '), # missing space after colon
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        # look-behinds must be fixed-width, hence one pattern per numeral length
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        # The patterns containing literal umlauts are unicode literals: as byte
        # strings the umlauts would be multi-byte UTF-8 sequences under Python 2
        # and would not match the decoded (unicode) text they are applied to.
        (re.compile(u'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma
        (re.compile(u'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(u'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        # NOTE(review): the '.' below is unescaped and therefore matches any
        # character, not only a full stop - confirm before tightening.
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        # each repetition inserts the separator for one more group of three digits
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
    ]

    def _open_subscriber_session(self):
        """Log in to the subscriber area and return the logged-in browser.

        Opens the "zeit_digital" page, fills ``self.username`` and
        ``self.password`` into the form with index 2 on that page and submits
        it. Shared by build_index() and get_cover_url() so the login sequence
        exists only once.
        """
        domain = "https://premium.zeit.de"
        url = domain + "/abo/zeit_digital"
        browser = self.get_browser()

        # new login process
        browser.open(url)
        browser.select_form(nr=2)
        browser.form['name']=self.username
        browser.form['pass']=self.password
        browser.submit()
        return browser

    def build_index(self):
        """Download and unpack the EPUB, clean it up, and return its OPF path.

        Returns the path of ``content.opf`` inside ``self.output_dir``.
        Errors from the login, the link lookup or the download are not
        caught here and propagate to the caller.
        """
        browser = self._open_subscriber_session()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))
        response = browser.follow_link(epublink)
        self.report_progress(1,_('next step'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        try:
            tmp.write(response.read())
        finally:
            # close exactly once, even if the download fails half-way
            tmp.close()

        # closing() guarantees the archive handle is released after extraction
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            self.report_progress(0,_('extracting epub'))
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering; walk the directory the archive
        # was extracted to instead of relying on the current working directory
        for path in walk(self.output_dir):
            extension = os.path.splitext(path)[1]
            if extension.lower() in ('.html', '.htm', '.xhtml'):
                with open(path, 'r+b') as f:
                    raw = f.read().decode('utf-8')
                    for pat, func in self.preprocess_regexps:
                        raw = pat.sub(func, raw)
                    # rewrite the file in place with the corrected text
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return a location for the cover image, trying three sources in turn.

        1. Page 1 of the e-paper PDF (needs the login); its embedded cover is
           saved as ``cover.jpg`` in ``self.output_dir`` and that local path
           is returned.
        2. The cover image referenced on http://www.zeit.de/inhalt.
        3. A static, old low-resolution cover URL.

        Any ordinary error in one step falls through to the next one.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            browser = self._open_subscriber_session()
            # actual cover search
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper*'))
            base = urlparse(pdflink.base_url)
            # scheme + host of the page, path of the PDF link rewritten to
            # point at the single-page "_001" PDF
            cover_url = base[0]+'://'+base[1]+''+(urlparse(pdflink.url)[2]).replace('ePaper_','').replace('.pdf','_001.pdf')
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                cdata = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            stream = cStringIO.StringIO(cdata)
            cdata = None
            mi = get_metadata(stream)
            if mi.cover_data and mi.cover_data[1]:
                cdata = mi.cover_data[1]

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(cdata, cpath)
            cover_url = cpath

        except Exception:
            # deliberately best-effort, but do not swallow KeyboardInterrupt
            # or SystemExit the way a bare "except:" would
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except Exception:
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url
tobias2 is offline   Reply With Quote
Advert
Old 07-21-2011, 04:01 PM   #138
Christian.Heise
Member
Christian.Heise has a complete set of Star Wars action figures.Christian.Heise has a complete set of Star Wars action figures.Christian.Heise has a complete set of Star Wars action figures.
 
Christian.Heise's Avatar
 
Posts: 20
Karma: 250
Join Date: Jul 2010
Location: Hamburg, Berlin
Device: iPad, Kindle, Italica, Stanza, Sony (diverse)
Hallo,

wir haben das .mobi in unserem Premium-Bereich weiter überarbeitet. Wenn Sie dazu Anmerkungen oder Wünsche haben, können Sie uns diese hier oder via E-Mail mitteilen.

Kleine Information zum aktuellen Stand der Entwicklungen: Wir sind weiter an dem Thema Bildrechte dran und werden unseren Premiumbereich immer weiter ausbauen. Dazu gehört auch die Prüfung der Anbindung an den Kindle-Store (siehe NYTimes). Ausserdem überarbeiten wir im Moment die Quellenbasis für das ePub und das mobi auf premium.zeit.de und werden dort bald hoffentlich auch die Texte des ZEITmagazins anbieten können.

MfG,
Christian Heise
---
ZEIT ONLINE GmbH
Christian.Heise is offline   Reply With Quote
Old 09-05-2011, 10:42 AM   #139
Quoquad
Member
Quoquad began at the beginning.
 
Quoquad's Avatar
 
Posts: 15
Karma: 10
Join Date: Apr 2009
Location: Hamburg
Device: PB 903 pro - 612 pro - Lumia 920
Quote:
Originally Posted by ewy View Post
Hi,

gerade habe ich festgestellt, dass das Online-Abo der "ZEIT" nun auch als ePub erhältlich ist - seit Neuestem sogar DRM-frei.
Das Abo kostet 30 Euro für 1/2 Jahr und enthält nicht nur die Zeitung als ePub und PDF, sondern auch noch ausgewählte Artikel als Audio-Dateien:
http://www.zeit.de/angebote/audio/epaper

Das ist für mich mal ein echt gutes Angebot! Die "ZEIT" lese ich schon immer gerne, aber das Riesen-Format der Papierversion ist IMHO extrem unhandlich, und die Online-Version gab es letztes Mal, als ich danach geschaut hatte, nur als PDF (welches am Bildschirm nur mit viel Scrollen zu lesen war).

Gruß,

ewy
Achtung, die Preisinformation ist nicht mehr aktuell. Das ZEIT ePaper Abo kostet jetzt 2,99 € pro Ausgabe. Selbst Printabonnenten müssen jetzt 40 Cent für die ePaper Version bezahlen, die vorher im Print Abo mitenthalten war.
Grund? Seit Juli bietet der ZEIT Verlag eine iPhone-iPad-App an. Und da Apple ja an allen Apps 30% für sich einfordert dieser drastische Preisanstieg. Da ich seit Jahren das ZEIT epaper Abo habe, bleibt es für mich bei den günstigen 30 € im halben Jahr.
Quoquad is offline   Reply With Quote
Old 09-18-2011, 07:57 AM   #140
fortwienix
Enthusiast
fortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watchfortwienix is clearly one to watch
 
Posts: 47
Karma: 10848
Join Date: Mar 2011
Device: 902
Hallo,

bin grad über diesen Thread gestolpert und schon seit langer Zeit Leser der Zeit. Erstmal vielen Dank an die Zeit Techniker, die:
- ein lesbares elektronisches Format anbieten
- DRM frei (das erleichtert ungemein das Kopieren auf den E-Book Reader, v.a. wenn kein Windows auf die Schnelle vorhanden ist)
- das Onlineabo deutlich billiger als die Printversion ist.

Ich hab das Printabo gegen das Onlineabo Anfang dieses Jahres getauscht. Es gab die gleichen Probleme wie ein Leser hier schrieb, der einen neuen Account anlegen mußte und dann trotzdem mit dem alten noch angemeldet wurde. Das war etwas umständlich, ist aber vielleicht schon verbessert worden.

Ich lese die Epub Version auf einem Pocketbook 902. Soweit passt alles. Grafiken vermisse ich leider manchmal in der epub, besonders im Politikteil die beiden gegenübergestellten Fotos (dritt- und vorletzte Seite) und die Grafik auf der Themenseite im Wissen. Wenn die mal noch integriert werden könnten, wäre das super. Bei vielen anderen Fotos schmerzt mich der Verlust kaum (höchstens in der Rubrik Reisen noch).
Mit dem Inhaltsverzeichnis bin ich ganz gut zurecht gekommen. Ich lese, wie schon auch in der Printausgabe, von vorn nach hinten. Wenn mich ein Artikel nicht so sehr interessiert, springe ich zum nächsten.
Dass das Magazin nur als PDF erscheint, ist ok, durch das Layout und die Grafiken würde in der nur Textvariante zuviel verloren gehen. Gleichwohl ist auf meinem Pocketbook das Lesen des Magazins manchmal sehr anstrengend. Blättern dauert oder der Reader stürzt ab. Das liegt dann aber vermutlich am Gerät und dessen Software.
Ebenso würde ich mir wünschen, die Zeitung direkt vom E-Bookreader aus laden zu können. Das ist dann aber hauptsächlich ebenfalls ein Problem des Readers.

In diesem Sinne, vielen Dank an die Zeit. So werden sinnvoll neue Vertriebswege aufgegriffen und es wird nicht allgemein über das Internet lamentiert, welches die Zeitungslandschaft deutlich verändert hat. Auch vielen Dank an die Redakteure, die sich der Kritik der Leser hier oder in der Kommentarfunktion unter den Onlineartikeln stellen.

Grüße, fortwienix

Last edited by fortwienix; 09-18-2011 at 08:09 AM.
fortwienix is offline   Reply With Quote
Advert
Old 11-05-2011, 12:11 PM   #141
tobias2
Member
tobias2 began at the beginning.
 
Posts: 18
Karma: 36
Join Date: Feb 2011
Device: Kindle
Recipe Update

Hallo zusammen,

Die Zeit hatte vor einigen Wochen mal wieder an ihren Webseiten gebastelt und das derzeitige Login entspricht nicht den HTML-Spezifikationen. Dank achims funktioniert das Recipe jetzt aber wieder, und ich habe noch einige neue Korrekturen und Verbesserungen eingebaut. Viel Spass damit, siehe unten! Ausserdem noch der Hinweis, dass achims ein neues Recipe erstellt hat, das auch die PDFs herunterladen kann, siehe https://www.mobileread.com/forums/sho...d.php?t=155792.

Tobias

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.6'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|als\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b|noch\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
        (re.compile(r':(?=[^\d\s</])'), lambda match: ': '), # missing space after colon
        (re.compile(u'(?<=\S)\u0022 '), lambda match: u'\u00AB '), # wrong closing quotation
        (re.compile(u' \u0022(?=\S)'), lambda match: u' \u00BB'), # wrong opening quotation
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma
        (re.compile(r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        (re.compile(u'\u2070'), lambda match: '^0'), # superscript-0
        (re.compile(u'\u2071'), lambda match: '^1'), # superscript-1
        (re.compile(u'\u2072'), lambda match: '^2'), # superscript-2
        (re.compile(u'\u2073'), lambda match: '^3'), # superscript-3
        (re.compile(u'\u2074'), lambda match: '^4'), # superscript-4
        (re.compile(u'\u2075'), lambda match: '^5'), # superscript-5
        (re.compile(u'\u2076'), lambda match: '^6'), # superscript-6
        (re.compile(u'\u2077'), lambda match: '^7'), # superscript-7
        (re.compile(u'\u2078'), lambda match: '^8'), # superscript-8
        (re.compile(u'\u2079'), lambda match: '^9'), # superscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
        (re.compile(r'(?<=ZEIT Nr\. \d\d/)(?=[01234]\d)'), lambda match: '20'), # DIE ZEIT number with full year
        (re.compile(r'(?<=ZEIT Nr\. \d/)(?=[01234]\d)'), lambda match: '20'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=ZEIT Nr\. \d\d/)(?=[56789]\d)'), lambda match: '19'), # DIE ZEIT number with full year
        (re.compile(r'(?<=ZEIT Nr\. \d/)(?=[56789]\d)'), lambda match: '19'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=ANALYSE UND MEINUNG) '), lambda match: ': '), # "ANALYSE UND MEINUNG" with following colon
        # better layout and saving space for the bottom of each article
        (re.compile(r"]</span>\n[ ]*</a>\n[ ]*<br />\n[ ]*(?=<a href='[\w]*.xhtm[#\w_]*' class='toc_link'>)"), lambda match: ']</span>\n</a>\n'), # remove line breaks for navi links
        (re.compile(r'\[zum Inhaltsverzeichnis]'), lambda match: '[Inhalt]'), # shorten Inhaltsverzeichnis
        (re.compile(r'\[zum Ressort '), lambda match: '['), # shorten Ressort
        (re.compile(u'\[zur Übersicht '), lambda match: '['), # shorten Übersicht
    ]

    def build_index(self):
        """Download and unpack the EPUB, clean it up, and return its OPF path.

        Logs in to premium.zeit.de, follows the "ePub-Format" download link,
        extracts the archive into ``self.output_dir``, applies every entry of
        ``self.preprocess_regexps`` to each (X)HTML file, and registers the
        downloaded cover in ``self.conversion_options``.

        Returns the path of ``content.opf`` inside ``self.output_dir``.
        Errors from the login, the link lookup or the download are not
        caught here and propagate to the caller.
        """
        domain = "https://premium.zeit.de"
        url = domain + "/abo/zeit_digital"
        browser = self.get_browser()

        # new login process
        response = browser.open(url)
        # get rid of nested form: drop the extra opening <form> tag from the
        # page source, then hand the patched page back to the browser
        response.set_data(response.get_data().replace('<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">', ''))
        browser.set_response(response)
        # find correct form and submit
        browser.select_form(nr=2)
        browser.form['name']=self.username
        browser.form['pass']=self.password
        browser.submit()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))
        response = browser.follow_link(epublink)
        self.report_progress(1,_('next step'))

        tmp = PersistentTemporaryFile(suffix='.epub')
        self.report_progress(0,_('downloading epub'))
        try:
            tmp.write(response.read())
        finally:
            # close exactly once, even if the download fails half-way
            tmp.close()

        # closing() guarantees the archive handle is released after extraction
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            self.report_progress(0,_('extracting epub'))
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering; walk the directory the archive
        # was extracted to instead of relying on the current working directory
        for path in walk(self.output_dir):
            extension = os.path.splitext(path)[1]
            if extension.lower() in ('.html', '.htm', '.xhtml'):
                with open(path, 'r+b') as f:
                    raw = f.read().decode('utf-8')
                    for pat, func in self.preprocess_regexps:
                        raw = pat.sub(func, raw)
                    # rewrite the file in place with the corrected text
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return the location of the cover image for the current issue.

        Three sources are tried in order:

        1. The subscriber page is searched for the e-paper PDF link; from it
           the URL of the ``..._001.pdf`` file (presumably the title page) is
           derived.  The cover image embedded in that PDF is saved as
           ``cover.jpg`` in ``self.output_dir`` and this local path is
           returned.
        2. The cover image referenced on http://www.zeit.de/inhalt.
        3. A static URL of an old title page as last resort.

        Failures of step 1 or 2 are logged and lead to the next step; only
        ordinary exceptions are handled, so an interrupt is not swallowed.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            domain = "https://premium.zeit.de"
            url = domain + "/abo/zeit_digital"
            browser = self.get_browser()

            # new login process
            response = browser.open(url)
            # The login form is nested inside another form; drop the inner
            # opening tag so that the form can be selected below.
            response.set_data(response.get_data().replace('<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">', ''))
            browser.set_response(response)
            # find correct form and submit
            browser.select_form(nr=2)
            browser.form['name'] = self.username
            browser.form['pass'] = self.password
            browser.submit()

            # actual cover search (the former trailing '*' only quantified
            # the final 'r' and was not a wildcard)
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper'))
            base = urlparse(pdflink.base_url)
            pdf_path = urlparse(pdflink.url)[2]
            # scheme://host + path of the PDF, rewritten to the "_001" file
            cover_url = base[0] + '://' + base[1] + pdf_path.replace('ePaper_', '').replace('.pdf', '_001.pdf')
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)

            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                pdf_data = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            mi = get_metadata(cStringIO.StringIO(pdf_data))
            pdf_data = None  # the raw PDF is no longer needed
            if not (mi.cover_data and mi.cover_data[1]):
                # fall through to the low-res cover explicitly instead of
                # failing later while saving empty data
                raise ValueError('no cover image found in ' + cover_url)

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(mi.cover_data[1], cpath)
            cover_url = cpath

        except Exception as err:
            self.log.warning('PDF-based cover failed: ' + repr(err))
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except Exception as err:
                self.log.warning('Low-res cover failed: ' + repr(err))
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url
tobias2 is offline   Reply With Quote
Old 12-04-2011, 01:18 PM   #142
nafets
Junior Member
nafets began at the beginning.
 
Posts: 1
Karma: 10
Join Date: Dec 2011
Device: none
"Die Zeit" am PRS T1

Hi,

Hat jemand eine Lösung, wie man "Die Zeit" vernünftig auf dem Sony PRS-T1 inkl. der Grafiken lesen kann?
Gibt es eine einfache Möglichkeit, die PDF-Version inkl. der Grafiken in EPUB umzuwandeln?

nafets

PS: habe noch nie ein solch tolles, konstruktives Forum gelesen wie dieses!
nafets is offline   Reply With Quote
Old 12-21-2011, 07:41 AM   #143
tobias2
Member
tobias2 began at the beginning.
 
Posts: 18
Karma: 36
Join Date: Feb 2011
Device: Kindle
Wieder ein Update

Hallo zusammen,

Wegen einer kleineren Änderung im Download-Bereich war wieder mal ein Update fällig. Viel Spaß damit:

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.7'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal and achims)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'3-D'), lambda match: '3D'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|als\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b|noch\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before or after the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        (re.compile(u'(?<=Januar)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Februar)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=M\u00E4rz)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=April)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Mai)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Juni)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Juli)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=August)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=September)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Oktober)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=November)(?=\d)'), lambda match: ' '),
        (re.compile(u'(?<=Dezember)(?=\d)'), lambda match: ' '),
        # filtering for other missing spaces and other issues
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
        (re.compile(r'(?<=[:])(?=[^\d\s</])'), lambda match: ' '), # missing space after colon
        (re.compile(u'(?<=\S)\u0022 '), lambda match: u'\u00AB '), # wrong closing quotation
        (re.compile(u' \u0022(?=\S)'), lambda match: u' \u00BB'), # wrong opening quotation
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(u'(?<=<p class="absatz">)(?=[^(\u00BB|</p>)]*\u00AB)'), lambda match: u'\u00BB '), # missing opening quotation at start of paragraph
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=[A-Za-zÄÖÜäöü]),(?=[A-Za-zÄÖÜäöü])'), lambda match: ', '), # missing space after comma
        (re.compile(r'(?<=[a-zäöü])\.(?=[A-ZÄÖÜ][A-Za-zÄÖÜäöü])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        (re.compile(u'\u2013 \(\)'), lambda match: ''), # fix empty opening and closing brackets with leading dash and space
        (re.compile(r' \(\)'), lambda match: ''), # fix empty opening and closing brackets with leading space
        (re.compile(r'\(\)'), lambda match: ''), # fix empty opening and closing brackets
        (re.compile(r': -(?=[\(\)\\/])'), lambda match: ':-'), # fix wrong smilies
        (re.compile(u'  (?=[\wÄÖÜäöü])'), lambda match: ' '), # fix certain double spaces
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite|Division|Kompanie|Armee))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(r'(?<=<p class="absatz">[A-ZÄÖÜ]) (?=[a-zäöü\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        (re.compile(u'\u2070'), lambda match: '^0'), # superscript-0
        (re.compile(u'\u2071'), lambda match: '^1'), # superscript-1
        (re.compile(u'\u2072'), lambda match: '^2'), # superscript-2
        (re.compile(u'\u2073'), lambda match: '^3'), # superscript-3
        (re.compile(u'\u2074'), lambda match: '^4'), # superscript-4
        (re.compile(u'\u2075'), lambda match: '^5'), # superscript-5
        (re.compile(u'\u2076'), lambda match: '^6'), # superscript-6
        (re.compile(u'\u2077'), lambda match: '^7'), # superscript-7
        (re.compile(u'\u2078'), lambda match: '^8'), # superscript-8
        (re.compile(u'\u2079'), lambda match: '^9'), # superscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d\d/)(?=[01234]\d)'), lambda match: '20'), # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d/)(?=[01234]\d)'), lambda match: '20'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d\d/)(?=[56789]\d)'), lambda match: '19'), # DIE ZEIT number with full year
        (re.compile(r'(?<=Z[Ee][Ii][Tt] Nr\. \d/)(?=[56789]\d)'), lambda match: '19'),    # DIE ZEIT number with full year
        (re.compile(r'(?<=ANALYSE UND MEINUNG) '), lambda match: ': '), # "ANALYSE UND MEINUNG" with following colon
        # better layout and saving space for the bottom of each article
        (re.compile(r"]</span>\n[ ]*</a>\n[ ]*<br />\n[ ]*(?=<a href='[\w]*.xhtm[#\w_]*' class='toc_link'>)"), lambda match: ']</span>\n</a>\n'), # remove line breaks for navi links
        (re.compile(r'\[zum Inhaltsverzeichnis]'), lambda match: '[Inhalt]'), # shorten Inhaltsverzeichnis
        (re.compile(r'\[zum Ressort '), lambda match: '['), # shorten Ressort
        (re.compile(u'\[zur Übersicht '), lambda match: '['), # shorten Übersicht
    ]

    def build_index(self):
        """Log in to the ZEIT subscriber area, fetch the EPUB and unpack it.

        The EPUB linked on the subscriber page is downloaded into a temporary
        file and extracted into ``self.output_dir``.  Every (X)HTML file found
        by ``walk('.')`` is then rewritten in place after all patterns of
        ``self.preprocess_regexps`` have been applied to it.  Finally the
        cover is downloaded and registered as the ``cover`` conversion option.

        Returns the path of ``content.opf`` inside ``self.output_dir``.

        Errors raised by the browser (e.g. a login form or download link that
        can no longer be found) are not caught here and abort the download.
        """
        domain = "https://premium.zeit.de"
        url = domain + "/abo/zeit_digital"
        browser = self.get_browser()

        # new login process
        response = browser.open(url)
        # The login form is nested inside another form; drop the inner opening
        # tag so that the form can be selected below.
        response.set_data(response.get_data().replace('<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">', ''))
        browser.set_response(response)
        # find correct form and submit
        browser.select_form(nr=2)
        browser.form['name'] = self.username
        browser.form['pass'] = self.password
        browser.submit()

        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Download als Datei im ePub-Format.*'))
        # closing() releases the HTTP response even if the download fails
        with closing(browser.follow_link(epublink)) as response:
            self.report_progress(1, _('next step'))

            tmp = PersistentTemporaryFile(suffix='.epub')
            try:
                self.report_progress(0, _('downloading epub'))
                tmp.write(response.read())
            finally:
                # close exactly once, also when reading/writing raised
                tmp.close()

        # closing() makes sure the archive handle is released after extraction
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            self.report_progress(0, _('extracting epub'))
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1, _('epub downloaded and extracted'))

        # doing regular expression filtering
        # NOTE(review): walk('.') scans the current working directory, so this
        # presumably relies on it being self.output_dir -- confirm.
        for path in walk('.'):
            extension = os.path.splitext(path)[1]
            if extension.lower() in ('.html', '.htm', '.xhtml'):
                with open(path, 'r+b') as f:
                    raw = f.read().decode('utf-8')
                    for pat, func in self.preprocess_regexps:
                        raw = pat.sub(func, raw)
                    # overwrite the file with the filtered text
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0, _('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            domain = "https://premium.zeit.de"
            url = domain + "/abo/zeit_digital"
            browser = self.get_browser()

            # new login process
            response = browser.open(url)
            # get rid of nested form
            response.set_data(response.get_data().replace('<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">', ''))
            browser.set_response(response)
            # find correct form and submit
            browser.select_form(nr=2)
            browser.form['name']=self.username
            browser.form['pass']=self.password
            browser.submit()
            # actual cover search
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper*'))
            cover_url = urlparse(pdflink.base_url)[0]+'://'+urlparse(pdflink.base_url)[1]+''+(urlparse(pdflink.url)[2]).replace('ePaper_','').replace('.pdf','_001.pdf')
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                cdata = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            stream = cStringIO.StringIO(cdata)
            cdata = None
            mi = get_metadata(stream)
            if mi.cover_data and mi.cover_data[1]:
                cdata = mi.cover_data[1]

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(cdata, cpath)
            cover_url = cpath

        except:
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except:
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url
tobias2 is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
"Zeit-Odyssee"-Trilogie droht das "dunkle Turm"-Schicksal ThR E-Books 4 02-10-2010 05:18 AM
Pocketbook wurde als "Hersteller des Jahres 2009" bei lesen.net ausgewählt! Forkosigan Deutsches Forum 2 12-30-2009 02:55 PM
Google Editions: Auch "Kindle"-Inhalte sind geplant Marc_liest Amazon Kindle 2 12-14-2009 05:55 AM
libreka! unterstützt ab sofort auch "harten" Kopierschutz mtravellerh Deutsches Forum 1 06-17-2009 04:39 PM


All times are GMT -4. The time now is 11:51 PM.


MobileRead.com is a privately owned, operated and funded community.