Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 07-16-2011, 10:08 AM   #16
Moik
Enthusiast
Moik began at the beginning.
 
Posts: 46
Karma: 10
Join Date: Oct 2010
Device: Kindle 3
Could anybody confirm if he/she is still successfully using the recipe? It doesn't work for me anymore: http://www.mobileread.com/forums/sho...d.php?t=141110

Edit: Found a solution here: http://www.mobileread.com/forums/sho...number&page=10

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.5'

"""
Die Zeit EPUB
"""

import os, urllib2, zipfile, re, string, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):
    """Recipe that fetches the subscriber EPUB edition of DIE ZEIT.

    Instead of assembling articles from feeds, ``build_index`` logs in to
    premium.zeit.de, downloads the publisher's EPUB, unpacks it into
    ``self.output_dir`` and runs ``preprocess_regexps`` over every
    (X)HTML file to repair typographic problems. ``get_cover_url`` tries to
    obtain the cover from the first page of the e-paper PDF and falls back
    to lower-resolution images when that fails.
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    # (pattern, replacement-callable) pairs. build_index applies them in list
    # order to each extracted HTML file, so later entries see the output of
    # earlier ones; several entries exist only to undo side effects of a
    # previous entry.
    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        # NOTE(review): the next entry inserts an en dash (U+2013) between a digit and the
        # Euro sign although it is listed under "missing spaces" - confirm this is intended.
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
        (re.compile(r':(?=[^\d\s</])'), lambda match: ': '), # missing space after colon
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        # look-behinds must be fixed-width, hence one entry per numeral length
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=[A-Za-z]),(?=[A-Za-z])'), lambda match: ', '), # missing space after comma
        (re.compile(r'(?<=[a-z])\.(?=[A-Z][A-Za-z])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(r'(?<=<p class="absatz">[A-Z]) (?=[a-z\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        # NOTE(review): the '.' in the next pattern is unescaped and therefore matches any
        # character, not only a full stop - confirm whether r'\. ' was meant.
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        # each of the following identical passes adds one more separator to the left
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
    ]

    def _open_subscriber_session(self):
        """Log in to the subscription page and return the browser used.

        Opens the "zeit_digital" page, fills ``self.username`` and
        ``self.password`` into the third form on the page (``nr=2``) and
        submits it. The returned browser is left on the page that the
        submit leads to, so callers can search it for download links.
        """
        domain = "https://premium.zeit.de"
        url = domain + "/abo/zeit_digital"
        browser = self.get_browser()

        # new login process
        browser.open(url)
        browser.select_form(nr=2)
        browser.form['name']=self.username
        browser.form['pass']=self.password
        browser.submit()
        return browser

    def build_index(self):
        """Download and unpack the current EPUB and return its OPF path.

        Steps: log in, follow the "Ausgabe als Datei im ePub-Format" link,
        store the response in a temporary ``.epub`` file, extract it into
        ``self.output_dir``, apply ``preprocess_regexps`` to all HTML files,
        then download the cover and register it in ``conversion_options``.

        Returns:
            Path of ``content.opf`` inside ``self.output_dir``.
        """
        browser = self._open_subscriber_session()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))

        # closing() releases the HTTP response even if writing the file fails
        with closing(browser.follow_link(epublink)) as response:
            self.report_progress(1,_('next step'))

            tmp = PersistentTemporaryFile(suffix='.epub')
            self.report_progress(0,_('downloading epub'))
            try:
                tmp.write(response.read())
            finally:
                tmp.close()

        # closing() makes sure the archive handle is released after extraction
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            self.report_progress(0,_('extracting epub'))
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering
        # NOTE(review): this walks the current working directory rather than
        # self.output_dir - presumably the two coincide; confirm.
        for path in walk('.'):
            extension = os.path.splitext(path)[1]
            if extension.lower() not in ('.html', '.htm', '.xhtml'):
                continue
            # rewrite the file in place: read all, filter, truncate, write back
            with open(path, 'r+b') as f:
                raw = f.read().decode('utf-8')
                for pat, func in self.preprocess_regexps:
                    raw = pat.sub(func, raw)
                f.seek(0)
                f.truncate()
                f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return the location of the best available cover image.

        Three sources are tried in order:

        1. The cover embedded in the first-page PDF of the e-paper, which is
           saved as ``cover.jpg`` in ``self.output_dir`` (a local path is
           returned in this case).
        2. The image found on http://www.zeit.de/inhalt.
        3. A fixed, old title-page image.

        A failure in one step is logged and the next step is tried.
        ``except Exception`` is used so that ``KeyboardInterrupt`` and
        ``SystemExit`` are not swallowed.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            browser = self._open_subscriber_session()
            # actual cover search
            # NOTE(review): the trailing '*' is a regex quantifier on 'r', not a
            # glob wildcard - kept as is, confirm intent.
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper*'))
            # scheme and host come from the page, the path from the link itself;
            # the file name is rewritten to point at the single-page PDF
            base = urlparse(pdflink.base_url)
            pdf_path = urlparse(pdflink.url)[2]
            cover_url = base[0]+'://'+base[1]+''+pdf_path.replace('ePaper_','').replace('.pdf','_001.pdf')
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                cdata = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            mi = get_metadata(cStringIO.StringIO(cdata))
            if not (mi.cover_data and mi.cover_data[1]):
                # trigger the fallback explicitly instead of handing None to
                # save_cover_data_to
                raise ValueError('no cover image found in PDF metadata')

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(mi.cover_data[1], cpath)
            cover_url = cpath

        except Exception as err:
            self.log.warning('PDF-based cover failed: %r' % (err,))
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except Exception:
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url

Last edited by Moik; 07-16-2011 at 01:46 PM.
Moik is offline   Reply With Quote
Old 10-28-2011, 07:11 AM   #17
Divingduck
Fanatic
Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.
 
Posts: 558
Karma: 59934
Join Date: Nov 2010
Location: Germany
Device: Sony PRS-650
Hello, in the German forum there is a report from a user for whom the current Zeit-Abo download does not work. I don't have an account to check this. Maybe someone here can check whether the recipe works in general.
His post is this one: http://www.mobileread.com/forums/sho...d.php?t=154764
Divingduck is offline   Reply With Quote
Old 11-03-2011, 07:51 AM   #18
achims
Member
achims began at the beginning.
 
Posts: 24
Karma: 12
Join Date: Oct 2011
Device: Xperia Active, Iconia A500, Galaxy I5500
The Zeit epub download recipe did not work anymore for some time, resulting in a "nested form" error.

Here is a working version.
Have fun

Achim

Code:
#!/usr/bin/env  python2
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Steffen Siebert <calibre at steffensiebert.de>'
__docformat__ = 'restructuredtext de'
__version__   = '1.5'

"""
Die Zeit EPUB
"""

import os, zipfile, re, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from calibre import walk
from urlparse import urlparse
from contextlib import closing
from calibre.utils.magick.draw import save_cover_data_to

class ZeitEPUBAbo(BasicNewsRecipe):
    """Recipe that fetches the subscriber EPUB edition of DIE ZEIT.

    Instead of assembling articles from feeds, ``build_index`` logs in to
    premium.zeit.de, downloads the publisher's EPUB, unpacks it into
    ``self.output_dir`` and runs ``preprocess_regexps`` over every
    (X)HTML file to repair typographic problems. ``get_cover_url`` tries to
    obtain the cover from the first page of the e-paper PDF and falls back
    to lower-resolution images when that fails.
    """

    title = u'Die Zeit'
    description = u'Das EPUB Abo der Zeit (needs subscription)'
    language = 'de'
    lang = 'de-DE'

    __author__ = 'Steffen Siebert, revised by Tobias Isenberg (with some code by Kovid Goyal)'
    needs_subscription = True

    conversion_options = {
        'no_default_epub_cover' : True,
        # fixing the wrong left margin
        'mobi_ignore_margins' : True,
        'keep_ligatures' : True,
    }

    # (pattern, replacement-callable) pairs. build_index applies them in list
    # order to each extracted HTML file, so later entries see the output of
    # earlier ones; several entries exist only to undo side effects of a
    # previous entry.
    preprocess_regexps    = [
        # filtering for correct dashes ("Gedankenstrich" and "bis")
        (re.compile(u' (-|\u2212)(?=[ ,])'), lambda match: u' \u2013'),
        (re.compile(r'(?<=\d)-(?=\d)'), lambda match: u'\u2013'), # number-number
        (re.compile(u'(?<=\d,)-(?= ?\u20AC)'), lambda match: u'\u2013'), # ,- Euro
        # fix the number dash number dash for the title image that was broken by the previous line
        (re.compile(u'(?<=\d\d\d\d)\u2013(?=\d?\d\.png)'), lambda match: '-'),
        # filtering for certain dash cases
        (re.compile(r'Bild - Zeitung'), lambda match: 'Bild-Zeitung'), # the obvious
        (re.compile(r'EMail'), lambda match: 'E-Mail'), # the obvious
        (re.compile(r'SBahn'), lambda match: 'S-Bahn'), # the obvious
        (re.compile(r'UBoot'), lambda match: 'U-Boot'), # the obvious
        (re.compile(r'T Shirt'), lambda match: 'T-Shirt'), # the obvious
        (re.compile(r'TShirt'), lambda match: 'T-Shirt'), # the obvious
        # the next two lines not only fix errors but also create new ones. this is due to additional errors in
        # the typesetting such as missing commas or wrongly placed dashes. but more is fixed than broken.
        (re.compile(r'(?<!und|der|\w\w,) -(?=\w)'), lambda match: '-'), # space too much before a connecting dash
        (re.compile(r'(?<=\w)- (?!und\b|oder\b|wie\b|aber\b|auch\b|sondern\b|bis\b|&amp;|&\s|bzw\.|auf\b|eher\b)'), lambda match: '-'), # space too much after a connecting dash
        # filtering for missing spaces before the month in long dates
        (re.compile(u'(?<=\d)\.(?=(Januar|Februar|M\u00E4rz|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember))'), lambda match: '. '),
        # filtering for other missing spaces
        (re.compile(r'Stuttgart21'), lambda match: 'Stuttgart 21'), # the obvious
        # NOTE(review): the next entry inserts an en dash (U+2013) between a digit and the
        # Euro sign although it is listed under "missing spaces" - confirm this is intended.
        (re.compile(u'(?<=\d)(?=\u20AC)'), lambda match: u'\u2013'), # Zahl[no space]Euro
        (re.compile(r':(?=[^\d\s</])'), lambda match: ': '), # missing space after colon
        (re.compile(u'\u00AB(?=[^\-\.:;,\?!<\)\s])'), lambda match: u'\u00AB '), # missing space after closing quotation
        (re.compile(u'(?<=[^\s\(>])\u00BB'), lambda match: u' \u00BB'), # missing space before opening quotation
        (re.compile(r'(?<=[a-z])(?=(I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\.)'), lambda match: ' '), # missing space before Roman numeral
        # look-behinds must be fixed-width, hence one entry per numeral length
        (re.compile(r'(?<=(I|V|X)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(II|IV|VI|IX|XI|XV|XX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(III|VII|XII|XIV|XVI|XIX)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(VIII|XIII|XVII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=(XVIII)\.)(?=[\w])'), lambda match: ' '), # missing space after Roman numeral
        (re.compile(r'(?<=[A-Za-z]),(?=[A-Za-z])'), lambda match: ', '), # missing space after comma
        (re.compile(r'(?<=[a-z])\.(?=[A-Z][A-Za-z])'), lambda match: '. '), # missing space after full-stop
        (re.compile(r'(?<=[uU]\.) (?=a\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[iI]\.) (?=A\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=[zZ]\.) (?=B\.)'), lambda match: u'\u2008'), # fix abbreviation that was potentially broken previously
        (re.compile(r'(?<=\w\.) (?=[A-Z][a-z]*@)'), lambda match: ''), # fix e-mail address that was potentially broken previously
        (re.compile(r'(?<=\d)[Pp]rozent'), lambda match: ' Prozent'),
        (re.compile(r'\.\.\.\.+'), lambda match: '...'), # too many dots (....)
        (re.compile(r'(?<=[^\s])\.\.\.'), lambda match: ' ...'), # spaces before ...
        (re.compile(r'\.\.\.(?=[^\s])'), lambda match: '... '), # spaces after ...
        (re.compile(r'(?<=[\[\(]) \.\.\. (?=[\]\)])'), lambda match: '...'), # fix special cases of ... in brackets
        (re.compile(u'(?<=[\u00BB\u203A]) \.\.\.'), lambda match: '...'), # fix special cases of ... after a quotation mark
        (re.compile(u'\.\.\. (?=[\u00AB\u2039,])'), lambda match: '...'), # fix special cases of ... before a quotation mark or comma
        # fix missing spaces between numbers and any sort of units, possibly with dot
        (re.compile(r'(?<=\d)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        (re.compile(r'(?<=\d\.)(?=(Femto|Piko|Nano|Mikro|Milli|Zenti|Dezi|Hekto|Kilo|Mega|Giga|Tera|Peta|Tausend|Trilli|Kubik|Quadrat|Meter|Uhr|Jahr|Schuljahr|Seite))'), lambda match: ' '),
        # fix wrong spaces
        (re.compile(r'(?<=<p class="absatz">[A-Z]) (?=[a-z\-])'), lambda match: ''), # at beginning of paragraphs
        (re.compile(u' \u00AB'), lambda match: u'\u00AB '), # before closing quotation
        (re.compile(u'\u00BB '), lambda match: u' \u00BB'), # after opening quotation
        # filtering for spaces in large numbers for better readability
        (re.compile(r'(?<=\d\d)(?=\d\d\d[ ,\.;\)<\?!-])'), lambda match: u'\u2008'), # end of the number with some character following
        # NOTE(review): the '.' in the next pattern is unescaped and therefore matches any
        # character, not only a full stop - confirm whether r'\. ' was meant.
        (re.compile(r'(?<=\d\d)(?=\d\d\d. )'), lambda match: u'\u2008'), # end of the number with full-stop following, then space is necessary (avoid file names)
        # each of the following identical passes adds one more separator to the left
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        (re.compile(u'(?<=\d)(?=\d\d\d\u2008)'), lambda match: u'\u2008'), # next level
        # filtering for unicode characters that are missing on the Kindle,
        # try to replace them with meaningful work-arounds
        (re.compile(u'\u2080'), lambda match: '<span style="font-size: 40%;">0</span>'), # subscript-0
        (re.compile(u'\u2081'), lambda match: '<span style="font-size: 40%;">1</span>'), # subscript-1
        (re.compile(u'\u2082'), lambda match: '<span style="font-size: 40%;">2</span>'), # subscript-2
        (re.compile(u'\u2083'), lambda match: '<span style="font-size: 40%;">3</span>'), # subscript-3
        (re.compile(u'\u2084'), lambda match: '<span style="font-size: 40%;">4</span>'), # subscript-4
        (re.compile(u'\u2085'), lambda match: '<span style="font-size: 40%;">5</span>'), # subscript-5
        (re.compile(u'\u2086'), lambda match: '<span style="font-size: 40%;">6</span>'), # subscript-6
        (re.compile(u'\u2087'), lambda match: '<span style="font-size: 40%;">7</span>'), # subscript-7
        (re.compile(u'\u2088'), lambda match: '<span style="font-size: 40%;">8</span>'), # subscript-8
        (re.compile(u'\u2089'), lambda match: '<span style="font-size: 40%;">9</span>'), # subscript-9
        # always change CO2
        (re.compile(r'CO2'), lambda match: 'CO<span style="font-size: 40%;">2</span>'), # CO2
        # remove *** paragraphs
        (re.compile(r'<p class="absatz">\*\*\*</p>'), lambda match: ''),
        # better layout for the top line of each article
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=DIE ZEIT N\u00B0 \d\d /) (?=\d\d)'), lambda match: ' 20'), # proper year in edition number
        (re.compile(u'(?<=>)(?=DIE ZEIT N\u00B0 \d\d / 20\d\d)'), lambda match: u' \u2014 '), # m-dash between category and DIE ZEIT
    ]

    def _open_subscriber_session(self):
        """Log in to the subscription page and return the browser used.

        Opens the "zeit_digital" page, removes the opening tag of the outer
        login form from the HTML (the page nests forms, which the form
        parser rejects), fills ``self.username`` and ``self.password`` into
        the third remaining form (``nr=2``) and submits it. The returned
        browser is left on the page that the submit leads to, so callers can
        search it for download links.
        """
        domain = "https://premium.zeit.de"
        url = domain + "/abo/zeit_digital"
        browser = self.get_browser()

        # new login process
        response = browser.open(url)
        # Get rid of nested form
        response.set_data(response.get_data().replace('<div><form action="/abo/zeit_digital?destination=node%2F94"  accept-charset="UTF-8" method="post" id="user-login-form" class="zol_inlinelabel">', ''))
        browser.set_response(response)
        browser.select_form(nr=2)
        browser.form['name']=self.username
        browser.form['pass']=self.password
        browser.submit()
        return browser

    def build_index(self):
        """Download and unpack the current EPUB and return its OPF path.

        Steps: log in, follow the "Ausgabe als Datei im ePub-Format" link,
        store the response in a temporary ``.epub`` file, extract it into
        ``self.output_dir``, apply ``preprocess_regexps`` to all HTML files,
        then download the cover and register it in ``conversion_options``.

        Returns:
            Path of ``content.opf`` inside ``self.output_dir``.
        """
        browser = self._open_subscriber_session()
        # now find the correct file, we will still use the ePub file
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))

        # closing() releases the HTTP response even if writing the file fails
        with closing(browser.follow_link(epublink)) as response:
            self.report_progress(1,_('next step'))

            tmp = PersistentTemporaryFile(suffix='.epub')
            self.report_progress(0,_('downloading epub'))
            try:
                tmp.write(response.read())
            finally:
                tmp.close()

        # closing() makes sure the archive handle is released after extraction
        with closing(zipfile.ZipFile(tmp.name, 'r')) as zfile:
            self.report_progress(0,_('extracting epub'))
            zfile.extractall(self.output_dir)

        index = os.path.join(self.output_dir, 'content.opf')

        self.report_progress(1,_('epub downloaded and extracted'))

        # doing regular expression filtering
        # NOTE(review): this walks the current working directory rather than
        # self.output_dir - presumably the two coincide; confirm.
        for path in walk('.'):
            extension = os.path.splitext(path)[1]
            if extension.lower() not in ('.html', '.htm', '.xhtml'):
                continue
            # rewrite the file in place: read all, filter, truncate, write back
            with open(path, 'r+b') as f:
                raw = f.read().decode('utf-8')
                for pat, func in self.preprocess_regexps:
                    raw = pat.sub(func, raw)
                f.seek(0)
                f.truncate()
                f.write(raw.encode('utf-8'))

        # adding real cover
        self.report_progress(0,_('trying to download cover image (titlepage)'))
        self.download_cover()
        self.conversion_options["cover"] = self.cover_path

        return index

    # getting url of the cover
    def get_cover_url(self):
        """Return the location of the best available cover image.

        Three sources are tried in order:

        1. The cover embedded in the first-page PDF of the e-paper, which is
           saved as ``cover.jpg`` in ``self.output_dir`` (a local path is
           returned in this case).
        2. The image found on http://www.zeit.de/inhalt.
        3. A fixed, old title-page image.

        A failure in one step is logged and the next step is tried.
        ``except Exception`` is used so that ``KeyboardInterrupt`` and
        ``SystemExit`` are not swallowed.
        """
        self.log.warning('Downloading cover')
        try:
            self.log.warning('Trying PDF-based cover')
            browser = self._open_subscriber_session()
            # actual cover search
            # NOTE(review): the trailing '*' is a regex quantifier on 'r', not a
            # glob wildcard - kept as is, confirm intent.
            pdflink = browser.find_link(url_regex=re.compile('system/files/epaper/DZ/pdf/DZ_ePaper*'))
            # scheme and host come from the page, the path from the link itself;
            # the file name is rewritten to point at the single-page PDF
            base = urlparse(pdflink.base_url)
            pdf_path = urlparse(pdflink.url)[2]
            cover_url = base[0]+'://'+base[1]+''+pdf_path.replace('ePaper_','').replace('.pdf','_001.pdf')
            self.log.warning('PDF link found:')
            self.log.warning(cover_url)
            # download the cover (has to be here due to new login process)
            with closing(browser.open(cover_url)) as r:
                cdata = r.read()
            from calibre.ebooks.metadata.pdf import get_metadata
            mi = get_metadata(cStringIO.StringIO(cdata))
            if not (mi.cover_data and mi.cover_data[1]):
                # trigger the fallback explicitly instead of handing None to
                # save_cover_data_to
                raise ValueError('no cover image found in PDF metadata')

            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(mi.cover_data[1], cpath)
            cover_url = cpath

        except Exception as err:
            self.log.warning('PDF-based cover failed: %r' % (err,))
            self.log.warning('Trying low-res cover')
            try:
                inhalt = self.index_to_soup('http://www.zeit.de/inhalt')
                cover_url = inhalt.find('div', attrs={'class':'singlearchive clearfix'}).img['src'].replace('icon_','')
            except Exception:
                self.log.warning('Using static old low-res cover')
                cover_url = 'http://images.zeit.de/bilder/titelseiten_zeit/1946/001_001.jpg'
        return cover_url
achims is offline   Reply With Quote
Old 12-14-2012, 04:21 AM   #19
Timbo80
Junior Member
Timbo80 began at the beginning.
 
Posts: 6
Karma: 10
Join Date: Nov 2012
Device: Kindle Touch
no need for a recipe anymore

Hi there,
If you have a "Digital-Abo" subscription, DIE ZEIT offers free push e-mail delivery of the current issue on Wednesday evenings (EPUB: to the address you initially subscribed with; .mobi: to a @kindle.com address of your choice).
Log in to your premium.zeit.de account and go to "Kontoinformationen" -> "Einstellungen für automatische Benachrichtigungen/Versand bearbeiten". There you can enter the desired information.

This is quite comfortable, no need to run calibre anymore.

Last edited by Timbo80; 12-14-2012 at 04:24 AM.
Timbo80 is offline   Reply With Quote
Old 02-20-2013, 08:44 AM   #20
bverspeiser
Member
bverspeiser began at the beginning.
 
Posts: 23
Karma: 10
Join Date: Dec 2011
Device: pb 603
Links do not work

I have the following problem:

get the zeit epub from zeit.de
get it via this plugin.

View the documents with an epub reader (I tested it with fbreader on a pocketbook and the calibre epub reader).

Plugin epub: only the links on the main index page work; every other link (e.g. sub-indexes, "next article...") does not work (it is clickable, but you are not taken anywhere)

Original epub: every link works fine

I am not sure when this effect first occurred, but I think it has been happening since images were added to the original epub.

Yes, it is possible to have the epub sent by zeit.de, but I have a cron job that runs some recipes daily/weekly and puts the epubs into my home web server database automatically, and this is definitely easier than importing an epub from an IMAP server
bverspeiser is offline   Reply With Quote
Old 03-03-2013, 09:05 AM   #21
bverspeiser
Member
bverspeiser began at the beginning.
 
Posts: 23
Karma: 10
Join Date: Dec 2011
Device: pb 603
Ah, seems to work again with the improvements of the Zeit recipe with calibre 0.9.21. Thanks.
bverspeiser is offline   Reply With Quote
Old 03-13-2013, 11:12 AM   #22
Timbo80
Junior Member
Timbo80 began at the beginning.
 
Posts: 6
Karma: 10
Join Date: Nov 2012
Device: Kindle Touch
Calibre download?

Hi, what exactly is the advantage of using calibre to download the .epub / .mobi compared to having it emailed to you by "Die Zeit"? I'm trying to figure out why to involve calibre.
Does it format the .mobi into a "periodical" format instead of an "ebook" format? That's the only annoying thing about the .mobi.

Once again in German, because we all know it's the language of love:

Wo ist der Vorteil darin, calibre zu nutzen um das epub/mobi der Zeit herunterzuladen? Der automatische eMail-Versand ist recht praktisch.

Weitere Frage: Konvertiert Calibre das .mobi von "ebook" in "periodical"? Das wäre das einzige, was mir beim eReader Angebot der Zeit noch fehlen würde.
Timbo80 is offline   Reply With Quote
Old 03-13-2013, 11:16 AM   #23
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 136
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 3 WiFi / Nexus 4 (Android)
First of all, there was no official email option back when the script was created.
siebert is offline   Reply With Quote
Old 03-15-2013, 02:20 PM   #24
Divingduck
Fanatic
Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.Divingduck never is beset by a damp, drizzly November in his or her soul.
 
Posts: 558
Karma: 59934
Join Date: Nov 2010
Location: Germany
Device: Sony PRS-650
And second, not everyone owns a reader with integrated WLAN and email support. So, everyone can use what fits best for the own situation.
It’s like life: different people, different needs and different possible solutions – in five words: the Calibre way of life.
Divingduck is offline   Reply With Quote
Old 03-17-2013, 09:29 AM   #25
tobias2
Member
tobias2 began at the beginning.
 
Posts: 18
Karma: 36
Join Date: Feb 2011
Device: Kindle
And third, initially the mobi version was in the periodicals form (not sure what it is like now) and some people like the normal book form better and this you can create with the script. The books, for example, you can place into folders on the Kindle, the periodicals you cannot. Also the navigation I like in the book form better.

And fourth, with the script you can distribute the paper to several kindles (e.g., in your family and/or on a kindle and a tablet), while the e-mailed one can only go to a single kindle.com address.

And fifth, the version that Die Zeit offers at least used to have a lot of typographic and other errors that can be corrected by means of Calibre's automated processing, which is what "my" changed version of one of the scripts that is offered here has been doing (along with adding the full first page as an icon instead of the cut-off version that it used to have).

Hope that helps.

Cheers,

Tobias
tobias2 is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
"DIE ZEIT" im Online-Abo auch als ePub ewy Deutsches Forum 142 12-21-2011 07:41 AM
Google Reader Recipe hack - Download all unread insted of just starred rollercoaster Recipes 82 06-17-2011 04:39 PM
Passing parameters to recipe from "Schedule News Download" Window (e.g. for filtering oecherprinte Recipes 6 05-13-2011 11:38 AM
Error with adding font to EPUB news recipe megabadd Calibre 2 01-11-2010 10:16 AM
How to specify options for epub in a recipe? kiklop74 Calibre 6 02-06-2009 03:43 PM


All times are GMT -4. The time now is 07:59 AM.


MobileRead.com is a privately owned, operated and funded community.