MobileRead Forums - View Single Post

achims · 11-08-2011, 10:56 AM

Hi all,

I have an updated version of the ZEIT recipe. These are the changes:

- No more system calls to 'calibredb add'. Instead, (modified) internal calibre functions are used. These mods were needed for the next point:

- Set the metadata to your liking. You can set authors and tags. Zeit Magazin and Beilage now have the correct title and author.

Have fun
Achim

Spoiler:

Code:

import sys, re, zipfile, os
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
from calibre.ebooks.metadata import MetaInformation, string_to_authors
from calibre.library.cli import do_add_empty, send_message, write_dirtied, do_add
from calibre.utils.config import prefs
from calibre.library.database2 import LibraryDatabase2



# Switches for the optional downloads. The epub is always downloaded,
# because build_index() needs it to produce its return value.
GET_MOBI = False
GET_PDF = False
GET_AUDIO = False
GET_MAGAZIN = True
GET_BEILAGE = True

# Metadata that is written into every book this recipe adds to the library.
authors = 'Zeitverlag Gerd Bucerius GmbH und Co. KG'
tags = ['Die Zeit']
languages = ['de']

class ZeitPremiumAllFormats(BasicNewsRecipe):
    """Download the publisher's ready-made files of the current Die ZEIT issue.

    Instead of fetching articles and converting them, build_index() logs in
    to premium.zeit.de, downloads the epub (plus mobi, pdf and the audio zip
    when the module-level GET_* switches are on) and adds them to the calibre
    library as one book. Zeit Magazin and the Beilage are added as separate
    pdf books. All books receive the module-level ``authors``, ``tags`` and
    ``languages`` values.
    """

    title          = u'Zeit Premium All Formats'
    description    = u'Lädt alle angebotenen E-Book Formate der aktuellen Woche aus dem Zeit Premium Bereich (kostenpflichtiges Abo): Die Zeit als epub, mobi, pdf und alle Audiofiles als zip. Sie werden in der Calibre Datenbank als ein einziges Buch eingetragen. Das Zeit Magazin und ggfls. die Beilage als pdf als je eigenständiges Buch. Aus technischen Gründen wird ein doppelter Bucheintrag der Zeit erstellt, der ein epub in einer abgewandelten Version erhält. Dieser Eintrag kann gelöscht werden. Alle Formate ausser epub können ein- oder ausgeschaltet werden. Anmerkung: Während der Umstellung auf eine neue Ausgabe (Mittwoch abends) werden nicht alle Formate gleichzeitig erneuert. Im Calibre Eintrag können dann die verschiedenen Formate zu verschiedenen Ausgaben gehören! Bei mehrfachem Aufruf werden Duplikate der Bucheinträge erstellt.'
    __author__ = 'Achim Schumacher'
    language = 'de'
    needs_subscription = True
    conversion_options = {
        'no_default_epub_cover' : True,
    }

    def get_browser(self):
        """Return a browser that has submitted the premium.zeit.de login form.

        Overrides BasicNewsRecipe.get_browser(). build_index() searches the
        page this browser is left on for the download links.
        """
        # The base implementation is called through the class, so the
        # instance has to be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        domain = "https://premium.zeit.de"
        response = br.open(domain)
        # Get rid of the nested form: the rest of every line starting at
        # '<div><form action=' is removed before the page is handed back to
        # the browser for form parsing.
        response.set_data(re.sub('<div><form action=.*', '', response.get_data()))
        br.set_response(response)
        br.select_form(nr=2)
        br.form['name'] = self.username
        br.form['pass'] = self.password
        br.submit()
        return br

    def copy_metadata(self, new, old):
        """Copy every non-empty field of ``new`` onto ``old`` and return ``old``.

        Only title, authors, isbn, tags and languages are considered. ``old``
        is modified in place; fields that are empty in ``new`` keep the value
        they have in ``old``.
        """
        for field in ('title', 'authors', 'isbn', 'tags', 'languages'):
            value = getattr(new, field)
            if value:
                setattr(old, field, value)
        return old

    def import_book_directory(self, db, dirpath, mi2, callback=None):
        """Add all formats found in ``dirpath`` to ``db`` as a single book.

        Variant of calibre's import_book_directory that takes the extra
        argument ``mi2``: every field of ``mi2`` that has data replaces the
        corresponding field of the metadata read from the files.

        Returns ``[(mi, formats)]`` without importing anything when
        ``db.has_book(mi)`` reports the book as already present, and ``None``
        in all other cases (nothing found, no title, or book imported).
        ``callback``, if callable, is called with the title after an import.
        """
        from calibre.ebooks.metadata.meta import metadata_from_formats
        dirpath = os.path.abspath(dirpath)
        found = list(db.find_books_in_directory(dirpath, True))
        # Guard against an empty result before taking the first entry.
        book_formats = found[0] if found else None
        if not book_formats:
            return None
        mi = self.copy_metadata(mi2, metadata_from_formats(book_formats))
        if mi.title is None:
            return None
        if db.has_book(mi):
            return [(mi, book_formats)]
        db.import_book(mi, book_formats)
        if callable(callback):
            callback(mi.title)
        return None

    def do_add(self, db, paths, mi2, add_duplicates):
        """Add the files and directories in ``paths`` to ``db``.

        Variant of calibre's do_add (no recursion, one book per directory)
        that takes the extra argument ``mi2``: every field of ``mi2`` that
        has data replaces the corresponding field of each book's metadata.

        With ``add_duplicates`` true, books that already exist are added
        again; otherwise they are listed on stderr and skipped.
        """
        from calibre.constants import preferred_encoding
        from calibre.ebooks.metadata.meta import get_metadata

        def console(text):
            # Encode unicode explicitly before it is written to stderr.
            if isinstance(text, unicode):
                text = text.encode(preferred_encoding)
            return text

        files, dirs = [], []
        for path in paths:
            path = os.path.abspath(path)
            if os.path.isdir(path):
                dirs.append(path)
            elif os.path.exists(path):
                files.append(path)
            else:
                print('%s not found' % path)

        # The three lists are handed to db.add_books() together, so a file
        # is only recorded once its format and metadata are known.
        book_files, book_formats, metadata = [], [], []
        for book in files:
            ext = os.path.splitext(book)[1][1:]
            if not ext:
                continue
            with open(book, 'rb') as stream:
                mi = get_metadata(stream, stream_type=ext, use_libprs_metadata=True)
            if not mi.title:
                mi.title = os.path.splitext(os.path.basename(book))[0]
            if not mi.authors:
                mi.authors = [_('Unknown')]
            book_files.append(book)
            book_formats.append(ext)
            metadata.append(self.copy_metadata(mi2, mi))

        file_duplicates = None
        if book_files:
            result = db.add_books(book_files, book_formats, metadata,
                                  add_duplicates=add_duplicates)
            if result:
                # The first element describes the rejected duplicates; below
                # its item 0 is read as the paths and item 2 as the metadata.
                file_duplicates = result[0]

        dir_dups = []
        for book_dir in dirs:
            dir_dups.extend(self.import_book_directory(db, book_dir, mi2) or [])

        if add_duplicates:
            for mi, dup_formats in dir_dups:
                db.import_book(self.copy_metadata(mi2, mi), dup_formats)
        elif dir_dups or file_duplicates:
            sys.stderr.write(console(_('The following books were not added as '
                                       'they already exist in the database '
                                       '(see --duplicates option):')) + '\n')
            for mi, dup_formats in dir_dups:
                sys.stderr.write('\t %s:\n' % console(mi.title))
                for path in dup_formats:
                    sys.stderr.write('\t\t  %s\n' % console(path))
            if file_duplicates:
                for path, mi in zip(file_duplicates[0], file_duplicates[2]):
                    sys.stderr.write('\t %s:\n' % console(mi.title))
                    sys.stderr.write('\t\t  %s\n' % console(path))

        write_dirtied(db)
        send_message()

    def _save_response(self, response, dirpath, suffix):
        """Write the body of ``response`` to a new temporary file in ``dirpath``.

        Returns the path of the file; the file is closed even if reading the
        response fails.
        """
        target = PersistentTemporaryFile(suffix=suffix, dir=dirpath)
        try:
            target.write(response.read())
        finally:
            target.close()
        return target.name

    def _download_link(self, browser, link, dirpath, suffix):
        """Step back to the page ``link`` was found on, follow it and save it."""
        browser.back()
        return self._save_response(browser.follow_link(link), dirpath, suffix)

    def _issue_metadata(self, title):
        """Return metadata with ``title`` and the module-level defaults."""
        mi = MetaInformation(None)
        mi.title = title
        mi.authors = string_to_authors(authors)
        # Copies keep the books from sharing the module-level lists.
        mi.tags = list(tags)
        mi.languages = list(languages)
        return mi

    def _add_supplement(self, browser, db, url, dirpath, short, name, title,
                        progress_download, progress_add):
        """Download the pdf at ``url`` into ``dirpath`` and add it as a book.

        A failed download is reported through report_progress() and printed,
        and the supplement is skipped. Errors raised while adding the book to
        the database are not caught.
        """
        self.report_progress(progress_download, _('downloading %s') % short)
        try:
            # Open the url before creating the file, so that a failed
            # request leaves no empty pdf behind.
            response = browser.open(url)
            self._save_response(response, dirpath, '.pdf')
        except Exception as err:
            self.report_progress(progress_add, _('No %s found...') % name)
            print('Download of %s failed: %s' % (url, err))
            return
        self.report_progress(progress_add, _('Adding %s to Calibre db') % name)
        self.do_add(db, [dirpath], self._issue_metadata(title), True)

    def build_index(self):
        """Download the current issue and return the path of its content.opf.

        Overrides BasicNewsRecipe.build_index(): no articles are fetched.
        The epub is downloaded and extracted into ``self.output_dir`` and the
        path ``<output_dir>/content.opf`` is returned, so that the recipe
        system continues as though it had built that directory itself (this
        part follows a recipe by Starson17). This has to be done even if the
        calibre-modified epub is not wanted, otherwise the recipe runs into
        an error; it is the reason for the second Die Zeit entry in the
        calibre db.

        As a side effect, the downloaded files are added to the library at
        ``prefs['library_path']``.
        """
        browser = self.get_browser()
        # Get access to the database
        db = LibraryDatabase2(os.path.abspath(prefs['library_path']))

        # Find the links while the browser is still on the page reached
        # after login. The pdf link is always needed because the edition and
        # the Magazin/Beilage urls are derived from it; mobi and audio are
        # only looked up when they will be downloaded, so a missing link for
        # a disabled format cannot abort the recipe.
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))
        pdflink = browser.find_link(text_regex=re.compile('.*Download der gesamten Ausgabe als PDF Datei.*'))
        mobilink = audiolink = None
        if GET_MOBI:
            mobilink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im Mobi-Format.*'))
        if GET_AUDIO:
            audiolink = browser.find_link(text_regex=re.compile('.*Alle Audios der aktuellen ZEIT.*'))

        # The pdf file name ends in '<first>_<second>.pdf'; the edition is
        # shown as '20<second> - <first>'.
        pdf_path = urlparse(pdflink.url)[2]
        parts = pdf_path.replace('/system/files/epaper/DZ/pdf/DZ_ePaper_', '').replace('.pdf', '').split('_')
        if len(parts) < 2:
            raise ValueError('Cannot determine the edition from pdf link %s' % pdflink.url)
        edition = '20' + parts[1] + ' - ' + parts[0]

        # Zeit Magazin and Beilage have no link of their own; their urls are
        # derived from the pdf path by swapping the DZ directory and prefix.
        base = urlparse(pdflink.base_url)
        site = base[0] + '://' + base[1]
        zm_url = site + pdf_path.replace('DZ/pdf/DZ_ePaper', 'ZM/pdf/ZM_ePaper')
        bl_url = site + pdf_path.replace('DZ/pdf/DZ_ePaper', 'BL/pdf/BL_ePaper')

        print("Found epub-link: %s" % epublink.url)
        if mobilink is not None:
            print("Found Mobi-link: %s" % mobilink.url)
        print("Found pdf-link: %s" % pdflink.url)
        if audiolink is not None:
            print("Found audio-link: %s" % audiolink.url)
        print("Will try ZM-link: %s" % zm_url)
        print("Will try BL-link: %s" % bl_url)
        print("This edition is: %s" % edition)

        # Die Zeit, Zeit Magazin and Beilage are separate books and
        # therefore need separate directories.
        DZdir = PersistentTemporaryDirectory(prefix='DZ_')
        ZMdir = PersistentTemporaryDirectory(prefix='ZM_')
        BLdir = PersistentTemporaryDirectory(prefix='BL_')

        self.report_progress(0, _('downloading epub'))
        epub_path = self._save_response(browser.follow_link(epublink), DZdir, '.epub')
        self.report_progress(0.1, _('extracting epub'))
        zfile = zipfile.ZipFile(epub_path, 'r')
        try:
            zfile.extractall(self.output_dir)
        finally:
            zfile.close()
        index = os.path.join(self.output_dir, 'content.opf')
        self.report_progress(0.2, _('epub downloaded and extracted'))

        #
        # Now, download the remaining files
        #
        print("output_dir is: %s" % self.output_dir)
        print("DZdir is: %s" % DZdir)
        print("ZMdir is: %s" % ZMdir)
        print("BLdir is: %s" % BLdir)

        if GET_MOBI:
            self.report_progress(0.3, _('downloading mobi'))
            self._download_link(browser, mobilink, DZdir, '.mobi')

        if GET_PDF:
            self.report_progress(0.4, _('downloading pdf'))
            self._download_link(browser, pdflink, DZdir, '.pdf')

        if GET_AUDIO:
            self.report_progress(0.5, _('downloading audio'))
            self._download_link(browser, audiolink, DZdir, '.mp3.zip')

        # Get all Die Zeit formats into Calibre's database
        self.report_progress(0.6, _('Adding Die Zeit to Calibre db'))
        self.do_add(db, [DZdir], self._issue_metadata("Die ZEIT " + edition), True)

        # Zeit Magazin is a different book than Die Zeit, is fetched by url
        # rather than by link, and gets its metadata from this recipe.
        if GET_MAGAZIN:
            self._add_supplement(browser, db, zm_url, ZMdir, 'ZM', 'Zeit Magazin',
                                 "ZEIT Magazin " + edition, 0.7, 0.8)

        # Zeit Beilage is technically the same as Zeit Magazin, but it is
        # not included in every edition, so a failed download is expected.
        if GET_BEILAGE:
            self._add_supplement(browser, db, bl_url, BLdir, 'BL', 'Zeit Beilage',
                                 "ZEIT Beilage " + edition, 0.9, 0.9)

        return index