I have created a new recipe for ZEIT Premium (subscription only).
It downloads all E-Books the page has to offer:
- The main newspaper Die Zeit in all offered formats (epub, mobi, pdf, and a zip with all audiobooks of the newspaper). All formats are imported into calibre db as one logical book entry.
- Zeit Magazin (pdf) imported in its own new book entry.
The user can easily switch the individual download formats on or off via the flags at the top of the recipe.
I think this is the first recipe to download PDFs and other ready-made files, so it might be interesting for other recipe developers, too.
Code:
import re, zipfile, os
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
# Download switches, read by ZeitPremiumAllFormats.build_index().
# Set a flag to False to skip that download. The epub of Die Zeit has no
# switch: build_index() always downloads and unpacks it.
# Die Zeit as a mobi file.
GET_MOBI=False
# Die Zeit as a pdf file.
GET_PDF=True
# Zip archive with all audio files of the current Die Zeit.
GET_AUDIO=True
# Zeit Magazin as a pdf file (added to calibre via a separate directory).
GET_MAGAZIN=True
class ZeitPremiumAllFormats(BasicNewsRecipe):
    """Download the ready-made e-book files of the current ZEIT Premium issue.

    Instead of fetching articles and converting them, the recipe logs in
    to premium.zeit.de, downloads the epub of Die Zeit and hands its
    unpacked contents back to the recipe system.  Depending on the
    module-level GET_* switches it additionally downloads the mobi, pdf
    and audio-zip versions of Die Zeit and the Zeit Magazin pdf, and adds
    them to the calibre database by calling ``calibredb add -1``.
    """

    title = u'Zeit Premium All Formats'
    description = u'Lädt alle angebotenen E-Book Formate der aktuellen Woche aus dem Zeit Premium Bereich (kostenpflichtiges Abo). Dies beinhaltet für Die Zeit die Formate epub, mobi, pdf und alle Audiofiles als zip. Sie werden in der Calibre Datenbank als ein einziges Buch eingetragen. Des weiteren das Zeit Magazin als pdf als eigenständiges Buch. Aus technischen Gründen wird ein dritter Bucheintrag erstellt, der Die Zeit in einer abgewandelten epub Version erhält. Dieser Eintrag kann getrost gelöscht werden. Alle Formate ausser epub können ein- oder ausgeschaltet werden. Anmerkung: Während der Umstellung auf eine neue Ausgabe (Mittwoch abends) werden nicht alle Formate gleichzeitig erneuert. Im Calibre Eintrag können dann die verschiedenen Formate zu verschiedenen Ausgaben gehören! ___Getestet unter Unix___ - unter anderen Betriebssystemen funktioniert dieses recipe möglicherweise nicht.'
    __author__ = 'Achim Schumacher'
    language = 'de'
    needs_subscription = True
    conversion_options = {
        'no_default_epub_cover' : True,
    }

    #
    # Login process required:
    # Override BasicNewsRecipe.get_browser()
    #
    def get_browser(self):
        """Return a browser that has submitted the premium.zeit.de login form.

        The form is filled with ``self.username`` and ``self.password``.
        After the call the browser's current page is the response to the
        login submit; build_index() searches that page for the download
        links.
        """
        # Pass self explicitly: get_browser is documented as an instance
        # method of BasicNewsRecipe.
        br = BasicNewsRecipe.get_browser(self)
        # new login process
        domain = "https://premium.zeit.de"
        response = br.open(domain)
        # Get rid of nested form: everything from the nested <form> tag to
        # the end of that line is cut out before the page is parsed.
        response.set_data(re.sub('<div><form action=.*', '', response.get_data()))
        br.set_response(response)
        br.select_form(nr=2)
        br.form['name'] = self.username
        br.form['pass'] = self.password
        br.submit()
        return br

    def _save_response(self, response, suffix, directory):
        """Write the body of *response* to a new temp file in *directory*.

        The file is closed even if reading or writing fails.  Returns the
        path of the file that was written.
        """
        target = PersistentTemporaryFile(suffix=suffix, dir=directory)
        try:
            target.write(response.read())
        finally:
            target.close()
        return target.name

    def _add_to_calibre(self, directory):
        """Run ``calibredb add -1`` on *directory*.

        The command is passed as an argument list, not as a shell string,
        so a directory name containing spaces or shell metacharacters is
        handed over unchanged.  A failure is reported on stdout but does
        not abort the recipe (the exit status used to be ignored silently).
        """
        import subprocess
        try:
            status = subprocess.call(['calibredb', 'add', '-1', directory])
        except OSError as err:
            print("Could not run calibredb for %s: %s" % (directory, err))
            return
        if status != 0:
            print("calibredb add failed for %s (exit status %s)" % (directory, status))

    # Do not fetch news and convert them to E-Books.
    # Instead, download the epub directly from the site.
    # For this, override BasicNewsRecipe.build_index()
    #
    def build_index(self):
        """Download all enabled formats and return the path of the epub's OPF.

        Returns the path ``content.opf`` inside ``self.output_dir``, where
        the downloaded epub of Die Zeit has been extracted.
        """
        browser = self.get_browser()

        # find the links -- only for the formats that are switched on, so
        # that a link missing on the page cannot abort a run that would
        # never have used it.
        epublink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im ePub-Format.*'))
        print("Found epub-link: %s" % epublink.url)
        mobilink = pdflink = audiolink = None
        if GET_MOBI:
            mobilink = browser.find_link(text_regex=re.compile('.*Ausgabe als Datei im Mobi-Format.*'))
            print("Found Mobi-link: %s" % mobilink.url)
        # The pdf link is also the source of the edition name and of the
        # Zeit Magazin url, so it is needed for GET_MAGAZIN as well.
        if GET_PDF or GET_MAGAZIN:
            pdflink = browser.find_link(text_regex=re.compile('.*Download der gesamten Ausgabe als PDF Datei.*'))
            print("Found pdf-link: %s" % pdflink.url)
        if GET_AUDIO:
            audiolink = browser.find_link(text_regex=re.compile('.*Alle Audios der aktuellen ZEIT.*'))
            print("Found audio-link: %s" % audiolink.url)

        edition = zm_url = None
        if pdflink is not None:
            pdf_path = urlparse(pdflink.url).path
            base = urlparse(pdflink.base_url)
            # The edition is what remains of the pdf path once the fixed
            # prefix and the extension are stripped.
            edition = pdf_path.replace('/system/files/epaper/DZ/pdf/DZ_ePaper_', '').replace('.pdf', '')
            # The Zeit Magazin url is derived from the Die Zeit pdf url by
            # swapping the DZ path components for the ZM ones.
            zm_url = '%s://%s%s' % (
                base.scheme, base.netloc,
                pdf_path.replace('DZ/pdf/DZ_ePaper', 'ZM/pdf/ZM_ePaper'))
            print("Found ZM-link: %s" % zm_url)
            print("This edition is: %s" % edition)
        # TODO: Test for other books that are only published once in a while
        # (e.g., Die Zeit Beilage)

        # The following part is from a recipe by Starsom17
        #
        # It modifies build_index, which is the method that gets the
        # masthead image and cover, parses the feed for articles, retrieves
        # the articles, removes tags from articles, etc. All of those steps
        # ultimately produce a local directory structure that looks like an
        # unzipped EPUB.
        #
        # This part grabs the link to one EPUB, saves the EPUB locally,
        # extracts it, and passes the result back into the recipe system
        # as though all the other steps had been completed normally.
        #
        # This has to be done, even if one does not want to use this
        # calibre-modified epub. Otherwise, the recipe runs into an error.
        # This is the reason why there shows up a second Die Zeit entry
        # in calibre db.
        self.report_progress(0, _('downloading epub'))
        response = browser.follow_link(epublink)
        # We need two different directories for Die Zeit and Zeit Magazin
        DZdir = PersistentTemporaryDirectory()
        ZMdir = PersistentTemporaryDirectory()
        epub_path = self._save_response(response, '.epub', DZdir)
        self.report_progress(0.1, _('extracting epub'))
        zfile = zipfile.ZipFile(epub_path, 'r')
        try:
            zfile.extractall(self.output_dir)
        finally:
            # Close the archive handle (previously the already closed
            # temp file was closed a second time instead).
            zfile.close()
        index = os.path.join(self.output_dir, 'content.opf')
        self.report_progress(0.2, _('epub downloaded and extracted'))

        #
        # Now, download the remaining files
        #
        print("output_dir is: %s" % self.output_dir)
        print("DZdir is: %s" % DZdir)
        print("ZMdir is: %s" % ZMdir)
        extra_formats = (
            (GET_MOBI, 0.3, _('downloading mobi'), mobilink, '.mobi'),
            (GET_PDF, 0.4, _('downloading pdf'), pdflink, '.pdf'),
            (GET_AUDIO, 0.5, _('downloading audio'), audiolink, '.mp3.zip'),
        )
        for enabled, progress, message, link, suffix in extra_formats:
            if not enabled:
                continue
            self.report_progress(progress, message)
            # Step back from the previous download before following the
            # next link.
            browser.back()
            self._save_response(browser.follow_link(link), suffix, DZdir)

        # Get all Die Zeit formats into Calibre's database
        self.report_progress(0.6, _('Adding Die Zeit to Calibre db'))
        self._add_to_calibre(DZdir)

        # Zeit Magazin has to be handled differently.
        # First, it has to be downloaded into its own directory, since it
        # is a different book than Die Zeit.
        # Second, we know its url rather than its link.
        # Third, there is no Metadata present, so we need to give it
        # a proper name so that calibre will set Author and Title at import.
        # Unfortunately, the present solution includes a random part in the
        # name which after db import has to be manually resolved by the user.
        if GET_MAGAZIN:
            self.report_progress(0.7, _('downloading ZM'))
            zm_suffix = ' Zeit Magazin '+edition+' - Zeitverlag Gerd Bucerius GmbH und Co. KG.pdf'
            self._save_response(browser.open(zm_url), zm_suffix, ZMdir)
            # Get Zeit Magazin into Calibre's database
            self.report_progress(0.8, _('Adding Zeit Magazin to Calibre db'))
            self._add_to_calibre(ZMdir)
        return index