View Single Post
Old 11-27-2010, 06:23 AM   #1
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Exclamation Calibre epub from recipe fails in Sigil and FBReader on Android

Hi,

I created a new recipe (included below) which seems to work fine as long as I open the generated epub file with the calibre viewer.

Using the same epub with FBReader on Android or Sigil reveals some issues though:

FBReader 0.7.17:
All pages show the same picture.
Most navigation links are broken.
Text in the "Kurzkritik" section has the same font size as the "Kurzkritik" heading. The text in the "Kritik" section is fine, though.

Sigil 0.3.1:
All pages show the same picture.
Some navigation links are broken.

I opened the epub with WinRAR and noticed that all HTML and image files are stored under the same filename (index.html and img1.jpg); they are distinguished only by the path in which they are stored. Maybe this is the reason for the behaviour, as I don't experience these issues with epub files in which the filenames differ.

Is this a known issue? Can I do anything within the recipe to prevent it?

Here is the recipe I'm using:
Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = 'Steffen Siebert <calibre at steffensiebert.de>'
__version__   = '1.0'

""" http://film-dienst.de """

import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

class FilmDienst(BasicNewsRecipe):
    """
    Calibre recipe for the film-dienst magazine.

    Only the reviews are fully available without a subscription, so we ignore
    the remaining articles.
    """

    __author__ = 'Steffen Siebert'
    title = 'film-dienst'
    description = 'Filmmagazin'
    publisher = 'Deutsche Zeitung GmbH, Bonn'
    category = 'Film, Germany'
    lang = 'de-DE'
    encoding = "windows-1252"
    language = 'de'
    publication_type = 'magazine'
    # Reviews cannot be converted directly from the site's markup;
    # get_obfuscated_article() rebuilds each one as a minimal HTML page.
    articles_are_obfuscated = True
    use_embedded_content = False
    no_stylesheets = True

    conversion_options = {'comments': description, 'tags': category, 'language': language,
                          'publisher': publisher}

    # Raw strings so regex escapes (\., \s) are not interpreted by Python
    # as (invalid) string escape sequences.
    # Matches the review image; group 1 is the film-dienst ("fd") number.
    IMAGE_RE = re.compile(r'<img src="kritikenimages/([0-9]+)\.jpg"')
    # Groups: 1 = title, 3 = optional image URL, 4 = review body, 5 = optional author.
    REVIEW_RE = re.compile(r'<tr><td><table[^>]+><tr><td class="rubrikgross" align="left">([^<]+)</td><td align="right"><a[^>]+><img [^>]+></a></td></tr></table></td></tr>\s+<tr><td>(<img src="([^"]+)" align="right">)?(.*)</td></tr>\s+<tr><td align="right"><i>([^<]+)?</i></td></tr>', re.DOTALL)
    # Group 1 is the short review ("Kurzkritik") body.
    SHORT_REVIEW_RE = re.compile(r'<tr><td class="rubrikgross">[^<]+</td></tr>\s+<tr><td><img [^>]+>(.*)</td></tr>')
    # Review page URL as produced by parse_index(); group 1 is the review number.
    ARTICLE_URL_RE = re.compile(r"^http://film-dienst\.kim-info\.de/kritiken\.php\?nr=([0-9]+)$")

    def get_obfuscated_article(self, url):
        """
        Download one review and return the path of a temporary HTML file for it.

        The film-dienst pages are very hard to handle with BeautifulSoup,
        so we extract the desired content using regular expressions and create
        a simple html page with minimal formatting to convert into the ebook.

        :param url: review URL as emitted by parse_index()
        :return: file name of the generated temporary HTML page
        :raises ValueError: if *url* does not look like a review URL
        """
        shortReview = None
        imageUrl = None
        fdNumber = None

        result = self.ARTICLE_URL_RE.match(url)
        if result is None:
            raise ValueError("Unexpected review URL: %s" % url)
        number = result.group(1)

        br = self.get_browser()

        # Fetch review text.
        reviewUrl = "http://film-dienst.kim-info.de/kritiken.php?nr=%s"
        con = br.open(reviewUrl % number)
        output = con.read()
        match = self.IMAGE_RE.search(output)
        if match:
            fdNumber = match.group(1)
            imageUrl = "http://film-dienst.kim-info.de/kritikenimages/%s.jpg" % fdNumber

        # NOTE(review): if the page layout changes, REVIEW_RE may not match and
        # this raises AttributeError; the review block is expected to always exist.
        match = self.REVIEW_RE.search(output)
        title = match.group(1)
        # Collapse stray double paragraph closers into one empty paragraph.
        review = re.sub("</p>\n</p>", "<p/>\n", match.group(4))
        author = match.group(5)
        if author is None:
            author = "-"

        # Fetch short review text (not available for every movie).
        shortReviewUrl = "http://film-dienst.kim-info.de/kritiken.php?pos=Kurz&nr=%s"
        con = br.open(shortReviewUrl % number)
        output = con.read()

        match = self.SHORT_REVIEW_RE.search(output)
        if match:
            shortReview = match.group(1)

        # Write the extracted content to a new temporary HTML file for calibre
        # to convert.
        html = PersistentTemporaryFile('_fa.html')
        if fdNumber:
            html.write('<html>\n<head>\n<title>%s - fd %s</title>\n</head>\n' % (title, fdNumber))
            html.write("<body>\n<b>fd %s</b><h1>%s</h1>\n" % (fdNumber, title))
        else:
            html.write('<html>\n<head>\n<title>%s</title>\n</head>\n' % (title))
            html.write("<body>\n<h1>%s</h1>\n" % (title))
        if shortReview:
            html.write("<h2>Kurzkritik</h2>\n")
            html.write("%s\n" % shortReview)
        html.write("<h2>Kritik</h2>\n")
        if imageUrl:
            html.write('<img src="%s"><br>' % imageUrl)
        html.write("%s<br>\n" % review)
        html.write("<i>%s</i><br>\n" % author)
        html.write("</body>\n</html>\n")
        html.close()

        return html.name

    def parse_index(self):
        """
        Find all review links and group them by movie start date.
        Also get magazine cover and issue number.

        :return: list of (section name, article dicts) tuples for calibre
        """
        feedName = None
        feeds = []
        articles = []

        soup = self.index_to_soup("http://film-dienst.kim-info.de/")

        # Find cover image.
        cover = soup.find('img', alt='Cover film-dienst')
        self.cover_url = 'http://film-dienst.kim-info.de/' + cover['src']

        # Find issue number; shown as the "date" of the issue.
        issue = soup.find('span', attrs={'class': 'jahr'})
        self.timefmt = self.tag_to_string(issue)

        # Navigate to the nested table containing the list of reviews.
        start = soup.find("td", "rubrikgross")
        table = start.parent.parent
        for row in table.findAll("tr"):
            if not row.table:
                continue
            # We found the right table. Now handle all of its rows.
            # (Use a distinct loop variable; the original shadowed `row`.)
            for reviewRow in row.table.findAll("tr"):
                # Movie start date is enclosed in a bold tag.
                b = reviewRow.find("b")
                if b:
                    # If it's not the first section, append the previous
                    # section to the list of feeds.
                    if feedName is not None:
                        feeds.append((feedName, articles))
                    articles = []
                    feedName = self.tag_to_string(b)
                    continue
                # Find reviews via the link tag.
                link = reviewRow.find("a")
                if link:
                    url = "http://film-dienst.kim-info.de/" + link['href']
                    articles.append({'title': self.tag_to_string(link), 'url': url, 'date': ''})

            # Append the last section to the list of feeds.
            if feedName:
                feeds.append((feedName, articles))

            return feeds
siebert is offline   Reply With Quote