MobileRead Forums - View Single Post

siebert · 11-21-2010, 07:37 AM

Hi,

I took the liberty of enhancing the existing brand eins recipe.

Here is my changelog:
NEW: The issue to download can be selected via the username field.
NEW: Add cover image.
NEW: Prevent the conversion date from being appended to the title.
NEW: Remove the "This article was downloaded by calibre from..." section from the bottom of each page.
FIXED: "brand eins" is written in lowercase.

And here is the recipe:

Code:

#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__   = '0.96'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.templates import Template, CLASS
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, BR, A, HR, UL

class MyNavBarTemplate(Template):
  """
  Navigation bar template without the calibre download notice.

  Same as calibre.web.feeds.templates.NavBarTemplate but without the
  'This article was downloaded by calibre from...'
  text at the bottom.
  """

  def _generate(self, bottom, feed, art, number_of_articles_in_feed,
                two_levels, url, __appname__, prefix='', center=True,
                extra_css=None, style=None):
    """
    Build the navbar document and store it in ``self.root``.

    bottom: true for the bar below the article, false for the bar above it.
      The top bar gets 'Next'/'Previous' links and a trailing rule; the
      bottom bar only gets a leading rule (and only for non file:// urls).
    feed, art: zero-based indexes used to build the 'feed_%d' and
      'article_%d' link targets.
    number_of_articles_in_feed: used to detect the last article of a feed,
      whose 'Next' link points at the following feed instead.
    url: url of the article; only checked for the 'file://' scheme.
    prefix: optional path prefix for all links ('/' is appended if missing).
    center: centre the bar if true, left-align it otherwise.
    extra_css, style: optional CSS texts added to the head as <style> tags.
    two_levels, __appname__: accepted for signature compatibility, unused.
    """
    head = HEAD(TITLE('navbar'))
    if style:
      head.append(STYLE(style, type='text/css'))
    if extra_css:
      head.append(STYLE(extra_css, type='text/css'))

    if prefix and not prefix.endswith('/'):
      prefix += '/'
    align = 'center' if center else 'left'

    navbar = DIV(CLASS('calibre_navbar', 'calibre_rescale_70',
                       style='text-align:'+align))

    def add_separator():
      """ Put a ' | ' after whatever the navbar currently ends with. """
      if len(navbar) > 0:
        navbar[-1].tail = ' | '
      else:
        # The bottom bar of a file:// article has no child element yet.
        # Asking for the last child used to raise StopIteration here, so
        # open the bar with the same leading separator the top bar uses.
        navbar.text = '| '

    if bottom:
      if not url.startswith('file://'):
        navbar.append(HR())
    else:
      is_last_article = art == number_of_articles_in_feed - 1
      # The last article of a feed links to the next feed, one level up.
      next_page = 'feed_%d'%(feed+1) if is_last_article \
          else 'article_%d'%(art+1)
      up = '../..' if is_last_article else '..'
      href = '%s%s/%s/index.html'%(prefix, up, next_page)
      navbar.text = '| '
      navbar.append(A('Next', href=href))

    add_separator()
    href = '%s../index.html#article_%d'%(prefix, art)
    navbar.append(A('Section Menu', href=href))

    add_separator()
    href = '%s../../index.html#feed_%d'%(prefix, feed)
    navbar.append(A('Main Menu', href=href))

    if art > 0 and not bottom:
      add_separator()
      href = '%s../article_%d/index.html'%(prefix, art-1)
      navbar.append(A('Previous', href=href))

    add_separator()
    if not bottom:
      navbar.append(HR())

    self.root = HTML(head, BODY(navbar))

class BrandEins(BasicNewsRecipe):
  """
  Recipe for one issue of the magazine 'brand eins' (http://brandeins.de).

  The issue is picked from the newest year listed on the archive page. Which
  issue is taken is controlled by ``default_issue`` and can be overridden by
  entering a number in the username field.
  """

  title = u'brand eins'
  __author__ = 'Constantin Hofstetter'
  description = u'Wirtschaftsmagazin'
  publisher ='brandeins.de'
  category = 'politics, business, wirtschaft, Germany'
  use_embedded_content = False
  lang = 'de-DE'
  no_stylesheets = True
  encoding = 'utf-8'
  language = 'de'
  publication_type = 'magazine'
  needs_subscription = True
  # Prevent that conversion date is appended to title
  timefmt = ''

  # 2 is the last full magazine (default)
  # 1 is the newest (but not full)
  # 3 is one before 2 etc.
  # This value can be set via the username field.
  default_issue = 2

  keep_only_tags = [
    dict(name='div', attrs={'id':'theContent'}),
    dict(name='div', attrs={'id':'sidebar'}),
    dict(name='div', attrs={'class':'intro'}),
    dict(name='p', attrs={'class':'bodytext'}),
    dict(name='div', attrs={'class':'single_image'}),
  ]

  # brandeins.de

  def __init__(self, options, log, progress_reporter):
    """ Constructor: installs the navbar template without the calibre notice. """
    BasicNewsRecipe.__init__(self, options, log, progress_reporter)
    self.navbar = MyNavBarTemplate()

  def postprocess_html(self, soup, first):
    """
    Rearrange an article page: move the sidebar images into the content,
    drop the sidebar, strip the rating stars and set intro texts in italics.

    Pages that lack the content div, its h3 or the sidebar are left alone
    for the corresponding step instead of raising AttributeError.
    ``first`` is not used. Returns the modified soup.
    """
    content = soup.find(name='div', attrs={'id':'theContent'})
    first_h3 = content.find('h3') if content is not None else None

    # Move the image of the sidebar right below the h3
    if first_h3 is not None:
      for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
        intros = first_h3.findNextSiblings('div', {'class':'intro'})
        if intros:
          intros[0].parent.insert(4, imgdiv)
        else:
          first_h3.parent.insert(2, imgdiv)

    # Now, remove the sidebar
    sidebar = soup.find(name='div', attrs={'id':'sidebar'})
    if sidebar is not None:
      sidebar.extract()

    # Remove the rating-image (stars) from the h3
    if first_h3 is not None:
      for img in first_h3.findAll(name='img'):
        img.extract()

    # Mark the intro texts as italic
    for div in soup.findAll(name='div', attrs={'class':'intro'}):
      for p in div.findAll('p'):
        content_text = self.tag_to_string(p)
        # NOTE(review): replaceWith() receives a markup string, not a tag;
        # this presumably relies on the soup emitting strings unescaped -
        # confirm against the soup implementation in use.
        new_p = "<p><i>"+ content_text +"</i></p>"
        p.replaceWith(new_p)

    return soup

  def get_cover(self, soup):
    """
    Return the absolute url of the cover image found in ``soup``, or None
    if there is no 'cover_image' div, no img inside it or no src on the img.
    """
    cover_item = soup.find('div', attrs = {'class': 'cover_image'})
    if cover_item is None or cover_item.img is None:
      return None
    src = cover_item.img.get('src')
    if not src:
      return None
    return 'http://www.brandeins.de/' + src

  def parse_index(self):
    """
    Select the issue on the archive page and return its chapters as a list
    of (chapter title, list of article dicts) tuples.

    Also sets ``self.title`` to 'brand eins MM/YYYY' when the date can be
    read from the title attribute of the issue's cover image.

    Raises IndexError if the archive page has no usable issue list and
    ValueError if the selected issue carries no link.
    """
    archive = "http://www.brandeins.de/archiv.html"

    issue = self.default_issue
    if self.username:
      try:
        issue = int(self.username)
      except (TypeError, ValueError):
        # The username field doubles as issue selector; anything that is
        # not a number keeps the default instead of being hidden silently.
        self.log('Username is not an issue number, using default issue:', issue)

    soup = self.index_to_soup(archive)
    latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b') })[0].findAll('ul')[0]
    issue_links = latest_jahrgang.findAll('a')

    # Issues are counted from the end of the list (1 = newest). A value
    # outside 1..len would crash or silently wrap around to an unrelated
    # issue, so fall back to the default, clamped to what is available.
    if not 1 <= issue <= len(issue_links):
      fallback = max(1, min(self.default_issue, len(issue_links)))
      self.log('Issue number out of range:', issue, '- using issue:', fallback)
      issue = fallback
    selected_issue = issue_links[-issue]

    # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
    cover_img = selected_issue.find('img')
    cover_title = cover_img.get('title', '') if cover_img is not None else ''
    date_match = re.search(r"(?P<date>\d\d\/\d\d\d\d)", cover_title or '')
    if date_match:
      self.title = "brand eins "+ date_match.group('date')
    else:
      self.log('No issue date found on the cover, keeping title:', self.title)

    href = selected_issue.get('href', False)
    if not href:
      raise ValueError('The selected brand eins issue has no link')
    url = 'http://brandeins.de/'+href

    # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
    titles_and_articles = self.brand_eins_parse_latest_issue(url)
    return [(title, articles) for title, articles in (titles_and_articles or [])]

  def brand_eins_parse_latest_issue(self, url):
    """
    Parse the table of contents of the issue at ``url``.

    Sets ``self.cover_url`` from the issue page and returns a list of
    [chapter title, list of article dicts] pairs. Each article dict has the
    keys 'title', 'url', 'description' and 'date' (always empty). Articles
    found before the first h3 heading are filed under 'Editorial'.
    """
    soup = self.index_to_soup(url)
    self.cover_url = self.get_cover(soup)
    left_column = soup.find('div', attrs={'class':'subColumnLeft articleList'})
    right_column = soup.find('div', attrs={'class':'subColumnRight articleList'})

    titles_and_articles = []
    current_articles = []
    chapter_title = "Editorial"
    self.log('Found Chapter:', chapter_title)

    # Remove last list of links (thats just the impressum and the 'gewinnspiel')
    if right_column is not None:
      right_lists = right_column.findAll('ul')
      if right_lists:
        right_lists[-1].extract()

    # A missing column is skipped instead of raising AttributeError.
    article_lists = [column for column in (left_column, right_column) if column is not None]

    for article_list in article_lists:
      for chapter in article_list.findAll('ul'):
        headings = chapter.findPreviousSiblings('h3')
        if headings:
          new_chapter_title = string.capwords(self.tag_to_string(headings[0]))
          if new_chapter_title != chapter_title:
            # A new heading closes the chapter collected so far.
            titles_and_articles.append([chapter_title, current_articles])
            current_articles = []
            self.log('Found Chapter:', new_chapter_title)
          chapter_title = new_chapter_title
        for li in chapter.findAll('li'):
          a = li.find('a', href = True)
          if a is None:
            continue
          title = self.tag_to_string(a)
          article_href = a.get('href', False)
          if not article_href or not title:
            continue
          article_url = 'http://brandeins.de/'+article_href
          following_paragraphs = a.parent.findNextSiblings('p')
          if following_paragraphs:
            description = self.tag_to_string(following_paragraphs[0])
          else:
            description = ''

          self.log('\t\tFound article:', title)
          self.log('\t\t\t', article_url)
          self.log('\t\t\t', description)

          current_articles.append({'title': title, 'url': article_url, 'description': description, 'date':''})
    titles_and_articles.append([chapter_title, current_articles])
    return titles_and_articles

Ciao,
Steffen