Old 01-01-2011, 09:39 AM   #1
siebert
Developer
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Fixed brand eins recipe

Hi,

First of all: Happy New Year to all of you!

The current brand eins recipe has a flaw: it only handles issues from the latest year. With the addition of the (current, thus incomplete) January 2011 issue to the archive, it's no longer possible to download issues from 2010, in particular the December issue, which is now the latest complete one.

So I fixed the recipe; it's now possible to download any issue from any year.
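
In case anyone wonders what the fix boils down to: the archive page tags each issue link with onmouseover="switch_magazine(<year>, <month>)", and the recipe now builds a zero-padded YYYYMM key from that, so the newest-first ordering works across year boundaries instead of only within the current year. Here is a minimal standalone sketch of the idea (the sample values are made up, not taken from the live page):

Code:
import re

# onmouseover values as they appear on the archive page (sample data).
links = ["switch_magazine(2011, 1)", "switch_magazine(2010, 12)", "switch_magazine(2010, 11)"]

issue_map = {}
for s in links:
    match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", s)
    # Zero-padded YYYYMM strings sort correctly even across year boundaries.
    issue_map["%04i%02i" % (int(match.group(1)), int(match.group(2)))] = s

keys = sorted(issue_map.keys(), reverse=True)  # newest first: 201101, 201012, 201011
default_issue = 2  # 1 = newest (incomplete), 2 = last complete issue
print(keys[default_issue - 1])  # -> 201012, i.e. the December 2010 issue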

Ciao,
Steffen
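
PS: To try it, save the code below as e.g. brand_eins.recipe and add it through calibre's 'Add a custom news source' dialog; ebook-convert can also run the .recipe file directly if you prefer the command line.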

Code:
#!/usr/bin/env python
# -*- coding: utf-8; mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__   = '0.97'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe

class BrandEins(BasicNewsRecipe):

  title = u'brand eins'
  __author__ = 'Constantin Hofstetter'
  description = u'Wirtschaftsmagazin'
  publisher = 'brandeins.de'
  category = 'politics, business, wirtschaft, Germany'
  use_embedded_content = False
  lang = 'de-DE'
  no_stylesheets = True
  encoding = 'utf-8'
  language = 'de'
  publication_type = 'magazine'
  needs_subscription = 'optional'
  # Prevent the conversion date from being appended to the title
  timefmt = ''

  # 2 is the last complete issue (default)
  # 1 is the newest (but incomplete) issue
  # 3 is the one before 2, etc.
  # This value can be overridden via the username field.
  default_issue = 2
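  # Example: to fetch the issue before the last complete one, enter 3 in the
  # username field (Fetch news / scheduler dialog), or pass it on the command
  # line, e.g.: ebook-convert brand_eins.recipe out.epub --username 3
  # (brand_eins.recipe is just whatever filename you saved this recipe under.)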

  keep_only_tags = [
    dict(name='div', attrs={'id': 'theContent'}),
    dict(name='div', attrs={'id': 'sidebar'}),
    dict(name='div', attrs={'class': 'intro'}),
    dict(name='p', attrs={'class': 'bodytext'}),
    dict(name='div', attrs={'class': 'single_image'}),
  ]

  '''
  brandeins.de
  '''

  def postprocess_html(self, soup, first):

    # Move the image from the sidebar right below the h3
    first_h3 = soup.find(name='div', attrs={'id': 'theContent'}).find('h3')
    for imgdiv in soup.findAll(name='div', attrs={'class': 'single_image'}):
      intros = first_h3.findNextSiblings('div', {'class': 'intro'})
      if intros:
        intros[0].parent.insert(4, imgdiv)
      else:
        first_h3.parent.insert(2, imgdiv)

    # Now, remove the sidebar
    soup.find(name='div', attrs={'id':'sidebar'}).extract()

    # Remove the rating-image (stars) from the h3
    for img in first_h3.findAll(name='img'):
      img.extract()

    # Mark the intro texts as italic
    for div in soup.findAll(name='div', attrs={'class':'intro'}):
      for p in div.findAll('p'):
        content = self.tag_to_string(p)
        new_p = "<p><i>" + content + "</i></p>"
        p.replaceWith(new_p)

    # Change <h3> to <h1>
    header = soup.find("h3")
    if header:
      tag = Tag(soup, "h1")
      tag.insert(0, header.contents[0])
      header.replaceWith(tag)

    return soup

  def get_cover(self, soup):
    cover_url = None
    cover_item = soup.find('div', attrs={'class': 'cover_image'})
    if cover_item:
      cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
    return cover_url

  def parse_index(self):
    feeds = []
    issue_map = {}

    archive = "http://www.brandeins.de/archiv.html"

    issue = self.default_issue
    if self.username:
      try:
        issue = int(self.username)
      except ValueError:
        # The username is a real login name, not an issue number; keep the default.
        pass

    soup = self.index_to_soup(archive)
    issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
    issue_list = [i for i in issue_list if i.get('onmouseover', False)]
    for i in issue_list:
      # onmouseover looks like "switch_magazine(<year>, <month>)"; build a
      # zero-padded YYYYMM key from it so issues sort correctly across years.
      match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", i['onmouseover'])
      if match:
        issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
        issue_map[issue_number] = i
    keys = sorted(issue_map.keys(), reverse=True)
    selected_issue = issue_map[keys[issue - 1]]
    url = selected_issue.get('href', False)
    # Build the magazine title from the cover image's title attribute (issue and year).
    self.title = "brand eins " + re.search(r"(?P<date>\d\d/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
    url = 'http://brandeins.de/' + url

    titles_and_articles = self.brand_eins_parse_issue(url)
    if titles_and_articles:
      for title, articles in titles_and_articles:
        feeds.append((title, articles))
    return feeds

  def brand_eins_parse_issue(self, url):
    soup = self.index_to_soup(url)
    self.cover_url = self.get_cover(soup)
    article_lists = [soup.find('div', attrs={'class': 'subColumnLeft articleList'}),
                     soup.find('div', attrs={'class': 'subColumnRight articleList'})]

    titles_and_articles = []
    current_articles = []
    chapter_title = "Editorial"
    self.log('Found Chapter:', chapter_title)

    # Remove the last list of links (that's just the imprint and the 'Gewinnspiel' prize draw)
    article_lists[1].findAll('ul')[-1].extract()

    for article_list in article_lists:
      for chapter in article_list.findAll('ul'):
        if len(chapter.findPreviousSiblings('h3')) >= 1:
          new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
          if new_chapter_title != chapter_title:
            titles_and_articles.append([chapter_title, current_articles])
            current_articles = []
            self.log('Found Chapter:', new_chapter_title)
          chapter_title = new_chapter_title
        for li in chapter.findAll('li'):
          a = li.find('a', href=True)
          if a is None:
            continue
          title = self.tag_to_string(a)
          url = a.get('href', False)
          if not url or not title:
            continue
          url = 'http://brandeins.de/'+url
          next_paragraphs = a.parent.findNextSiblings('p')
          if next_paragraphs:
            description = self.tag_to_string(next_paragraphs[0])
          else:
            description = ''

          self.log('\t\tFound article:', title)
          self.log('\t\t\t', url)
          self.log('\t\t\t', description)

          current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
    titles_and_articles.append([chapter_title, current_articles])
    return titles_and_articles