Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 01-01-2011, 09:39 AM   #1
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Fixed brand eins recipe

Hi,

first of all: Happy new year to all of you!

The current brand eins recipe has a flaw as it handles only issues of the latest year, so with the addition of the (current, thus incomplete) january 2011 issue to the archive it's no longer possible to download issues from 2010, especially the december issue which is now the latest complete issue.

So I fixed the recipe and it's now possible to download any issue from all years.

Ciao,
Steffen

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__   = '0.97'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.templates import Template, CLASS
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, BR, A, HR, UL

class BrandEins(BasicNewsRecipe):

  title = u'brand eins'
  __author__ = 'Constantin Hofstetter'
  description = u'Wirtschaftsmagazin'
  publisher ='brandeins.de'
  category = 'politics, business, wirtschaft, Germany'
  use_embedded_content = False
  lang = 'de-DE'
  no_stylesheets = True
  encoding = 'utf-8'
  language = 'de'
  publication_type = 'magazine'
  needs_subscription = 'optional'
  # Prevent that conversion date is appended to title
  timefmt = ''

  # 2 is the last full magazine (default)
  # 1 is the newest (but not full)
  # 3 is one before 2 etc.
  # This value can be set via the username field.
  default_issue = 2

  keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

  '''
  brandeins.de
  '''

  def postprocess_html(self, soup,first):

    # Move the image of the sidebar right below the h3
    first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
    for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
      if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
        # first_h3.parent.insert(2, imgdiv)
        first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
      else:
        first_h3.parent.insert(2, imgdiv)

    # Now, remove the sidebar
    soup.find(name='div', attrs={'id':'sidebar'}).extract()

    # Remove the rating-image (stars) from the h3
    for img in first_h3.findAll(name='img'):
        img.extract()

    # Mark the intro texts as italic
    for div in soup.findAll(name='div', attrs={'class':'intro'}):
      for p in div.findAll('p'):
        content = self.tag_to_string(p)
        new_p = "<p><i>"+ content +"</i></p>"
        p.replaceWith(new_p)

    # Change <h3> to <h1>
    header = soup.find("h3")
    if header:
        tag = Tag(soup, "h1")
        tag.insert(0, header.contents[0])
        header.replaceWith(tag)

    return soup

  def get_cover(self, soup):
    cover_url = None
    cover_item = soup.find('div', attrs = {'class': 'cover_image'})
    if cover_item:
      cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
    return cover_url

  def parse_index(self):
    feeds = []
    issue_map = {}

    archive = "http://www.brandeins.de/archiv.html"

    issue = self.default_issue
    if self.username:
      try:
        issue = int(self.username)
      except:
        pass

    soup = self.index_to_soup(archive)
    issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
    issue_list = [i for i in issue_list if i.get('onmouseover', False)]
    for i in issue_list:
        issue_number_string = i.get('onmouseover', False)
        if issue_number_string:
            match = re.match("^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
            issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
            issue_map[issue_number] = i
    keys = issue_map.keys()
    keys.sort()
    keys.reverse()
    selected_issue = issue_map[keys[issue-1]]
    url = selected_issue.get('href', False)
    # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
    self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
    url = 'http://brandeins.de/'+url

    # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
    titles_and_articles = self.brand_eins_parse_issue(url)
    if titles_and_articles:
      for title, articles in titles_and_articles:
        feeds.append((title, articles))
    return feeds

  def brand_eins_parse_issue(self, url):
    soup = self.index_to_soup(url)
    self.cover_url = self.get_cover(soup)
    article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

    titles_and_articles = []
    current_articles = []
    chapter_title = "Editorial"
    self.log('Found Chapter:', chapter_title)

    # Remove last list of links (thats just the impressum and the 'gewinnspiel')
    article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract()

    for article_list in article_lists:
      for chapter in article_list.findAll('ul'):
        if len(chapter.findPreviousSiblings('h3')) >= 1:
          new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
          if new_chapter_title != chapter_title:
            titles_and_articles.append([chapter_title, current_articles])
            current_articles = []
            self.log('Found Chapter:', new_chapter_title)
          chapter_title = new_chapter_title
        for li in chapter.findAll('li'):
          a = li.find('a', href = True)
          if a is None:
            continue
          title = self.tag_to_string(a)
          url = a.get('href', False)
          if not url or not title:
            continue
          url = 'http://brandeins.de/'+url
          if len(a.parent.findNextSiblings('p')) >= 1:
            description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
          else:
            description = ''

          self.log('\t\tFound article:', title)
          self.log('\t\t\t', url)
          self.log('\t\t\t', description)

          current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
    titles_and_articles.append([chapter_title, current_articles])
    return titles_and_articles
siebert is offline   Reply With Quote
Old 04-07-2011, 02:38 PM   #2
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Hi Steffen,

thanks for putting this recipe together. Unfortunately, I am facing problems using it. Whenever I try to pull the articles from the website using Calibre, I get the following error log. Can you advise?

File "site-packages\calibre\web\feeds\news.py", line 872, in build_index
File "c:\users\f\appdata\local\temp\calibre_0.7.53_tmp_ uy9v4j\calibre_0.7.53_qykifp_recipes\recipe0.py", line 103, in parse_index
issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
IndexError: list index out of range

Thanks and Best Regards!
fritzifratz is offline   Reply With Quote
Advert
Old 04-07-2011, 03:15 PM   #3
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Hm, I just tried the recipe and it works fine for me.

Did you select a special issue via the username field or did you try to download the default isse (which is currently march 2011)?

Could you successfully use the recipe in the past or is this your first try?

Ciao,
Steffen
siebert is offline   Reply With Quote
Old 04-07-2011, 03:39 PM   #4
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Hi Steffen,

thanks for the quick reply. It's the first time I am using this recipe. Just got my eReader and use Calibre v0.7.53. Did not select a specific issue. Problem keeps arising.

Thanks.

Detailed error log:
Code:
calibre, version 0.7.53
ERROR: Conversion Error: <b>Failed</b>: Fetch news from brand eins

Fetch news from brand eins
Resolved conversion options
calibre version: 0.7.53
{'asciiize': False,
 'author_sort': None,
 'authors': None,
 'base_font_size': 0,
 'book_producer': None,
 'change_justification': 'original',
 'chapter': None,
 'chapter_mark': 'pagebreak',
 'comments': None,
 'cover': None,
 'debug_pipeline': None,
 'dehyphenate': True,
 'delete_blank_paragraphs': True,
 'disable_font_rescaling': False,
 'dont_download_recipe': False,
 'dont_split_on_page_breaks': True,
 'enable_heuristics': False,
 'epub_flatten': False,
 'extra_css': None,
 'extract_to': None,
 'fix_indents': True,
 'flow_size': 260,
 'font_size_mapping': None,
 'format_scene_breaks': True,
 'html_unwrap_factor': 0.4,
 'input_encoding': None,
 'input_profile': <calibre.customize.profiles.InputProfile object at 0x04F013D0>,
 'insert_blank_line': False,
 'insert_metadata': False,
 'isbn': None,
 'italicize_common_cases': True,
 'keep_ligatures': False,
 'language': None,
 'level1_toc': None,
 'level2_toc': None,
 'level3_toc': None,
 'line_height': 0,
 'linearize_tables': False,
 'lrf': False,
 'margin_bottom': 5.0,
 'margin_left': 5.0,
 'margin_right': 5.0,
 'margin_top': 5.0,
 'markup_chapter_headings': True,
 'max_toc_links': 50,
 'minimum_line_height': 120.0,
 'no_chapters_in_toc': False,
 'no_default_epub_cover': False,
 'no_inline_navbars': False,
 'no_svg_cover': False,
 'output_profile': <calibre.customize.profiles.SonyReaderOutput object at 0x04F01810>,
 'page_breaks_before': None,
 'password': None,
 'prefer_metadata_cover': False,
 'preserve_cover_aspect_ratio': False,
 'pretty_print': True,
 'pubdate': None,
 'publisher': None,
 'rating': None,
 'read_metadata_from_opf': None,
 'remove_fake_margins': True,
 'remove_first_image': False,
 'remove_paragraph_spacing': False,
 'remove_paragraph_spacing_indent_size': 1.5,
 'renumber_headings': True,
 'replace_scene_breaks': '',
 'series': None,
 'series_index': None,
 'smarten_punctuation': False,
 'sr1_replace': '',
 'sr1_search': '',
 'sr2_replace': '',
 'sr2_search': '',
 'sr3_replace': '',
 'sr3_search': '',
 'tags': None,
 'test': False,
 'timestamp': None,
 'title': None,
 'title_sort': None,
 'toc_filter': None,
 'toc_threshold': 6,
 'unwrap_lines': True,
 'use_auto_toc': False,
 'username': None,
 'verbose': 2}
InputFormatPlugin: Recipe Input running
Python function terminated unexpectedly
  list index out of range (Error Code: 1)
Traceback (most recent call last):
  File "site.py", line 103, in main
  File "site.py", line 85, in run_entry_point
  File "site-packages\calibre\utils\ipc\worker.py", line 119, in main
  File "site-packages\calibre\gui2\convert\gui_conversion.py", line 25, in gui_convert
  File "site-packages\calibre\ebooks\conversion\plumber.py", line 915, in run
  File "site-packages\calibre\customize\conversion.py", line 204, in __call__
  File "site-packages\calibre\web\feeds\input.py", line 105, in convert
  File "site-packages\calibre\web\feeds\news.py", line 735, in download
  File "site-packages\calibre\web\feeds\news.py", line 872, in build_index
  File "c:\users\...\appdata\local\temp\calibre_0.7.53_tmp_uy9v4j\calibre_0.7.53_ft2pjg_recipes\recipe0.py", line 103, in parse_index
    issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
IndexError: list index out of range

Last edited by fritzifratz; 04-07-2011 at 03:43 PM.
fritzifratz is offline   Reply With Quote
Old 04-07-2011, 03:42 PM   #5
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Can you try the url http://www.brandeins.de/archiv.html in a browser? Do you see the brand eins issues on that page?

Can you search the source of the page for the string "tx-brandeinsmagazine-pi1"?

Ciao,
Steffen
siebert is offline   Reply With Quote
Advert
Old 04-07-2011, 03:44 PM   #6
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Yes, the website works and the string exists in the source code. Tried to understand and play with the source code as well, but I am new to python :-(
fritzifratz is offline   Reply With Quote
Old 04-07-2011, 04:05 PM   #7
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
I'm still using 0.7.48 and can't update to 0.7.53 as I don't use python 2.7. So I've installed 0.7.53 from scratch in a virtual machine and the calibre recipe works fine. I have no idea what's going wrong with your installation.

Ciao,
Steffen
siebert is offline   Reply With Quote
Old 04-07-2011, 04:12 PM   #8
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Hmm... weird... At least good to know that the script is working in general. Must be something wrong with my installation then. Will try to find out and let you know if I find a solution. Thanks for your help, Steffen! Have a good evening.
fritzifratz is offline   Reply With Quote
Old 04-07-2011, 04:49 PM   #9
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Quote:
Originally Posted by fritzifratz View Post
Must be something wrong with my installation then.
On what OS do you run Calibre? I'm using Windows 7 and Windows XP, maybe that's the difference?

Ciao,
Steffen
siebert is offline   Reply With Quote
Old 04-07-2011, 05:12 PM   #10
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Using Win7 32bit. Just realized that no news download works. When using the Spiegel Online recipe, I get a proper article list, but all the content of all articles consists of binary data / special chars. Same for Financial Times.

The weird thing is, that it works when using a Google Reader account and that special recipe built-in. Already searched the forum for a solution but all I found was some proxy discussion. Checked my proxy settings (also using the debug mode of Calibre) but everything seems to be correct (no proxy set).
fritzifratz is offline   Reply With Quote
Old 04-07-2011, 05:47 PM   #11
fritzifratz
Junior Member
fritzifratz began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: PRS-650B
Yes!!!! I found the problem. I already tried playing with the firewall settings before because I was assuming, it might be firewall-related. However, disabling didn't solve the problem. Finally, I found a setting in the options of my Desktop Firewall (GData) that seemed to be active even though the firewall is switched off. It was the Web Filter which scans web traffic (content) before it reaches the browser / clients. Once I disabled that filter, things ran smooth. Thanks a lot for your support, Steffen!

P.S. The web filter seems to be implemented using a transparent proxy. Neither IE, nor the debug mode of Calibre shows that proxy.
fritzifratz is offline   Reply With Quote
Old 04-29-2011, 03:41 AM   #12
ericepe
Junior Member
ericepe began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Apr 2011
Device: kindle
brand eins -different error

I'm running calibre under ubuntu 10.10 and getting the following error. any idea?

calibre, version 0.7.56
ERROR: Konvertierungsfehler: <b>Misslungen</b>: Nachrichten abrufen von brand eins

Nachrichten abrufen von brand eins
Resolved conversion options
calibre version: 0.7.56
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': None,
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_compress': False,
'dont_download_recipe': False,
'enable_heuristics': False,
'extra_css': None,
'fix_indents': True,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x3dca6d0>,
'insert_blank_line': False,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'mobi_ignore_margins': False,
'no_chapters_in_toc': False,
'no_inline_navbars': True,
'no_inline_toc': False,
'output_profile': <calibre.customize.profiles.KindleOutput object at 0x3dcad10>,
'page_breaks_before': None,
'password': '',
'personal_doc': '[PDOC]',
'prefer_author_sort': False,
'prefer_metadata_cover': False,
'pretty_print': False,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_fake_margins': True,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'rescale_images': False,
'series': None,
'series_index': None,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'tags': None,
'test': False,
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'toc_title': None,
'unwrap_lines': True,
'use_auto_toc': False,
'username': '2',
'verbose': 2}
InputFormatPlugin: Recipe Input running
Traceback (most recent call last):
File "site.py", line 58, in main
File "site-packages/calibre/utils/ipc/worker.py", line 119, in main
File "site-packages/calibre/gui2/convert/gui_conversion.py", line 25, in gui_convert
File "site-packages/calibre/ebooks/conversion/plumber.py", line 915, in run
File "site-packages/calibre/customize/conversion.py", line 204, in __call__
File "site-packages/calibre/web/feeds/input.py", line 105, in convert
File "site-packages/calibre/web/feeds/news.py", line 735, in download
File "site-packages/calibre/web/feeds/news.py", line 872, in build_index
File "/tmp/calibre_0.7.56_tmp_WfUDXh/calibre_0.7.56_0wjqRK_recipes/recipe0.py", line 117, in parse_index
self.title = "brand eins "+ re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
File "re.py", line 142, in search
TypeError: expected string or buffer
ericepe is offline   Reply With Quote
Old 05-01-2011, 06:07 AM   #13
Holgerman
Junior Member
Holgerman began at the beginning.
 
Posts: 8
Karma: 10
Join Date: May 2011
Device: Kindle 3
Same problem here. Any idea?
Holgerman is offline   Reply With Quote
Old 05-01-2011, 07:20 AM   #14
siebert
Developer
siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.siebert has a complete set of Star Wars action figures.
 
Posts: 155
Karma: 280
Join Date: Nov 2010
Device: Kindle 3 (Keyboard) 3G / iPad 9 WiFi / Google Pixel 6a (Android)
Hi,

though I don't need this recipe any longer (thanks to my new iPad 2 I can now read the pdf version of brand eins here is the fixed recipe:

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 mode: python -*-

__license__   = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__   = '0.98'

''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.templates import Template, CLASS
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, BR, A, HR, UL

class BrandEins(BasicNewsRecipe):

  title = u'brand eins'
  __author__ = 'Constantin Hofstetter'
  description = u'Wirtschaftsmagazin'
  publisher ='brandeins.de'
  category = 'politics, business, wirtschaft, Germany'
  use_embedded_content = False
  lang = 'de-DE'
  no_stylesheets = True
  encoding = 'utf-8'
  language = 'de'
  publication_type = 'magazine'
  needs_subscription = 'optional'
  # Prevent that conversion date is appended to title
  timefmt = ''

  # 2 is the last full magazine (default)
  # 1 is the newest (but not full)
  # 3 is one before 2 etc.
  # This value can be set via the username field.
  default_issue = 2

  keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

  '''
  brandeins.de
  '''

  def postprocess_html(self, soup,first):

    # Move the image of the sidebar right below the h3
    first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
    for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
      if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
        # first_h3.parent.insert(2, imgdiv)
        first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
      else:
        first_h3.parent.insert(2, imgdiv)

    # Now, remove the sidebar
    soup.find(name='div', attrs={'id':'sidebar'}).extract()

    # Remove the rating-image (stars) from the h3
    for img in first_h3.findAll(name='img'):
        img.extract()

    # Mark the intro texts as italic
    for div in soup.findAll(name='div', attrs={'class':'intro'}):
      for p in div.findAll('p'):
        content = self.tag_to_string(p)
        new_p = "<p><i>"+ content +"</i></p>"
        p.replaceWith(new_p)

    # Change <h3> to <h1>
    header = soup.find("h3")
    if header:
        tag = Tag(soup, "h1")
        tag.insert(0, header.contents[0])
        header.replaceWith(tag)

    return soup

  def get_cover(self, soup):
    cover_url = None
    cover_item = soup.find('div', attrs = {'class': 'cover_image'})
    if cover_item:
      cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
    return cover_url

  def parse_index(self):
    feeds = []
    issue_map = {}

    archive = "http://www.brandeins.de/archiv.html"

    issue = self.default_issue
    if self.username:
      try:
        issue = int(self.username)
      except:
        pass

    soup = self.index_to_soup(archive)
    issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
    issue_list = [i for i in issue_list if i.get('onmouseover', False)]
    for i in issue_list:
        issue_number_string = i.get('onmouseover', False)
        if issue_number_string:
            match = re.match("^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
            issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
            issue_map[issue_number] = i
    keys = issue_map.keys()
    keys.sort()
    keys.reverse()
    selected_issue_key = keys[issue - 1]
    selected_issue = issue_map[selected_issue_key]
    url = selected_issue.get('href', False)
    # Get the title for the magazin - build it out of the title of the cover - take the issue and year;
    self.title = "brand eins " + selected_issue_key[4:] + "/" + selected_issue_key[0:4]
    url = 'http://brandeins.de/'+url

    # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"
    titles_and_articles = self.brand_eins_parse_issue(url)
    if titles_and_articles:
      for title, articles in titles_and_articles:
        feeds.append((title, articles))
    return feeds

  def brand_eins_parse_issue(self, url):
    soup = self.index_to_soup(url)
    self.cover_url = self.get_cover(soup)
    article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

    titles_and_articles = []
    current_articles = []
    chapter_title = "Editorial"
    self.log('Found Chapter:', chapter_title)

    # Remove last list of links (thats just the impressum and the 'gewinnspiel')
    article_lists[1].findAll('ul')[len(article_lists[1].findAll('ul'))-1].extract()

    for article_list in article_lists:
      for chapter in article_list.findAll('ul'):
        if len(chapter.findPreviousSiblings('h3')) >= 1:
          new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
          if new_chapter_title != chapter_title:
            titles_and_articles.append([chapter_title, current_articles])
            current_articles = []
            self.log('Found Chapter:', new_chapter_title)
          chapter_title = new_chapter_title
        for li in chapter.findAll('li'):
          a = li.find('a', href = True)
          if a is None:
            continue
          title = self.tag_to_string(a)
          url = a.get('href', False)
          if not url or not title:
            continue
          url = 'http://brandeins.de/'+url
          if len(a.parent.findNextSiblings('p')) >= 1:
            description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
          else:
            description = ''

          self.log('\t\tFound article:', title)
          self.log('\t\t\t', url)
          self.log('\t\t\t', description)

          current_articles.append({'title': title, 'url': url, 'description': description, 'date':''})
    titles_and_articles.append([chapter_title, current_articles])
    return titles_and_articles
Ciao,
Steffen
siebert is offline   Reply With Quote
Old 05-01-2011, 10:02 AM   #15
Holgerman
Junior Member
Holgerman began at the beginning.
 
Posts: 8
Karma: 10
Join Date: May 2011
Device: Kindle 3
Wow! Thank you very much! This works!
Holgerman is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Recipe works when mocked up as Python file, fails when converted to Recipe ode Recipes 7 09-04-2011 04:57 AM
Enhanced brand eins recipe siebert Recipes 12 04-11-2011 04:58 AM
AJC Recipe not working correctly (will be fixed soon) TonytheBookworm Recipes 0 12-24-2010 07:41 PM
Brand new 700... with brand new problems! :S ozk Sony Reader 6 02-28-2009 08:40 PM


All times are GMT -4. The time now is 09:28 PM.


MobileRead.com is a privately owned, operated and funded community.