Old 04-25-2011, 12:42 PM   #5
aerodynamik
Enthusiast
 
Posts: 43
Karma: 136
Join Date: Mar 2011
Device: Kindle Paperwhite
Süddeutsche Zeitung Magazin

Okay, I think this should work now. There is still room for improvement (images, line breaks, additional blogs).

The issue regarding HTML comments is still unclear to me.
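
For what it's worth, BeautifulSoup can match comment nodes by type rather than by their text, which catches every HTML comment at once. A minimal sketch, assuming Comment is importable from calibre.ebooks.BeautifulSoup (plain BeautifulSoup 3 exposes the same class):

Code:
from calibre.ebooks.BeautifulSoup import Comment

def remove_html_comments(soup):
    # matching by node type catches every comment, not only
    # those whose text matches a pattern like 'google_ad'
    for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
        comment.extract()
    return soup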

In addition, I understand that remove_tags is applied after preprocess_html. Is there a smart way to re-implement remove_tags? And is there a way to process the subsequent pages the same way as any other downloaded page?
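
One workaround I can think of is to replay the remove_tags rules by hand against each page fetched with index_to_soup, since every entry is already a dict of findAll keyword arguments. A sketch (apply_remove_tags is a hypothetical helper name, not part of BasicNewsRecipe):

Code:
    def apply_remove_tags(self, soup):
        # each entry in self.remove_tags is a dict of findAll()
        # keyword arguments (name, attrs), so it can be replayed
        # against any manually fetched soup
        for spec in self.remove_tags:
            for tag in soup.findAll(**spec):
                tag.extract()
        return soup

Calling self.apply_remove_tags(next_soup) inside preprocess_html would then run the subsequent pages through the same cleanup as the first one.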

But for now, have fun with this; let me know if it works for you as well.

Spoiler:
Code:
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2011, Nikolas Mangold <nmangold at gmail.com>'
'''
sz-magazin.de
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
import re

class SueddeutscheZeitungMagazin(BasicNewsRecipe):
    title                  = u'Süddeutsche Zeitung Magazin'
    __author__             = 'Nikolas Mangold'
    description            = u'Süddeutsche Zeitung Magazin'
    publisher              = u'Magazin Verlagsgesellschaft Süddeutsche Zeitung mbH'
    category               = 'Germany'
    no_stylesheets         = True
    encoding               = 'cp1252'
    remove_empty_feeds     = True
    delay                  = 1
    PREFIX                 = 'http://sz-magazin.sueddeutsche.de'
    INDEX                  = PREFIX + '/hefte'
    use_embedded_content   = False
    masthead_url = 'http://sz-magazin.sueddeutsche.de/img/general/logo.gif'
    language               = 'de'
    publication_type       = 'magazine'
    extra_css              = ' body{font-family: Arial,Helvetica,sans-serif} '
    timefmt = '%W %Y'

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }

    remove_tags_before =  dict(attrs={'class':'vorspann'})
    remove_tags_after  =  dict(attrs={'id':'commentsContainer'})
    remove_tags = [
         dict(name='ul',attrs={'class':'textoptions'}),
         dict(name='div',attrs={'class':'BannerBug'}),
         dict(name='div',attrs={'id':'commentsContainer'}),
         dict(name='div',attrs={'class':'plugin-linkbox'}), #not working
         dict(name='div',attrs={'id':'galleryInfo0'}),
         dict(name='div',attrs={'class':'control'})
    ]
        
    def parse_index(self):
        feeds = []

        # determine current issue
        index = self.index_to_soup(self.INDEX)
        year_index = index.find('ul', attrs={'class':'hefte-jahre'})
        week_index = index.find('ul', attrs={'class':'heftindex'})
        year = self.tag_to_string(year_index.find('li')).strip()
        tmp = week_index.find('li').a
        week = self.tag_to_string(tmp)
        aktuelles_heft = self.PREFIX + tmp['href']

        # set cover
        self.cover_url = '{0}/img/hefte/thumbs_l/{1}{2}.jpg'.format(self.PREFIX,year,week)

        # find articles and add to main feed
        soup = self.index_to_soup(aktuelles_heft)
        content = soup.find('div',{'id':'maincontent'})
        mainfeed = 'SZ Magazin {0}/{1}'.format(week, year)
        articles = []
        for article in content.findAll('li'):
            txt = article.find('div',{'class':'text-holder'})
            if txt is None:
                continue
            link = txt.find('a')
            desc = txt.find('p')
            title = self.tag_to_string(link).strip()
            self.log('Found article ', title)
            url = self.PREFIX + link['href']
            articles.append({
                'title': title,
                'date': strftime(self.timefmt),
                'url': url,
                # 'description' is the key calibre expects, and it must be text
                'description': self.tag_to_string(desc).strip() if desc else '',
            })
        feeds.append((mainfeed,articles))

        return feeds

    def preprocess_html(self, soup):
        # determine if multipage, if not bail out
        multipage = soup.find('ul',attrs={'class':'blaettern'})
        if multipage is None:
            return soup
        
        # get all subsequent pages and delete multipage links
        next_pages = []
        for item in multipage.findAll('li'):
            # skip list entries that carry no link
            if item.a is None:
                continue
            nexturl = item.a['href']
            nexttitle = self.tag_to_string(item).strip()
            next_pages.append((self.PREFIX + nexturl, nexttitle))
        multipage.extract()

        # extract article from subsequent pages and insert at end of first page article
        firstpage = soup.find('body')
        firstpage_header = firstpage.find('div',attrs={'class':'vorspann'})
        firstpage_article = firstpage.find('div',attrs={'id':'artikel'})
        firstpage_header.insert(len(firstpage_header.contents),firstpage_article)

        for url, title in next_pages:
            next_soup = self.index_to_soup(url)
            next_article = next_soup.find('div',attrs={'id':'artikel'})

            # remove banner ad
            banner = next_article.find('div',attrs={'class':'BannerBug'})
            if banner:
                banner.extract()

            # remove remaining HTML comments
            comments = next_article.findAll(text=re.compile('google_ad'))
            for comment in comments:
                comment.extract()

            firstpage_header.insert(len(firstpage_header.contents), next_article)

        return firstpage_header
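
For anyone who wants to try it without the GUI: save the recipe to a file and feed it to ebook-convert to build the magazine from the command line (the filename here is just an example; --test limits the run to a few articles):

Code:
ebook-convert sz_magazin.recipe sz_magazin.epub --test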