View Single Post
Old 09-07-2010, 10:28 AM   #2665
JvdW
Zealot
JvdW doesn't litterJvdW doesn't litter
 
Posts: 115
Karma: 150
Join Date: Jul 2008
Location: Netherlands Veenendaal
Device: Palm T5, Sony PRS-505, Nook Color
Quote:
Originally Posted by kovidgoyal View Post
@JvdW: nrcnext uses the parse index function to get a list of articles and the website has changed, so it fails. Unfortunately, as I don't read Dutch, it's hard for me to fix.
Thanks for the parse index function pointer. After reading up a bit and pasting part of the recipe into the editor of PortablePython I managed to find the problematic line:
Code:
for post in soup.findAll(True, attrs={'class' : 'post'}) :
should be:
Code:
for post in soup.findAll(True, attrs={'class' : 'post '}) :
Note the space after post!

This fixes the recipe, but one problem remains now that it generates an epub again:
how do I get rid of the comments section? I can't remember seeing those a couple of weeks ago, so that part of the site must have changed as well, but adding things like:
remove_tags.append(dict(name = 'h3', attrs = {'class' : 'reacties'}))
doesn't seem to help much. After looking a bit further and experimenting, I found a combination that works. The complete recipe is as follows:
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class NrcNextRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Dutch newsblog nrcnext (www.nrcnext.nl).

    The article index is built by scraping the section pages of the website
    (see ``parse_index``); every downloaded page that is a single article is
    reduced to the contents of its post (see ``preprocess_html``).
    """

    __license__  = 'GPL v3'
    __author__ = 'kwetal'
    language = 'nl'
    country = 'NL'
    version = 2

    title = u'nrcnext'
    publisher = u'NRC Media'
    category = u'News, Opinion, the Netherlands'
    description = u'Dutch newsblog from the Dutch daily newspaper nrcnext.'

    conversion_options = {'comments': description, 'language': language, 'publisher': publisher}

    no_stylesheets = True
    remove_javascript = True

    keep_only_tags = [dict(name='div', attrs={'id' : 'main'})]

    # Page furniture and the reader comments section. The class values are
    # matched literally, including the trailing spaces the site emits.
    remove_tags = [
        dict(name='div', attrs={'class' : 'meta'}),
        dict(name='p', attrs={'class' : 'meta'}),
        dict(name='div', attrs={'class' : 'datumlabel'}),
        dict(name='div', attrs={'class' : 'sharing-is-caring'}),
        dict(name='div', attrs={'class' : 'navigation'}),
        dict(name='div', attrs={'class' : 'reageer'}),
        dict(name='div', attrs={'class' : 'comment odd alt thread-odd thread-alt depth-1 reactie '}),
        dict(name='div', attrs={'class' : 'comment even thread-even depth-1 reactie '}),
        dict(name='ul', attrs={'class' : 'cats single'}),
        dict(name='ul', attrs={'class' : 'cats onderwerpen'}),
        dict(name='ul', attrs={'class' : 'cats rubrieken'}),
        dict(name='h3', attrs={'class' : 'reacties'}),
    ]

    extra_css = '''
                body {font-family: verdana, arial, helvetica, geneva, sans-serif; text-align: left;}
                p.wp-caption-text {font-size: x-small; color: #666666;}
                h2.sub_title {font-size: medium; color: #696969;}
                h2.vlag {font-size: small; font-weight: bold;}
                '''

    def parse_index(self):
        """Build the article index by scraping the section pages of the site.

        Returns a list of ``(section title, [article dict, ...])`` tuples in
        the order produced by ``sort_index_by``. Sections in which no
        article was found are left out. Posts without a usable
        ``rel="bookmark"`` link are skipped instead of aborting the download.
        """
        # Use the website as an index. Their RSS feeds can be out of date.
        feeds = {}
        feeds[u'columnisten'] = u'http://www.nrcnext.nl/columnisten/'
        feeds[u'koken'] = u'http://www.nrcnext.nl/koken/'
        feeds[u'geld & werk'] = u'http://www.nrcnext.nl/geld-en-werk/'
        feeds[u'vandaag'] = u'http://www.nrcnext.nl'
        # Disabled section:
        # feeds[u'city life in afrika']  = u'http://www.nrcnext.nl/city-life-in-afrika/'
        articles = {}
        indices = []

        for index, feed in feeds.items():
            soup = self.index_to_soup(feed)
            # Note the trailing space in the class value; the site emits it.
            for post in soup.findAll(True, attrs={'class' : 'post '}):
                # Find the link to the actual article and remember the
                # location it points to and its title.
                a = post.find('a', attrs={'rel' : 'bookmark'})
                if a is None or not a.get('href'):
                    # No article link in this post: nothing to download.
                    continue
                href = a['href']
                title = self.tag_to_string(a)

                completeTitle = title
                if index == 'columnisten':
                    # In this feed/page articles can be written by more than
                    # one author. It is nice to see their names in the titles.
                    author = self._columnist_name(post)
                    if author:
                        completeTitle = u''.join([author, u': ', title])

                article = {'title' : completeTitle, 'date' : u'', 'url'  : href, 'description' : '<p>&nbsp;</p>'}
                articles.setdefault(index, []).append(article)

            indices.append(index)

        # Sort the sections in the order they appear on the website
        # (u'city life in afrika' was 4 when that section was enabled).
        indices = self.sort_index_by(indices, {u'columnisten' : 1, u'koken' : 3, u'geld & werk' : 2, u'vandaag' : 0})
        # Apply this sort order to the actual list of feeds and articles.
        return [(key, articles[key]) for key in indices if key in articles]

    def _columnist_name(self, post):
        """Return the author name from the ``h2.vlag`` flag of *post*, or None."""
        flag = post.find('h2', attrs={'class' : 'vlag'})
        if flag is None or not flag.contents:
            return None
        first = flag.contents[0]
        render = getattr(first, 'renderContents', None)
        if render is None:
            # The first child is a plain text node; use it as is.
            return first
        author = render()
        if isinstance(author, bytes):
            # renderContents() yields an encoded byte string; decode it
            # explicitly so it can be joined with the unicode title.
            # NOTE(review): assumes the default UTF-8 output encoding.
            author = author.decode('utf-8')
        return author

    def _plain_heading(self, soup, old, name, css_class=None):
        """Replace heading *old* by a new *name* tag holding only its text."""
        if css_class is None:
            new = Tag(soup, name)
        else:
            new = Tag(soup, name, attrs=[('class', css_class)])
        new.append(self.tag_to_string(old))
        old.replaceWith(new)

    def preprocess_html(self, soup):
        """Reduce a single-article page to its post and return the result.

        Pages that are not a single article, or that contain no
        ``div.post``, are returned unchanged. The soup is always returned;
        the method never falls through and returns ``None``.
        """
        if not soup.find('div', attrs={'id' : 'main', 'class' : 'single'}):
            return soup

        tag = soup.find('div', attrs={'class' : 'post'})
        if not tag:
            # This should never happen and other famous last words...
            return soup

        # Strip markup inside the headings, keeping only their text. The
        # flag heading wins; otherwise the first h2 becomes the sub title.
        h2 = tag.find('h2', 'vlag')
        if h2:
            self._plain_heading(soup, h2, 'h2', 'vlag')
        else:
            h2 = tag.find('h2')
            if h2:
                self._plain_heading(soup, h2, 'h2', 'sub_title')

        h1 = tag.find('h1')
        if h1:
            self._plain_heading(soup, h1, 'h1')

        # Embedded movies and iframes slow down the reader; drop them.
        for movie_class in ('vvqbox vvqvimeo', 'vvqbox vvqyoutube'):
            for movie in tag.findAll('span', attrs={'class' : movie_class}):
                movie.extract()
        for iframe in tag.findAll('iframe'):
            iframe.extract()

        fresh_soup = self.getFreshSoup(soup)
        fresh_soup.body.append(tag)
        return fresh_soup

    def getFreshSoup(self, oldSoup):
        """Return an empty HTML document that carries the title of *oldSoup*.

        The title is copied only when *oldSoup* has both a head and a title.
        """
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        head = oldSoup.head
        if head is not None and head.title:
            freshSoup.head.title.append(self.tag_to_string(head.title))
        return freshSoup


Kovid, will you please update the builtin recipe with this updated code?

Thanks in advance,

Joop
JvdW is offline