MobileRead Forums - View Single Post

Selcal · 04-27-2011, 01:07 PM

A week with no trouble! I'm happy to release this now. Python is not my first language, so any optimizations are welcome. I've tried to print relevant information so that the job details show what is happening.

The file is attached as a zip, and is also visible here:

Spoiler:

Code:

from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
from calibre.web.feeds.news import BasicNewsRecipe

class Volkskrant_full(BasicNewsRecipe):
    """Download the De Volkskrant newspaper from the subscribers' site.

    A subscription (username and password) is required.

    Known issues:
      * Articles that are spread out over multiple pages appear multiple
        times.
      * Pages that contain only adverts appear, but empty.
      * The Saturday supplement 'Volkskrant Magazine' is currently not
        downloaded.

    Downloading an archived edition (the site stored over a month of
    editions at the time of writing):
      1. Set RETRIEVEDATE below to a fixed 'yyyymmdd' string.
      2. Add that date to ``title`` (see the example in the comment next
         to it).
      3. Uncomment the ``timefmt = ''`` line so calibre does not add
         today's date to the title as well.
    """

    # For an archived edition use e.g. 'De Volkskrant [za, 13 nov 2010]'.
    title = 'De Volkskrant'
    __author__ = u'Selcal'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True

    # Uncomment when a manual date was added to the title (see docstring).
    # timefmt = ''

    # Set RETRIEVEDATE to 'yyyymmdd' to load an older edition; otherwise
    # keep '%Y%m%d' so that today's edition is fetched.
    RETRIEVEDATE = strftime('%Y%m%d')
    INDEX_MAIN = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/#text'
    INDEX_ARTICLE = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/'
    LOGIN = 'http://www.volkskrant.nl/vk/user/loggedIn.do'
    remove_tags = [dict(name='address')]
    cover_url = 'http://www.volkskrant.nl/vk-online/VK/' + RETRIEVEDATE + '___/VKN01_001/page.jpg'

    # Number of times a page load is attempted before giving up on it.
    LOAD_ATTEMPTS = 5

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        The login page is opened, its first form is filled in with the
        recipe's username and password, and the form is submitted.
        """
        # The base implementation is an instance method: pass self along.
        br = BasicNewsRecipe.get_browser(self)

        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=0)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    @staticmethod
    def _strip_title(_title):
        """Return the part of ``_title`` before its first colon.

        A title without a colon is returned unchanged.
        """
        return _title.partition(':')[0]

    def _load_soup(self, url, retry_message):
        """Load ``url`` as soup, trying up to LOAD_ATTEMPTS times.

        ``retry_message`` is printed after every failed attempt.

        Returns:
            The parsed page, or None when every attempt failed.
        """
        for _attempt in range(self.LOAD_ATTEMPTS):
            try:
                return self.index_to_soup(url)
            except Exception:
                # Not a bare except: KeyboardInterrupt/SystemExit must
                # still be able to abort the download.
                print(retry_message)
        return None

    def _article_from_area(self, soup, area, page, section_url, date):
        """Build the article entry for one <area> of a section page.

        Args:
            soup: parsed section index page.
            area: the <area> element describing the article.
            page: value of the section's <option> in the page selector.
            section_url: base URL of the section, ending in '/'.
            date: date string stored with the article.

        Returns:
            A dict with 'title', 'date', 'url' and 'description', or None
            when the article has to be skipped.
        """
        art_nr = area['class']
        # The matching title <div> carries a class derived from the area's
        # class: its first 12 characters, '_section' plus the first 5
        # characters of the page value, '_', and everything from
        # character 26 onwards.
        attrname = art_nr[0:12] + '_section' + page[0:5] + '_' + art_nr[26:]
        print('==> Found: ' + attrname)
        index_title = soup.find('div', attrs={'class': attrname})
        if index_title is None:
            print('(Skipping article, no title element found)')
            return None
        full_title = index_title['title']
        article_url = section_url + attrname + '.html#text'
        print('--> Title: ' + full_title)
        print('--> URL: ' + article_url)

        frameset = self._load_soup(article_url, '(Retrying URL load)')
        if frameset is None:
            print('(Skipping article, page could not be loaded)')
            return None
        frames = frameset.findAll('frame')
        if not frames:
            print('(Skipping article, page contains no frames)')
            return None
        headerurl = frames[0]['src']
        print('--> Read frame name for header: ' + headerurl)
        # Swap the last 12 characters of the header frame name for
        # '_text.html' (presumably the suffix is '_header.html' -- confirm).
        url = section_url + headerurl[:-12] + '_text.html'
        print('--> Corrected URL: ' + url)

        title = self._strip_title(full_title)
        if not title:
            return None
        return {
            'title': title,
            'date': date,
            'url': url,
            'description': '',
        }

    def parse_index(self):
        """Collect the sections of the edition and the articles in each.

        Sections are read from the page selector on the main index; a
        section whose index cannot be loaded is skipped.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of dicts with the keys 'title', 'date',
            'url' and 'description'.

        Raises:
            RuntimeError: when the main index cannot be loaded or does not
                contain the page selector.
        """
        krant = []
        edition_url = ('http://www.volkskrant.nl/vk-online/VK/'
                       + self.RETRIEVEDATE + '___/')
        # Identical for every article, so compute it only once.
        date = strftime(' %B %Y')

        soup = self._load_soup(self.INDEX_MAIN, '(Retrying main index load)')
        if soup is None:
            raise RuntimeError('Could not load main index: ' + self.INDEX_MAIN)
        mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
        if mainsoup is None:
            raise RuntimeError('No page selector found on main index: '
                               + self.INDEX_MAIN)

        for option in mainsoup.findAll('option'):
            page = option['value']
            section_url = edition_url + page + '/'
            section_index = section_url + '#text'
            print('')
            print('<-------    Processing section: ' + section_index + ' ------------------------->')
            soup = self._load_soup(section_index, '(Retrying index load)')
            if soup is None:
                # Never fall back on the page of a previous section.
                print('(Skipping section, index could not be loaded)')
                continue

            articles = []
            for area in soup.findAll('area'):
                article = self._article_from_area(
                    soup, area, page, section_url, date)
                if article is not None:
                    articles.append(article)
            krant.append((option.string, articles))
        return krant