View Single Post
Old 09-06-2013, 11:26 AM   #8
Divingduck
Wizard
Divingduck ought to be getting tired of karma fortunes by now.
 
Posts: 1,166
Karma: 1410083
Join Date: Nov 2010
Location: Germany
Device: Sony PRS-650
A new update for this recipe: HCN has made some changes to its website, so the cleanup rules needed updating.

Spoiler:
Code:
# -*- coding: utf-8 -*-
##
## Written:      2012-01-28
## Last Edited:  2013-09-06
## Remark:       Version 1.3 
##               Update cleanup for new web article design
##
__license__   = 'GPL v3'
__copyright__ = '2013, Armin Geller'

'''
Fetch High Country News
'''
from calibre.web.feeds.news import BasicNewsRecipe
class HighCountryNews(BasicNewsRecipe):
    """Calibre recipe that fetches High Country News via its RSS feeds.

    Downloads articles from the feedburner feeds below, keeps only the
    main ``#content`` div, strips site chrome/paywall boxes, and stitches
    multi-page articles together into a single page.
    """

    title                 = u'High Country News'
    description           = u'High Country News (RSS Version)'
    __author__            = 'Armin Geller'
    publisher             = 'High Country News'
    category              = 'news, politics'
    timefmt               = ' [%a, %d %b %Y]'
    # FIX: was 'en-Us' -- calibre expects a plain ISO-639 language code.
    language              = 'en'
    encoding              = 'UTF-8'
    publication_type      = 'newspaper'
    oldest_article        = 14
    max_articles_per_feed = 100
    no_stylesheets        = True
    auto_cleanup          = False
    remove_javascript     = True
    remove_empty_feeds    = True
    use_embedded_content  = False

    masthead_url          = 'http://www.hcn.org/logo.jpg'
    cover_source          = 'http://www.hcn.org'  # front page, scraped for the cover image

    def get_cover_url(self):
        """Scrape the cover image URL from the hcn.org front page.

        Returns None (no cover) instead of raising AttributeError when the
        expected markup is missing, so a site redesign does not abort the
        whole download.
        """
        cover_source_soup = self.index_to_soup(self.cover_source)
        # NOTE(review): the leading space in this class string is deliberate --
        # it must match the site's class attribute exactly; verify on change.
        preview_image_div = cover_source_soup.find(
            attrs={'class': ' portaltype-Plone Site content--hcn template-homepage_view'})
        if preview_image_div is None or preview_image_div.div is None:
            return None
        img = preview_image_div.div.img
        return img['src'] if img is not None else None

    feeds = [
              (u'Most recent', u'http://feeds.feedburner.com/hcn/most-recent?format=xml'),
              (u'Current Issue', u'http://feeds.feedburner.com/hcn/current-issue?format=xml'),

              (u'From the Blogs', u'http://feeds.feedburner.com/hcn/FromTheBlogs?format=xml'),
              (u'Heard around the West', u'http://feeds.feedburner.com/hcn/heard?format=xml'),
              (u'The GOAT Blog', u'http://feeds.feedburner.com/hcn/goat?format=xml'),
              (u'The Range', u'http://feeds.feedburner.com/hcn/range?format=xml'),

              (u'Writers on the Range', u'http://feeds.feedburner.com/hcn/wotr'),
              (u'High Country Views', u'http://feeds.feedburner.com/hcn/HighCountryViews'),
             ]

    # 2013-07-23 AGe New coding w/o using print_version

    keep_only_tags    = [
                          dict(name='div', attrs={'id':['content']}),
                        ]

    remove_tags = [
                    dict(name='div', attrs={'class':['documentActions supercedeDocumentActions editorialDocumentActions',
                                                      'documentActions supercedeDocumentActions editorialDocumentActions editorialFooterDocumentActions',
                                                      'article-sidebar',
                                                      'image-viewer-controls nojs',
                                                      'protectedArticleWrapper',
                                                      'visualClear',
                                                      'feed-icons', #2013-09-06 AGe add
                                                      'PayWallEmail', #2013-09-06 AGe add
                                                     ]}),
                    dict(name='div', attrs={'id':['offer-below-locked-article']}), #2013-09-06 AGe add
                  ]

    INDEX                 = ''  # prefix for relative "next page" links (site uses absolute URLs)

    def append_page(self, soup, appendtag, position):
        """Recursively fetch follow-on pages of a multi-page article and
        splice their 'article-text' div into *appendtag* at *position*.

        Silently stops when there is no "next" pager link, or when the
        fetched page lacks the expected article-text div (FIX: previously
        crashed with AttributeError in that case).
        """
        pager = soup.find('span', attrs={'class': 'next'})
        if pager is None or pager.a is None:
            return
        nexturl = self.INDEX + pager.a['href']
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'article-text'})
        if texttag is None:
            return
        # Depth-first: pull in any further pages before splicing this one.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge continuation pages into the article body, drop the
        pagination bar, and fix up images for Adobe Digital Editions."""
        self.append_page(soup, soup.body, 3)
        pager = soup.find('div', attrs={'class': 'listingBar listingBar-article'})
        if pager:
            pager.extract()
        return self.adeify_images(soup)
Attached Files
File Type: zip HighCountryNews_AGeV1.3.zip (1.5 KB, 259 views)
Divingduck is offline   Reply With Quote