View Single Post
Old 02-19-2011, 06:33 PM   #2
spedinfargo
Groupie
spedinfargo is the king of the Divan.
 
Posts: 155
Karma: 106422
Join Date: Nov 2010
Device: none
Code:

import re
import urllib2
import time
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer

class EbertJournal(BasicNewsRecipe):
    """Calibre news recipe for the Roger Ebert Journal blog.

    The blog front page listed in ``feeds`` is scraped directly by
    ``parse_index`` (rather than read as an RSS feed): every entry block on
    the page becomes one article whose full text is then downloaded from the
    entry's own URL.
    """

    title                 = 'Roger Ebert Journal'
    __author__            = 'Shane Erstad'
    description           = 'Roger Ebert Journal'
    publisher             = 'Chicago Sun Times'
    category              = 'movies'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'ISO-8859-1'
    masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language              = 'en'
    remove_empty_feeds    = False
    PREFIX                = 'http://blogs.suntimes.com/ebert'

    # Keep only the part of each article page between the main content
    # container and the start of the comment form.
    remove_tags_before = dict(id='content')
    remove_tags_after = dict(id='comments-open')

    # The padding shorthand takes space-separated values; the previous
    # comma-separated form ("5px, 10px") is invalid CSS.
    extra_css             = """
                                @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
                                .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif}
                                .color-2{display:block; margin-bottom: 10px; padding: 5px 10px;
                                border-left: 1px solid #D00000; color: #D00000}
                                img{margin-bottom: 0.8em} """

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
        'linearize_tables': True,
    }

    feeds = [
        (u'Roger Ebert Journal', u'http://blogs.suntimes.com/ebert/'),
    ]

    # Textual clean-ups applied to the raw HTML of each downloaded page.
    preprocess_regexps = [

        # Unwrap the byline when the author is Roger Ebert himself ...
        (re.compile(r'<span class="vcard author">Roger Ebert</span>', re.DOTALL|re.IGNORECASE),
            lambda m: 'Roger Ebert'),

        # ... and put a rule in front of every remaining author span.
        (re.compile(r'<span class="vcard author">', re.DOTALL|re.IGNORECASE),
            lambda m: '<hr width="80%"><span class="vcard author">'),

        # Drop opening blockquote tags.
        (re.compile(r'<blockquote>', re.DOTALL|re.IGNORECASE),
            lambda m: ''),

        # Remove anchors carrying the "a2a_dd" class.
        (re.compile(r'<a class="a2a_dd".*?</a>', re.DOTALL|re.IGNORECASE),
            lambda m: ''),

        (re.compile(r'<h2 class="comments-open-header">Leave a comment</h2>', re.DOTALL|re.IGNORECASE),
            lambda m: ''),

        # NOTE(review): this pattern has no leading "<", so the opening angle
        # bracket of a matched "Reply" anchor is presumably left behind in
        # the output -- confirm against the live markup before changing it.
        (re.compile(r'a title="Reply".*?</a>', re.DOTALL|re.IGNORECASE),
            lambda m: '')
    ]

    def parse_index(self):
        """Build the article index by scraping each configured blog page.

        Returns:
            A list of ``(feed_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``title``, ``date``, ``url``
            and ``description``.

        Entries without a title link carrying an ``href`` are skipped (and
        logged) instead of aborting the whole download; a missing teaser
        image, date or body is tolerated.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll(attrs={'class':['entry-asset asset hentry']}):

                # Strip the left-aligned image, if the entry has one, so it
                # does not end up in the description text.
                image = item.find(attrs={'class':['mt-image-left']})
                if image is not None:
                    image.replaceWith("")

                bodysection = item.find(attrs={'class':['asset-body']})
                datesection = item.find(attrs={'class':['published']})
                titlesection = item.find(attrs={'class':['asset-name entry-title']})

                self.log(bodysection)

                # Without a title link there is no URL to download.
                link = titlesection.find('a') if titlesection is not None else None
                url = link.get('href') if link is not None else None
                if not url:
                    self.log('Skipping entry without a title link')
                    continue

                title = self.tag_to_string(link)
                self.log(url)
                self.log(title)

                date = self.tag_to_string(datesection) if datesection is not None else ''
                summary = self.tag_to_string(bodysection) if bodysection is not None else ''
                articles.append({
                    'title': title,
                    'date': ' [' + date + ']',
                    'url': url,
                    'description': summary,
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds

Last edited by spedinfargo; 11-25-2011 at 02:51 PM.
spedinfargo is offline   Reply With Quote