View Single Post
Old 02-19-2011, 07:45 PM   #5
spedinfargo
Groupie
spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.spedinfargo is the king of the Divan.
 
Posts: 155
Karma: 106422
Join Date: Nov 2010
Device: none
Updated version. Kovid, is there something I should do to check in my changes, or do you just copy and paste from here?

Code:
import re
import urllib2
import time
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
from calibre import strftime

'''
      Help Needed:
       I still can't figure out why I'm getting strange characters, especially in the Great Movies descriptions in the TOC.
       Can anyone help me figure that out?
       
      Change Log:
       2011-02-19:  Version 2:  Added "Oscars" section and fixed date problem
'''

class Ebert(BasicNewsRecipe):
    """Recipe that builds its article index by scraping rogerebert.suntimes.com.

    Each entry in ``feeds`` is a section listing page.  ``parse_index``
    downloads the page, pulls the article link and blurb out of it with the
    regular expression registered for that section, and hands the resulting
    article list back to the recipe framework.
    """
    title                 = 'Roger Ebert'
    __author__            = 'Shane Erstad'
    version               = 2
    description           = 'Roger Ebert Movie Reviews'
    publisher             = 'Chicago Sun Times'
    category              = 'movies'
    oldest_article        = 8
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'UTF-8'
    masthead_url          = 'http://rogerebert.suntimes.com/graphics/global/roger.jpg'
    language              = 'en'
    remove_empty_feeds    = False
    # Article hrefs on the listing pages are site-relative; this is prepended.
    PREFIX                  = 'http://rogerebert.suntimes.com'
    # Review listings: group 1 = movie title, group 2 = headline markup holding
    # the link, group 3 = blurb.
    patternReviews                = r'<span class="*?movietitle"*?>(.*?)</span>.*?<div class="*?headline"*?>(.*?)</div>(.*?)</div>'
    # Remaining listings: group 1 = the <a> element, group 2 = blurb.
    patternCommentary       = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?COMMENTARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternPeople           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?PEOPLE.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternOscars           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?OSCARS.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'
    patternGlossary           = r'<div class="*?headline"*?>.*?(<a href="/apps/pbcs.dll/article\?AID=.*?GLOSSARY.*?" id="ltred">.*?</a>).*?<div class="blurb clear">(.*?)</div>'

    # Feed title -> (listing pattern, whether the pattern starts with a
    # separate movie-title group).  One table replaces the repeated if/elif
    # chains, so a feed without an entry is detected instead of silently
    # reusing the pattern of the previous feed.
    SECTION_PATTERNS = {
        'Reviews'      : (patternReviews, True),
        'Great Movies' : (patternReviews, True),
        'Commentary'   : (patternCommentary, False),
        'People'       : (patternPeople, False),
        'Oscars'       : (patternOscars, False),
        'Glossary'     : (patternGlossary, False),
    }

    # Captures the first path segment after "AID=/" in an article URL, which
    # parse_index reads as a YYYYMMDD date.  Compiled once, not per article.
    AID_DATE_RE = re.compile(r'AID=\/(.*?)\/', re.IGNORECASE|re.DOTALL)
    DATE_FORMAT = "%A, %B %d, %Y"

    conversion_options = {
                          'comment'          : description
                        , 'tags'             : category
                        , 'publisher'        : publisher
                        , 'language'         : language
                        , 'linearize_tables' : True
                        }


    feeds          = [
                        (u'Reviews'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=reviews' )
                        ,(u'Commentary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=COMMENTARY')
                        ,(u'Great Movies'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=REVIEWS08')
                        ,(u'People'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=PEOPLE')
                        ,(u'Oscars'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=OSCARS')
                        ,(u'Glossary'   , u'http://rogerebert.suntimes.com/apps/pbcs.dll/section?category=GLOSSARY')

                     ]

    # Drops the "This is a printer friendly..." banner (through the following
    # <hr>) from downloaded article pages.
    preprocess_regexps = [
        (re.compile(r'<font.*?>.*?This is a printer friendly.*?</font>.*?<hr>', re.DOTALL|re.IGNORECASE),
            lambda m: '')
    ]

    def print_version(self, url):
        """Return ``url`` with the ``template=printart`` query parameter appended."""
        return url + '&template=printart'

    def _fetch_page(self, url):
        """Download ``url`` and return the body, always closing the response."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _article_date(self, url):
        """Return the display date for an article URL.

        The date is read from the ``AID=/YYYYMMDD/`` segment of ``url``.  When
        the segment is missing or is not a valid date, the current date is
        used so that a single odd URL cannot abort the whole download.
        """
        found = self.AID_DATE_RE.search(url)
        if found:
            try:
                parsed = time.strptime(found.group(1), "%Y%m%d")
            except ValueError:
                self.log('\tUnparseable date in url: ', url)
            else:
                return strftime(self.DATE_FORMAT, parsed)
        return strftime(self.DATE_FORMAT)

    def parse_index(self):
        """Scrape every section listing page and build the article index.

        Returns a list of ``(feed title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``date``, ``url`` and
        ``description``.  Feeds whose title has no entry in
        ``SECTION_PATTERNS`` are logged and skipped.
        """
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            self.log('\tFeedurl: ', feedurl)
            self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))

            section = self.SECTION_PATTERNS.get(feedtitle)
            if section is None:
                self.log('\tNo listing pattern for feed %r, skipping' % (feedtitle,))
                continue
            pattern, has_movie_title = section
            regex = re.compile(pattern, re.IGNORECASE|re.DOTALL)

            # NOTE(review): the page is matched as raw, undecoded bytes and the
            # blurb is passed on unchanged; this is presumably where the
            # "strange characters" in descriptions come from -- confirm the
            # site's real encoding before adding a decode step here.
            page = self._fetch_page(feedurl)

            articles = []
            for match in regex.finditer(page):
                if has_movie_title:
                    movietitle, thislink, description = match.group(1, 2, 3)
                else:
                    movietitle = None
                    thislink, description = match.group(1, 2)

                self.log(thislink)

                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
                    thisurl = self.PREFIX + link['href']
                    # Reviews carry their own title; elsewhere the link text is the title.
                    if has_movie_title:
                        thistitle = movietitle
                    else:
                        thistitle = self.tag_to_string(link)

                    if thistitle == '':
                        continue

                    mydate = self._article_date(thisurl)
                    self.log(mydate)

                    articles.append({
                                      'title'      :thistitle
                                     ,'date'       :'  [' + mydate + ']'
                                     ,'url'        :thisurl
                                     ,'description':description
                                    })
            totalfeeds.append((feedtitle, articles))

        return totalfeeds
spedinfargo is offline   Reply With Quote