View Single Post
Old 01-05-2011, 01:54 PM   #3
Junior Member
jwiv doesn't litter
Posts: 7
Karma: 110
Join Date: Dec 2010
Device: Kindle 3, Nexus 7 (2012)
I redid this pretty substantially, using the Chicago Tribune recipe as a base, since both are Tribune papers. It runs a lot faster now and is fairly clean. The only oddity is that The Baltimore Sun's version of the Nation/World feed flat-out will not work with Calibre (even when added as a basic recipe RSS feed). In terms of content, however, it is identical to the feed on the Chicago Tribune, so I've chosen to use that instead.

from __future__ import with_statement
__license__ = 'GPL 3'
# Both notices live in a single value: assigning __copyright__ twice would
# silently replace the original author's notice with the later one.
__copyright__ = 'Original 2009, Kovid Goyal <>; Modified 2011,  Josh Hall <>'
__docformat__ = 'restructuredtext en'


from calibre.web.feeds.news import BasicNewsRecipe

class BaltimoreSun(BasicNewsRecipe):
    """Calibre news recipe for The Baltimore Sun.

    Declares the recipe metadata, the tag filters applied to each
    downloaded page, the stylesheet, and the list of feeds.  Two hooks are
    overridden: ``get_article_url`` picks the article URL out of a feed
    entry and ``postprocess_html`` rewrites the parsed page before it is
    returned.
    """

    title = 'The Baltimore Sun'
    # Single credit line: a second ``__author__`` assignment would simply
    # overwrite the first and drop the original authors.
    __author__ = ('Kovid Goyal and Sujata Raman; '
                  'modified for use with The Baltimore Sun by Josh Hall')
    description = 'Politics, local and business news from Baltimore'
    language = 'en'
    oldest_article = 1
    max_articles_per_feed = 100
    remove_empty_feeds = True
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    #masthead_url = ''

    # Tag filters.  Each entry is a tag specification (name and/or
    # attribute values) describing which parts of the page to keep or drop.
    remove_tags_before = dict(name='div', attrs={'class': ['story', 'entry']})
    remove_tags_after = [
        dict(name='div', attrs={'class': 'shirttail-promo right clearfix'}),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': ["story", "entry-asset asset hentry"]}),
        dict(name='div', attrs={'id': ["pagebody", "story", "maincontentcontainer"]}),
    ]

    remove_tags = [
        {'id': [
            "moduleArticleTools", "content-bottom", "rail",
            "articleRelates module", "toolSet", "relatedrailcontent",
            "div-wrapper", "beta", "atp-comments", "footer", "article-promo",
        ]},
        {'class': [
            "entry-footer-left", "entry-footer-right",
            "shirttail-promo right clearfix", "clearfix", "relatedTitle",
            "articleRelates module", "asset-footer", "tools", "comments",
            "featurePromo", "featurePromo fp-topjobs brownBackground",
            "clearfix fullSpan brownBackground", "curvedContent",
            "toppaginate", "module", "module-header", "module-content",
        ]},
    ]

    # NOTE(review): the last selector below reads as nested <asset>/<hentry>
    # elements rather than a multi-class match -- confirm the intent.
    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
                    .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
                    .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
                    .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                '''

    # (title, url) pairs.
    # NOTE(review): every URL in this copy is empty or truncated to its
    # trailing part; the full feed addresses must be restored before this
    # recipe can fetch anything.
    feeds = [
        (u'Top Headlines', u''),
        (u'Breaking News', u''),
        (u'Top Maryland', u''),
        #(u'Anne Arundel County', u''),
        (u'Baltimore City', u''),
        #(u'Baltimore County', u''),
        #(u'Carroll County', u''),
        #(u'Harford County', u''),
        #(u'Howard County', u''),
        (u'Education', u''),
        #(u'Obituaries', u''),
        (u'Local Politics', u''),
        (u'Weather', u''),
        #(u'Traffic', u''),
        (u'Nation/world', u''),
        (u'Weird News', u''),
        (u'Top Sports', u''),
        (u'Orioles/Baseball', u''),
        (u'Ravens/Football', u''),
        #(u'Terps', u''),
        #(u'College Football', u''),
        #(u'Lacrosse', u''),
        #(u'Horse Racing', u''),
        #(u'Golf', u''),
        #(u'NBA', u''),
        #(u'High School', u''),
        #(u'Outdoors', u''),
        (u'Celebrity News', u''),
        (u'Arts & Theater', u''),
        (u'Movies', u''),
        (u'Music & Nightlife', u''),
        (u'Restaurants & Food', u''),
        (u'TV/Media', u''),

        (u'Health&Wellness', u''),
        (u'Home & Garden', u''),
        (u'Living Green', u''),
        (u'Parenting', u''),
        (u'Fashion', u''),
        (u'Travel', u''),
        (u'Faith', u''),
        (u'Top Business', u''),
        (u'Technology', u''),
        (u'Personal finance', u''),
        (u'Real Estate', u''),
        (u'Jobs', u''),
        (u'DIY', u''),
        (u'Consumer Safety', u''),
        (u'Investing', u''),

        (u'Sun Editorials', u''),
        (u'Op/Ed', u''),
        (u'Readers Respond', u''),

        (u'Kevin Cowherd', ',0,6829726.columnist-rss2.0.xml'),
        (u'Jay Hancock', u',0,6673611.columnist-rss2.0.xml'),
        (u'Jacques Kelly', u',0,1154701.columnist-rss2.0.xml'),
        (u'Marta H. Mossburg', u',0,7982155.columnist-rss2.0.xml'),
        (u'Mike Preston', u',0,6169796.columnist-rss2.0.xml'),
        (u'Susan Reimer', u',0,162466.columnist-rss2.0.xml'),
        (u'Dan Rodricks', u',0,7089843.columnist-rss2.0.xml'),
        (u'Thomas F. Schaller', u',0,897397.columnist-rss2.0.xml'),
        (u'Peter Schmuck', u',0,7485088.columnist-rss2.0.xml'),
        (u'Ron Smith', u',0,3964803.columnist-rss2.0.xml'),

        (u'Baltimore Crime Beat', u''),
        (u'Getting There', u''),
        (u'InsideEd', u''),
        (u'Maryland Politics', u''),
        (u'Maryland Weather', u''),
        (u'Second Opinion', u''),
        (u'You Dont Say', u''),
        (u'BaltTech', u''),
        (u'Consuming Interests', u''),
        (u'Jay Hancocks Blog', u''),
        (u'The Real Estate Wonk', u''),
        (u'Clef Notes', ''),
        (u'Dining at Large', u''),
        (u'Midnight Sun', u''),
        (u'Mike Sragow Gets Reel', u''),
        (u'Read Street', u''),
        (u'Reality Check', u''),
        (u'Z on TV', u''),

        (u'BMore Green', u''),
        (u'Charm City Moms', u''),
        (u'Exercists', u''),
        (u'Garden Variety', ''),
        #(u'In Good Faith', u''),
        (u'Picture of Health', u''),
        (u'Unleashed', u''),

        #(u'Faceoff', u''),
        #(u'MMA Stomping Grounds', u''),
        (u'Orioles Insider', u''),
        #(u'Outdoors Girl', u''),
        (u'Ravens Insider', u''),
        #(u'Recruiting Report', u''),
        #(u'Ring Posts', u''),
        (u'The Schmuck Stops Here', u''),
        (u'Toy Department', u''),
        #(u'Tracking the Terps', u''),
        #(u'Varsity Letters', u''),
        (u'Virtual Vensanity', u''),
    ]

    def get_article_url(self, article):
        """Return the URL to download for one feed entry.

        Looks up ``'feedburner_origlink'`` first, falling back to
        ``'guid'`` and then ``'link'`` when the earlier key is absent.
        The chosen value is also printed.

        :param article: feed entry exposing a dict-style ``get``.
        :return: the selected value, or ``None`` if no key is present.
        """
        url = article.get('feedburner_origlink',
                          article.get('guid', article.get('link')))
        # Call form of print behaves the same under Python 2 and 3 for a
        # single argument.
        print(url)
        return url

    def postprocess_html(self, soup, first_fetch):
        """Flatten table markup and strip leftover page furniture.

        :param soup: parsed article page; modified in place.
        :param first_fetch: not used by this implementation.
        :return: the same ``soup`` object.
        """
        # Turn table scaffolding into plain block containers.
        for t in soup.findAll(['table', 'tr', 'td']):
            t.name = 'div'

        # NOTE(review): the loop bodies were missing from this copy of the
        # recipe; removing the matched tags is assumed -- confirm.
        # ``attrs`` is passed by keyword so the filter is applied to the
        # tag's attributes rather than being wrapped in an outer dict.
        for tag in soup.findAll('form', attrs={'name': ["comments_form"]}):
            tag.extract()
        for tag in soup.findAll('font', attrs={'id': ["cr-other-headlines"]}):
            tag.extract()

        return soup
jwiv is offline   Reply With Quote