![]() |
#1 |
Junior Member
![]() ![]() Posts: 7
Karma: 110
Join Date: Dec 2010
Device: Kindle 3, Nexus 7 (2012)
|
Baltimore Sun Recipe (updated)
Finally got around to updating this to match the newer revisions of the Chicago Tribune recipe, which resolve some formatting, multi-page, and blog issues that were introduced by site changes. Again, all real credit should go to Kovid Goyal, Sujata Raman, and a.peter for their work on the base recipe.
Code:
from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012 Josh Hall<jwtheiv@gmail.com>' __docformat__ = 'restructuredtext en' import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class BaltimoreSun(BasicNewsRecipe): title = 'The Baltimore Sun' __author__ = 'Kovid Goyal and Sujata Raman, a.peter' __author__ = 'Revised for use with the Baltimore Sun by Josh Hall' description = 'Complete local news and blogs from Baltimore' language = 'en' version = 2 oldest_article = 1 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True remove_javascript = True recursions = 1 keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}), dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}), ] remove_tags_after = [{'class':['photo_article',]}] match_regexps = [r'page=[0-9]+'] remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']}, {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','outbrainTools', 'google-ad-story-bottom']}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} 
.entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' feeds = [ ## News ## (u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'), (u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'), (u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'), #(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'), (u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'), #(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'), #(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'), #(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'), #(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'), (u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'), #(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'), (u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'), (u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'), #(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'), (u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'), (u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'), ##Sports## (u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'), (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'), (u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'), #(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'), 
#(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'), #(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'), #(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'), #(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'), #(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'), #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'), #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'), ## Entertainment ## (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'), (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'), (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'), (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'), ## Life ## (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'), (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'), (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'), (u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'), (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'), (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'), #(u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'), ## Business ## (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'), (u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'), (u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'), (u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'), 
(u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'), (u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'), (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), ## Opinion## (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'), (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'), (u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'), ## Columnists ## (u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'), (u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'), (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'), (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'), (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'), (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'), (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'), (u'Thomas F. 
Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'), (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'), ## News Blogs ## (u'Baltimore Crime Beat', u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), (u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'), (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), (u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'), ## Business Blogs ## (u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'), (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), ## Entertainment Blogs ## (u'Clef Notes & Drama Queens', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'), (u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'), (u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'), (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), ## Life Blogs ## (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), (u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), (u'Picture of Health', 
u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'Game Cache', u'http://www.baltimoresun.com/entertainment/bthesite/game-cache/rss2.0.xml'), (u'TV Lust', u'http://www.baltimoresun.com/entertainment/bthesite/tv-lust/rss2.0.xml'), ## Sports Blogs ## (u'Baltimore Sports Blitz', u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), #(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'), #(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'), (u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'), (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), #(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'), #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'), (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'), #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'), ] def get_article_url(self, article): ans = None try: s = article.summary ans = urllib.unquote( re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) except: pass if ans is None: ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) if ans is not None: return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') if text: a = text.parent url = a.get('href') if url: return self.index_to_soup(url, raw=True) def postprocess_html(self, soup, first_fetch): # Remove the navigation bar. It was kept until now to be able to follow # the links to further pages. But now we don't need them anymore. 
for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}): nav.extract() for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): tag.extract() for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): tag.extract() return soup |
![]() |
![]() |
![]() |
#2 | |
Wizard
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 1,835
Karma: 4985051
Join Date: Sep 2010
Location: Maryland
Device: Kindle
|
Thanks so much for this recipe!
I notice the latest version of Calibre (0.9.2) mentions Quote:
Thanks for your time. ![]() (GO ORIOLES!) |
|
![]() |
![]() |
Advert | |
|
![]() |
#3 |
Junior Member
![]() ![]() Posts: 7
Karma: 110
Join Date: Dec 2010
Device: Kindle 3, Nexus 7 (2012)
|
Oh that's very nice. And now added.
Code:
from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012 Josh Hall<jwtheiv@gmail.com>' __docformat__ = 'restructuredtext en' import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class BaltimoreSun(BasicNewsRecipe): title = 'The Baltimore Sun (revised)' __author__ = 'Kovid Goyal and Sujata Raman, a.peter' __author__ = 'Revised for use with the Baltimore Sun by Josh Hall' description = 'Complete local news and blogs from Baltimore' language = 'en' version = 2.1 oldest_article = 1 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True remove_javascript = True recursions = 1 ignore_duplicate_articles = {'title'} keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}), dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}), ] remove_tags_after = [{'class':['photo_article',]}] match_regexps = [r'page=[0-9]+'] remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']}, {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','outbrainTools', 'google-ad-story-bottom']}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} 
.story{font-family:Arial,Helvetica,sans-serif;font-size:small;} .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' feeds = [ ## News ## (u'Top Headlines', u'http://www.baltimoresun.com/rss2.0.xml'), (u'Breaking News', u'http://www.baltimoresun.com/news/breaking/rss2.0.xml'), (u'Top Maryland', u'http://www.baltimoresun.com/news/maryland/rss2.0.xml'), #(u'Anne Arundel County', u'http://www.baltimoresun.com/news/maryland/anne-arundel/rss2.0.xml'), (u'Baltimore City', u'http://www.baltimoresun.com/news/maryland/baltimore-city/rss2.0.xml'), #(u'Baltimore County', u'http://www.baltimoresun.com/news/maryland/baltimore-county/rss2.0.xml'), #(u'Carroll County', u'http://www.baltimoresun.com/news/maryland/carroll/rss2.0.xml'), #(u'Harford County', u'http://www.baltimoresun.com/news/maryland/harford/rss2.0.xml'), #(u'Howard County', u'http://www.baltimoresun.com/news/maryland/howard/rss2.0.xml'), (u'Education', u'http://www.baltimoresun.com/news/education/rss2.0.xml'), #(u'Obituaries', u'http://www.baltimoresun.com/news/obituaries/rss2.0.xml'), (u'Local Politics', u'http://www.baltimoresun.com/news/maryland/politics/rss2.0.xml'), (u'Weather', u'http://www.baltimoresun.com/news/weather/rss2.0.xml'), #(u'Traffic', u'http://www.baltimoresun.com/features/commuting/rss2.0.xml'), (u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'), (u'Weird News', u'http://www.baltimoresun.com/news/offbeat/rss2.0.xml'), ##Sports## (u'Top Sports', u'http://www.baltimoresun.com/sports/rss2.0.xml'), (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'), (u'Ravens/Football', u'http://www.baltimoresun.com/sports/ravens/rss2.0.xml'), 
#(u'Terps', u'http://www.baltimoresun.com/sports/terps/rss2.0.xml'), #(u'College Football', u'http://www.baltimoresun.com/sports/college/football/rss2.0.xml'), #(u'Lacrosse', u'http://www.baltimoresun.com/sports/college/lacrosse/rss2.0.xml'), #(u'Horse Racing', u'http://www.baltimoresun.com/sports/horse-racing/rss2.0.xml'), #(u'Golf', u'http://www.baltimoresun.com/sports/golf/rss2.0.xml'), #(u'NBA', u'http://www.baltimoresun.com/sports/nba/rss2.0.xml'), #(u'High School', u'http://www.baltimoresun.com/sports/high-school/rss2.0.xml'), #(u'Outdoors', u'http://www.baltimoresun.com/sports/outdoors/rss2.0.xml'), ## Entertainment ## (u'Celebrity News', u'http://www.baltimoresun.com/entertainment/celebrities/rss2.0.xml'), (u'Arts & Theater', u'http://www.baltimoresun.com/entertainment/arts/rss2.0.xml'), (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'), (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'), ## Life ## (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'), (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'), (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'), (u'Parenting', u'http://www.baltimoresun.com/features/parenting/rss2.0.xml'), (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'), (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'), #(u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'), ## Business ## (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'), (u'Technology', u'http://www.baltimoresun.com/business/technology/rss2.0.xml'), (u'Personal finance', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'), 
(u'Jobs', u'http://www.baltimoresun.com/classified/jobs/rss2.0.xml'), (u'DIY', u'http://www.baltimoresun.com/features/do-it-yourself/rss2.0.xml'), (u'Consumer Safety', u'http://www.baltimoresun.com/business/consumer-safety/rss2.0.xml'), (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), ## Opinion## (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'), (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'), (u'Readers Respond', u'http://www.baltimoresun.com/news/opinion/readersrespond/'), ## Columnists ## (u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'), (u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'), (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'), (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'), (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'), (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'), (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'), (u'Thomas F. 
Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'), (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'), ## News Blogs ## (u'Baltimore Crime Beat', u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), (u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'), (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), (u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'), ## Business Blogs ## (u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'), (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), ## Entertainment Blogs ## (u'Clef Notes & Drama Queens', 'http://weblogs.baltimoresun.com/entertainment/classicalmusic/index.xml'), (u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'), (u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'), (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), ## Life Blogs ## (u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), (u'Homefront', u'http://www.baltimoresun.com/features/parenting/homefront/rss2.0.xml'), (u'Picture of Health', 
u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), (u'Unleashed', u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'Game Cache', u'http://www.baltimoresun.com/entertainment/bthesite/game-cache/rss2.0.xml'), (u'TV Lust', u'http://www.baltimoresun.com/entertainment/bthesite/tv-lust/rss2.0.xml'), ## Sports Blogs ## (u'Baltimore Sports Blitz', u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), #(u'Faceoff', u'http://weblogs.baltimoresun.com/sports/lacrosse/blog/index.xml'), #(u'MMA Stomping Grounds', u'http://weblogs.baltimoresun.com/sports/mma/blog/index.xml'), (u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'), (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), #(u'Recruiting Report', u'http://weblogs.baltimoresun.com/sports/college/recruiting/index.xml'), #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'), (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'), #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'), ] def get_article_url(self, article): ans = None try: s = article.summary ans = urllib.unquote( re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) except: pass if ans is None: ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) if ans is not None: return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') if text: a = text.parent url = a.get('href') if url: return self.index_to_soup(url, raw=True) def postprocess_html(self, soup, first_fetch): # Remove the navigation bar. It was kept until now to be able to follow # the links to further pages. But now we don't need them anymore. 
for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}): nav.extract() for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): tag.extract() for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): tag.extract() return soup |
![]() |
![]() |
![]() |
#4 |
Junior Member
![]() ![]() Posts: 7
Karma: 110
Join Date: Dec 2010
Device: Kindle 3, Nexus 7 (2012)
|
Updated to use internal browser due to issues with feedburner/feedproxy
Updated RSS links Code:
from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012 Josh Hall<jwtheiv@gmail.com>' __docformat__ = 'restructuredtext en' import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class BaltimoreSun(BasicNewsRecipe): title = 'The Baltimore Sun' __author__ = 'Kovid Goyal and Sujata Raman, a.peter' __author__ = 'Revised for use with the Baltimore Sun by Josh Hall' description = 'Complete local news and blogs from Baltimore' language = 'en' version = 2.4 oldest_article = 1 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True remove_javascript = True remove_empty_feeds= True recursions = 1 ignore_duplicate_articles = {'title'} keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}), dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}), ] remove_tags_after = [{'class':['photo_article',]}] match_regexps = [r'page=[0-9]+'] remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']}, {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','nextgen-comments-container','nextgen-comments-content','outbrainTools','fb-like' 'google-ad-story-bottom']}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} .copyright 
{font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' feeds = [ ## News ## (u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'), (u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'), (u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'), #(u'Anne Arundel County', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'), (u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'), #(u'Baltimore County', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_county/rss2'), #(u'Carroll County', u'http://feeds.feedburner.com/baltimoresun/news/local/carroll/rss2'), #(u'Harford County', u'http://feeds.feedburner.com/baltimoresun/news/local/harford/rss2), #(u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'), (u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'), #(u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'), (u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'), (u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'), #(u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'), (u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'), #(u'Weird News', u'http://feeds.feedburner.com/baltsun-weirdnews'), ##Sports## (u'Top Sports', 
u'http://feeds.feedburner.com/baltimoresun/sports/rss2'), (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'), (u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'), #(u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'), #(u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'), #(u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'), #(u'Horse Racing', u'http://feeds.feedburner.com/baltimoresun/sports/horseracing/rss2'), #(u'Golf', u'http://feeds.feedburner.com/baltimoresun/sports/golf/rss2'), #(u'NBA', u'http://feeds.feedburner.com/baltimoresun/sports/basketball/rss2'), #(u'High School', u'http://feeds.feedburner.com/baltimoresun/sports/highschool/rss2'), #(u'Outdoors', u'http://feeds.feedburner.com/baltimoresun/sports/outdoors/rss2'), ## Entertainment ## (u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'), (u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'), (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'), (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'), ## Life ## (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'), (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'), (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'), (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'), (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'), #(u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'), ## Business ## (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'), (u'Technology', 
u'http://www.baltimoresun.com/business/technology/rss2.0.xml'), (u'Personal finance', u'http://baltimore.feedsportal.com/c/34255/f/623057/index.rss'), (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'), (u'Jobs', u'http://baltimore.feedsportal.com/c/34255/f/623059/index.rss'), #(u'DIY', u'http://baltimore.feedsportal.com/c/34255/f/623060/index.rss'), #(u'Consumer Safety', u'http://baltimore.feedsportal.com/c/34255/f/623061/index.rss'), (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), ## Opinion## (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'), (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'), (u'Readers Respond', u'http://baltimore.feedsportal.com/c/34255/f/623065/index.rss'), ## Columnists ## (u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'), (u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'), (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'), (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'), (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'), (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'), (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'), (u'Thomas F. 
Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'), (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'), ## News Blogs ## (u'Baltimore Crime Beat', u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), (u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'), (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), (u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'), ## Business Blogs ## (u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'), (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), ## Entertainment Blogs ## (u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'), (u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'), (u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'), (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), ## Life Blogs ## #(u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), (u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), #(u'Unleashed', 
u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'), ## Sports Blogs ## (u'Baltimore Sports Blitz', u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), ## (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'), (u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'), (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'), (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'), #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'), ] def get_article_url(self, article): ans = None try: s = article.summary ans = urllib.unquote( re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) except: pass if ans is None: ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) if ans is not None: return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') if text: a = text.parent url = a.get('href') if url: return self.index_to_soup(url, raw=True) def print_version(self, url): return self.browser.open_novisit(url).geturl() def postprocess_html(self, soup, first_fetch): # Remove the navigation bar. It was kept until now to be able to follow # the links to further pages. But now we don't need them anymore. 
for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}): nav.extract() for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): tag.extract() for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): tag.extract() return soup Last edited by jwiv; 04-28-2013 at 06:16 PM. Reason: Changed recursion and oldest article |
![]() |
![]() |
![]() |
#5 |
Junior Member
![]() ![]() Posts: 7
Karma: 110
Join Date: Dec 2010
Device: Kindle 3, Nexus 7 (2012)
|
Updated to set both the recursion depth (`recursions`) and the article timeframe (`oldest_article`) to 1.
Code:
from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2012 Josh Hall<jwtheiv@gmail.com>' __docformat__ = 'restructuredtext en' import urllib, re from calibre.web.feeds.news import BasicNewsRecipe class BaltimoreSun(BasicNewsRecipe): title = 'The Baltimore Sun' __author__ = 'Kovid Goyal and Sujata Raman, a.peter' __author__ = 'Revised for use with the Baltimore Sun by Josh Hall' description = 'Complete local news and blogs from Baltimore' language = 'en' version = 2.5 oldest_article = 1 max_articles_per_feed = 100 use_embedded_content = False no_stylesheets = True remove_javascript = True remove_empty_feeds= True recursions = 1 ignore_duplicate_articles = {'title'} keep_only_tags = [dict(name='div', attrs={'class':["story","entry-asset asset hentry"]}), dict(name='div', attrs={'id':["pagebody","story","maincontentcontainer"]}), ] remove_tags_after = [{'class':['photo_article',]}] match_regexps = [r'page=[0-9]+'] remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer",'gallery-subcontent','subFooter']}, {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","featurePromo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent",'nextgen-share-tools','nextgen-comments-container','nextgen-comments-content','outbrainTools','fb-like' 'google-ad-story-bottom']}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} .copyright 
{font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' feeds = [ ## News ## (u'Top Headlines', u'http://feeds.feedburner.com/baltimoresun/news/rss2'), (u'Breaking News', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'), (u'Top Maryland', u'http://feeds.feedburner.com/baltimoresun/news/local/rss2'), #(u'Anne Arundel County', u'http://feeds.feedburner.com/baltimoresun/news/local/annearundel/rss2'), (u'Baltimore City', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_city/rss20xml'), #(u'Baltimore County', u'http://feeds.feedburner.com/baltimoresun/news/local/baltimore_county/rss2'), #(u'Carroll County', u'http://feeds.feedburner.com/baltimoresun/news/local/carroll/rss2'), #(u'Harford County', u'http://feeds.feedburner.com/baltimoresun/news/local/harford/rss2), #(u'Howard County', u'http://feeds.feedburner.com/baltimoresun/news/local/howard/rss2'), (u'Education', u'http://feeds.feedburner.com/baltimoresun/news/education/rss2'), #(u'Obituaries', u'http://feeds.feedburner.com/baltimoresun/news/obituaries/rss2'), (u'Local Politics', u'http://feeds.feedburner.com/baltimoresun/news/local/politics/rss2'), (u'Weather', u'http://feeds.feedburner.com/baltimoresun/news/weather/site/rss2'), #(u'Traffic', u'http://feeds.feedburner.com/baltimoresun/news/traffic/rss2'), (u'Nation/world', u'http://feeds.feedburner.com/baltimoresun/news/nationworld/rss2'), #(u'Weird News', u'http://feeds.feedburner.com/baltsun-weirdnews'), ##Sports## (u'Top Sports', 
u'http://feeds.feedburner.com/baltimoresun/sports/rss2'), (u'Orioles/Baseball', u'http://www.baltimoresun.com/sports/orioles/rss2.0.xml'), (u'Ravens/Football', u'http://feeds.feedburner.com/baltimoresun/sports/football/rss2'), #(u'Terps', u''http://feeds.feedburner.com/baltimoresun/sports/terps/rss2'), #(u'College Football', u''feed://feeds.feedburner.com/baltimoresun/sports/college/football/rss2'), #(u'Lacrosse', u'http://feeds.feedburner.com/baltimoresun/sports/college/lacrosse/rss2'), #(u'Horse Racing', u'http://feeds.feedburner.com/baltimoresun/sports/horseracing/rss2'), #(u'Golf', u'http://feeds.feedburner.com/baltimoresun/sports/golf/rss2'), #(u'NBA', u'http://feeds.feedburner.com/baltimoresun/sports/basketball/rss2'), #(u'High School', u'http://feeds.feedburner.com/baltimoresun/sports/highschool/rss2'), #(u'Outdoors', u'http://feeds.feedburner.com/baltimoresun/sports/outdoors/rss2'), ## Entertainment ## (u'Celebrity News', u'http://baltimore.feedsportal.com/c/34255/f/623042/index.rss'), (u'Arts & Theater', u'http://feeds.feedburner.com/baltimoresun/entertainment/galleriesmuseums/rss2'), (u'Movies', u'http://www.baltimoresun.com/entertainment/movies/rss2.0.xml'), (u'Music & Nightlife', u'http://www.baltimoresun.com/entertainment/music/rss2.0.xml'), (u'Restaurants & Food', u'http://www.baltimoresun.com/entertainment/dining/rss2.0.xml'), (u'TV/Media', u'http://www.baltimoresun.com/entertainment/tv/rss2.0.xml'), ## Life ## (u'Health&Wellness', u'http://www.baltimoresun.com/health/rss2.0.xml'), (u'Home & Garden', u'http://www.baltimoresun.com/features/home-garden/rss2.0.xml'), (u'Living Green', u'http://www.baltimoresun.com/features/green/rss2.0.xml'), (u'Fashion', u'http://www.baltimoresun.com/features/fashion/rss2.0.xml'), (u'Travel', u'http://www.baltimoresun.com/travel/rss2.0.xml'), #(u'Faith', u'http://www.baltimoresun.com/features/faith/rss2.0.xml'), ## Business ## (u'Top Business', u'http://www.baltimoresun.com/business/rss2.0.xml'), (u'Technology', 
u'http://www.baltimoresun.com/business/technology/rss2.0.xml'), (u'Personal finance', u'http://baltimore.feedsportal.com/c/34255/f/623057/index.rss'), (u'Real Estate', u'http://www.baltimoresun.com/classified/realestate/rss2.0.xml'), (u'Jobs', u'http://baltimore.feedsportal.com/c/34255/f/623059/index.rss'), #(u'DIY', u'http://baltimore.feedsportal.com/c/34255/f/623060/index.rss'), #(u'Consumer Safety', u'http://baltimore.feedsportal.com/c/34255/f/623061/index.rss'), (u'Investing', u'http://www.baltimoresun.com/business/money/rss2.0.xml'), ## Opinion## (u'Sun Editorials', u'http://www.baltimoresun.com/news/opinion/editorial/rss2.0.xml'), (u'Op/Ed', u'http://www.baltimoresun.com/news/opinion/oped/rss2.0.xml'), (u'Readers Respond', u'http://baltimore.feedsportal.com/c/34255/f/623065/index.rss'), ## Columnists ## (u'Kevin Cowherd', u'http://www.baltimoresun.com/sports/bal-columnist-cowherd,0,6829726.columnist-rss2.0.xml'), (u'Robert Ehrlich', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-ehrlich,0,1825227.columnist-rss2.0.xml'), (u'Jacques Kelly', u'http://www.baltimoresun.com/news/maryland/bal-columnist-kelly,0,1154701.columnist-rss2.0.xml'), (u'Marta H. Mossburg', u'http://www.baltimoresun.com/news/opinion/oped/bal-columnist-mossburg,0,7982155.columnist-rss2.0.xml'), (u'Mike Preston', u'http://www.baltimoresun.com/sports/bal-columnist-preston,0,6169796.columnist-rss2.0.xml'), (u'Susan Reimer', u'http://www.baltimoresun.com/news/opinion/bal-columnist-reimer,0,162466.columnist-rss2.0.xml'), (u'Dan Rodricks', u'http://www.baltimoresun.com/news/maryland/bal-columnist-rodricks,0,7089843.columnist-rss2.0.xml'), (u'Thomas F. 
Schaller', u'http://www.baltimoresun.com/news/opinion/columnists/bal-columnist-schaller,0,897397.columnist-rss2.0.xml'), (u'Peter Schmuck', u'http://www.baltimoresun.com/sports/bal-columnist-schmuck,0,7485088.columnist-rss2.0.xml'), ## News Blogs ## (u'Baltimore Crime Beat', u'http://baltimore.feedsportal.com/c/34255/f/623075/index.rss'), (u'InsideEd', u'http://www.baltimoresun.com/news/maryland/education/blog/rss2.0.xml'), (u'Maryland Politics', u'http://www.baltimoresun.com/news/maryland/politics/blog/rss2.0.xml'), (u'Maryland Weather', u'http://www.baltimoresun.com/news/weather/weather-blog/rss2.0.xml'), (u'Second Opinion', u'http://www.baltimoresun.com/news/opinion/second-opinion-blog/rss2.0.xml'), (u'Sun Investigates', u'http://www.baltimoresun.com/news/maryland/sun-investigates/rss2.0.xml'), (u'You Dont Say', u'http://www.baltimoresun.com/news/language-blog/rss2.0.xml'), ## Business Blogs ## (u'BaltTech', u'http://www.baltimoresun.com/business/technology/blog/rss2.0.xml'), (u'Consuming Interests', u'http://www.baltimoresun.com/business/consuming-interests-blog/rss2.0.xml'), (u'The Real Estate Wonk', u'http://www.baltimoresun.com/business/real-estate/wonk/rss2.0.xml'), ## Entertainment Blogs ## (u'ArtSmash', 'http://www.baltimoresun.com/entertainment/arts/artsmash/rss2.0.xml'), (u'Baltimore Diner', u'http://baltimore.feedsportal.com/c/34255/f/623088/index.rss'), (u'Midnight Sun', u'http://www.baltimoresun.com/entertainment/music/midnight-sun-blog/rss2.0.xml'), (u'Read Street', u'http://www.baltimoresun.com/features/books/read-street/rss2.0.xml'), (u'Z on TV', u'http://www.baltimoresun.com/entertainment/tv/z-on-tv-blog/rss2.0.xml'), ## Life Blogs ## #(u'BMore Green', u'http://weblogs.baltimoresun.com/features/green/index.xml'), (u'Baltimore Insider',u'http://www.baltimoresun.com/features/baltimore-insider-blog/rss2.0.xml'), (u'Picture of Health', u'http://www.baltimoresun.com/health/blog/rss2.0.xml'), #(u'Unleashed', 
u'http://weblogs.baltimoresun.com/features/mutts/blog/index.xml'), ## b the site blogs ## (u'TV Lust', u'http://baltimore.feedsportal.com/c/34255/f/623096/index.rss'), ## Sports Blogs ## (u'Baltimore Sports Blitz', u'http://baltimore.feedsportal.com/c/34255/f/623097/index.rss'), ## (u'Lacrosse Insider',u'http://www.baltimoresun.com/sports/lacrosse-blog/rss2.0.xml'), (u'Orioles Insider', u'http://baltimore.feedsportal.com/c/34255/f/623100/index.rss'), (u'Ravens Insider', u'http://www.baltimoresun.com/sports/ravens/ravens-insider/rss2.0.xml'), #(u'Ring Posts', u'http://weblogs.baltimoresun.com/sports/wrestling/blog/index.xml'), (u'The Schmuck Stops Here', u'http://www.baltimoresun.com/sports/schmuck-blog/rss2.0.xml'), #(u'Tracking the Terps', u'http://weblogs.baltimoresun.com/sports/college/maryland_terps/blog/index.xml'), #(u'Varsity Letters', u'http://weblogs.baltimoresun.com/sports/highschool/varsityletters/index.xml'), ] def get_article_url(self, article): ans = None try: s = article.summary ans = urllib.unquote( re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1)) except: pass if ans is None: ans = article.get('feedburner_origlink', article.get('guid', article.get('link'))) if ans is not None: return ans.replace('?track=rss', '') def skip_ad_pages(self, soup): text = soup.find(text='click here to continue to article') if text: a = text.parent url = a.get('href') if url: return self.index_to_soup(url, raw=True) def print_version(self, url): return self.browser.open_novisit(url).geturl() def postprocess_html(self, soup, first_fetch): # Remove the navigation bar. It was kept until now to be able to follow # the links to further pages. But now we don't need them anymore. 
for nav in soup.findAll(attrs={'class':['toppaginate','article-nav clearfix']}): nav.extract() for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): tag.extract() for tag in soup.findAll('font', dict(attrs={'id':["cr-other-headlines"]})): tag.extract() return soup |
![]() |
![]() |
Advert | |
|
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
The Sun UK recipe updated due to website change | scissors | Recipes | 0 | 07-25-2012 04:48 PM |
The Sun UK Recipe - Google reader NOT required | scissors | Recipes | 0 | 12-28-2011 07:07 AM |
Baltimore Sun recipe? | JerryinOCMD | Recipes | 2 | 01-05-2011 01:54 PM |
Baltimore sun help? | copyrite | Recipes | 2 | 10-31-2010 03:59 PM |
PRS-900 Baltimore Sun? | luqmaninbmore | Sony Reader | 1 | 02-10-2010 05:28 PM |