MobileRead Forums - View Single Post - Very new to this - please help me parse a local newspaper's RSS

Starson17 · 03-08-2011, 10:17 AM

Quote:

Originally Posted by clintiepoo

I'm about done trying

Try this:

Spoiler:

Code:

#!/usr/bin/env  python


'''
http://www.herald-review.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class DecaturHerald(BasicNewsRecipe):
    """Calibre news recipe for the Decatur, IL "Herald and Review".

    Articles are pulled from the site's RSS search feeds and stripped down to
    the headline, the "updated" span, the lead image, its cutline, and the
    story text.
    """

    title                 = u'Herald and Review'
    __author__            = u'Clint and Starson17'
    description           = u"Decatur, IL Newspaper"
    oldest_article        = 7
    language              = 'en'

    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False

    # NOTE(review): `.img` and `.div` are class selectors, not element
    # selectors, and `.gallery-cutline` is a class while keep_only_tags below
    # matches it by id -- confirm against the site's markup before changing.
    extra_css = '''
                 h1               {text-align:left;}
                 .updated         {font-family:monospace;text-align:left;margin-bottom: 1em;}
                 .img             {text-align:center;}
                 .gallery-cutline {text-align:center;font-size:smaller;font-style:italic}
                 .credit          {text-align:right;margin-bottom:0em;font-size:smaller;}
                 .div             {text-align:left;}
                 '''

    cover_url = 'http://www.herald-review.com/content/tncms/live/global/resources/images/hr_logo.jpg'

    # Only these elements survive; everything else on the page is discarded.
    keep_only_tags = [
                        dict(name='h1'),
                        dict(name='span', attrs={'class':'updated'}),
                        dict(name='img', attrs={'id':'img-holder'}),
                        dict(name='span', attrs={'id':'gallery-cutline'}),
                        dict(name='div', attrs={'id':'blox-story-text'})
                     ]

    # Drop every anchor element from the kept content.
    remove_tags = [
                     dict(name='a')
                  ]

    # Only "Local News" is active; the remaining feeds are kept here,
    # commented out, so they can be enabled individually.
    feeds       = [
                    (u'Local News', u'http://www.herald-review.com/search/?f=rss&c[]=news/local&sd=desc&s=start_time'),
#                    (u'Breaking News', u'http://www.herald-review.com/search/?f=rss&k[]=%23breaking&sd=desc&s=start_time'),
#                    (u'State and Regional ', u'http://www.herald-review.com/search/?f=rss&c[]=news/state-and-regional&sd=desc&s=start_time'),
#                    (u'Crime and courts', u'http://www.herald-review.com/search/?f=rss&c[]=news/local/crime-and-courts&sd=desc&s=start_time'),
#                    (u'Local Business ', u'http://www.herald-review.com/search/?f=rss&c[]=business/local&sd=desc&s=start_time'),
#                    (u'Editorials', u'http://www.herald-review.com/search/?f=rss&c[]=news/opinion/editorial&sd=desc&s=start_time'),
#                    (u'Illini News', u'http://www.herald-review.com/search/?f=rss&q=illini&sd=desc&s=start_time')

                    ]

    def preprocess_html(self, soup):
        """Move each image that directly follows a span into that span.

        For every ``img`` whose immediately preceding sibling is a ``span``
        element, the image is wrapped in a new ``p`` tag and that ``p`` is
        inserted into the span at index 1. Images with no preceding sibling,
        or preceded by anything other than a span (e.g. a text node), are
        left where they are.

        :param soup: parsed article document (BeautifulSoup tree).
        :return: the same ``soup`` object, modified in place.
        """
        # Debugging aid: uncomment to dump the parsed page for each article.
        # print('the soup is: ', soup)
        for img_tag in soup.findAll('img'):
            preceding = img_tag.previousSibling
            # previousSibling is None for a first child and a text node when
            # whitespace/text precedes the image; neither has a usable
            # ``name``, so look it up defensively instead of crashing.
            if getattr(preceding, 'name', None) != 'span':
                continue
            wrapper = Tag(soup, 'p')
            # Inserting the image into the wrapper relocates it out of its
            # original position in the tree.
            wrapper.insert(0, img_tag)
            preceding.insert(1, wrapper)
        return soup

I used previousSibling to find the span tag that preceded the img tag. Since the span tag had useful text (the date) and was still in the soup, I used it as the marker: I wrapped the img tag in a new p tag and then inserted that p tag into the span.
I didn't look closely at your code, but I did see it used "tag" instead of "Tag" (the name is case-sensitive). Note the imports and the debugging print, which you can comment out with "#".