Quote:
Originally Posted by clintiepoo
I'm about done trying 
|
Try this:
Spoiler:
Code:
#!/usr/bin/env python
'''
http://www.herald-review.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class DecaturHerald(BasicNewsRecipe):
    """Calibre news recipe for the Herald & Review (herald-review.com).

    Downloads the RSS feeds listed in ``feeds``, keeps only the tags listed
    in ``keep_only_tags`` from each article page, strips all links, and
    re-parents each article image so it sits inside the preceding ``span``.
    """

    title = u'Herald and Review'
    __author__ = u'Clint and Starson17'
    description = u"Decatur, IL Newspaper"
    oldest_article = 7
    language = 'en'
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False

    # NOTE(review): '.img' and '.div' are CSS *class* selectors; if the intent
    # was to style <img>/<div> elements they should read 'img'/'div' - confirm.
    extra_css = '''
    h1 {text-align:left;}
    .updated {font-family:monospace;text-align:left;margin-bottom: 1em;}
    .img {text-align:center;}
    .gallery-cutline {text-align:center;font-size:smaller;font-style:italic}
    .credit {text-align:right;margin-bottom:0em;font-size:smaller;}
    .div {text-align:left;}
    '''

    cover_url = 'http://www.herald-review.com/content/tncms/live/global/resources/images/hr_logo.jpg'

    # Only these elements of each article page are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='span', attrs={'class':'updated'}),
        dict(name='img', attrs={'id':'img-holder'}),
        dict(name='span', attrs={'id':'gallery-cutline'}),
        dict(name='div', attrs={'id':'blox-story-text'})
    ]

    # Every <a> element is removed from the kept content.
    remove_tags = [
        dict(name='a')
    ]

    # Only 'Local News' is active; uncomment further entries to enable them.
    feeds = [
        (u'Local News', u'http://www.herald-review.com/search/?f=rss&c[]=news/local&sd=desc&s=start_time'),
        # (u'Breaking News', u'http://www.herald-review.com/search/?f=rss&k[]=%23breaking&sd=desc&s=start_time'),
        # (u'State and Regional ', u'http://www.herald-review.com/search/?f=rss&c[]=news/state-and-regional&sd=desc&s=start_time'),
        # (u'Crime and courts', u'http://www.herald-review.com/search/?f=rss&c[]=news/local/crime-and-courts&sd=desc&s=start_time'),
        # (u'Local Business ', u'http://www.herald-review.com/search/?f=rss&c[]=business/local&sd=desc&s=start_time'),
        # (u'Editorials', u'http://www.herald-review.com/search/?f=rss&c[]=news/opinion/editorial&sd=desc&s=start_time'),
        # (u'Illini News', u'http://www.herald-review.com/search/?f=rss&q=illini&sd=desc&s=start_time')
    ]

    def preprocess_html(self, soup):
        """Move each image that directly follows a ``span`` into that span.

        For every ``img`` in *soup* whose previous sibling is a ``span`` tag,
        the image is wrapped in a new ``p`` tag and that ``p`` is inserted
        into the span at index 1. Images whose previous sibling is missing,
        is a text node, or is any other tag are left where they are.

        Returns the (mutated) *soup*.
        """
        # Debugging aid: dumps the whole page; comment out with "#" when done.
        # Written as a single formatted string so it prints the same text
        # under both the Python 2 print statement and the Python 3 function.
        print('the soup is:  %s' % (soup,))

        for img_tag in soup.findAll('img'):
            previous = img_tag.previousSibling
            # previousSibling is None for a first child and may be a text
            # node; neither can be compared via a plain `.name` access, so
            # fall back to None instead of raising AttributeError.
            if getattr(previous, 'name', None) != 'span':
                continue
            wrapper = Tag(soup, 'p')
            # Inserting the img into the wrapper re-parents it out of its
            # current position in the tree.
            wrapper.insert(0, img_tag)
            previous.insert(1, wrapper)
        return soup
I used previousSibling to find the span tag that precedes the img tag. Since the span tag holds useful text (the date) and is still in the soup, I used it as the marker: I wrapped the img tag in a new p tag and then inserted that p tag into the span.
I didn't look closely at your code, but I did see that it used "tag" instead of "Tag" (the class name is capitalized). Note the import of Tag at the top, and the debugging print at the start of preprocess_html, which you can comment out with "#".