MobileRead Forums - View Single Post - postprocess_html receives html string instead of soup

Rackamouth · 07-08-2013, 09:57 AM

Here's the recipe. If you could also explain why extra_css doesn't do anything (when postprocess_html is commented out), that'd be awesome.
Thanks!

Code:

import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class WV(BasicNewsRecipe):
	"""calibre news recipe for the current issue of Workers Vanguard.

	The issue index is read from spartacist.org (see parse_index) and each
	article is fetched through its print URL (see print_version).
	"""

	# Metadata shown for the generated periodical.
	title       = 'Workers Vanguard'
	__author__  = ''
	description = 'Current issue of WV'
	# BasicNewsRecipe options. NOTE(review): presumably "no login required"
	# and "discard the site's own stylesheets" -- confirm against calibre docs.
	needs_subscription = False
	no_stylesheets = True
	# CSS rules written as id selectors (#wvbody, #headline, #kicker, ...);
	# parse_index and postprocess_html look up elements by some of these ids.
	extra_css = '#wvbody {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 5px; margin-top: 5px; text-align: justify; text-indent: .2in} #wvbodyfl {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 5px; margin-top: 5px; text-align: justify} #wvquote {font-size: 10pt; margin-left: 20px; margin-right: 00px; text-align: justify; margin-top: 13px; margin-bottom: 0px} #wvcite {font-size: 10pt; margin-left: 20px; margin-right: 0px; margin-top: 0px; margin-bottom: 5px} #wvdatecite {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 13px; margin-top: 13px; text-align: right} #wvbodyctr {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 13px; margin-top: 13px; text-align: center; text-indent: .2in} #headline {font-size: 20pt; font-weight: bolder; margin-bottom: 5px; margin-top: 0px; text-align: center} #kicker {font-size: 16pt; font-weight: bold; margin-bottom: 5px; margin-top: 0px; text-align: center} #nytimes {font-size: 12pt; font-weight: bold; margin-bottom: 5px; margin-top: 0px; text-align: center} #subhead {font-size: 11pt; font-weight: bold; text-align: left; margin-bottom: 14px; margin-top: 14px} #folio {font-size: 9pt} #smlheadline {font-size: 9pt; font-weight: bold; margin-bottom: 0px; margin-top: 0px} #smlkicker {font-size: 9pt; font-weight: bold; margin-bottom: 0px; margin-top: 0px} #smlfolio {font-size: 9pt; margin-bottom: 0px; margin-top: 0px} #smlarticletype {font-size: 7pt; margin-bottom: 0px; margin-top: 0px}'
    
    
	def print_version(self, url):
		"""Return the printer-friendly URL for an article.

		url is the article path relative to the WV print directory, as
		produced by parse_index (e.g. '<issue>/<page>.html').
		"""
		# Plain concatenation replaces string.join(), which exists only in
		# Python 2; the resulting string is identical.
		return 'http://www.spartacist.org/print/english/wv/' + url

	def parse_index(self):
		"""Build the article list for the current issue from the WV index page.

		Returns a one-element list holding a (section title, articles) tuple.
		The section title is 'WV' followed by the issue-number text found in
		the page folio; each article is a dict with the keys 'title', 'url',
		'description' and 'date'.

		As a side effect, self.timefmt is set to the issue date text when one
		is found.
		"""
		soup = self.index_to_soup('http://spartacist.org/english/wv/index.html')
		articles = []

		# Defaults, so that a missing element on the page leaves an empty
		# value instead of raising NameError further down.
		date = ''
		issuenostring = ''
		url = None

		# Get issue number and date from the folio elements.
		for div in soup.findAll(id='folio'):
			text = div.string
			if text:
				date = text
				print(' '.join(['Found date: ', date]))
				self.timefmt = date
			elif div.i is not None:
				# The issue number is the text node following the <i> tag.
				sibling = div.i.findNextSibling(text=True)
				if sibling:
					issuenostring = sibling
					print(' '.join(['Found issue number string: ', issuenostring]))

		# Find the print URL of the main article in the index page.
		for node in soup.findAll(text=re.compile("Printable")):
			a = node.findParent('a', href=True)
			if not a:
				continue
			# Drop the query string and keep only '<issue>/<page>'.
			parts = re.sub(r'\?.*', '', a['href']).split('/')
			url = '/'.join(parts[-2:])

		# Find the headline of the main article in the index page.
		for div in soup.findAll(id='headline'):
			if url is None:
				# No printable link was found, so there is nothing to fetch.
				break
			# .string is None when the element has nested tags; fall back to
			# the concatenated text nodes.
			headline = div.string or ''.join(div.findAll(text=True))
			print(' '.join(['Found article ', headline, 'at url', url]))
			articles.append({'title': headline, 'url': url, 'description': '', 'date': date})

		# Find the following articles (table of contents at the right of the
		# index page).
		for div in soup.findAll(id='smlheadline'):
			a = div.find('a', href=True)
			if not a:
				continue
			link = re.sub(r'\?.*', '', a['href'])
			headline = a.string or ''.join(a.findAll(text=True))
			print(' '.join(['Found article', headline, 'at url', link]))
			articles.append({'title': headline, 'url': link, 'description': '', 'date': ''})

		return [('WV' + issuenostring, articles)]
	
	# Replace id-based styling by tag-based standard styling
	def postprocess_html(self, soup, first):
		"""Rename elements that carry the site's styling ids to standard tags.

		Mapping applied in the visible code: headline -> h1, kicker -> h2,
		subhead -> h3, wvquote and wvcite -> blockquote.

		soup must provide findAll(); `first` is not used in the lines shown.

		NOTE(review): no return statement is visible in this excerpt, and the
		method may continue past it. Presumably the calibre hook is expected
		to return the soup -- confirm against the BasicNewsRecipe docs.
		"""
		# Debug output: dump the document that was handed to the hook.
		print soup
		for div in soup.findAll(id='headline'):
			div.name = 'h1'
		for div in soup.findAll(id='kicker'):
			div.name = 'h2'
		for div in soup.findAll(id='subhead'):
			div.name = 'h3'
		# Both the quotation and its citation become blockquotes.
		for div in soup.findAll(id='wvquote'):
			div.name = 'blockquote'
		for div in soup.findAll(id='wvcite'):
			div.name = 'blockquote'