Here's the recipe. If you could also explain why extra_css doesn't do anything (when postprocess_html is commented out), that'd be awesome.
Thanks!
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class WV(BasicNewsRecipe):
    """Calibre recipe that fetches the current issue of Workers Vanguard.

    The issue index page is scraped for the issue date, the issue number,
    the main article (via its "Printable" link) and the table-of-contents
    articles.  Article pages are downloaded in their printer-friendly
    version, and id-based markup is rewritten to standard HTML tags.
    """

    title = 'Workers Vanguard'
    __author__ = ''
    description = 'Current issue of WV'
    needs_subscription = False
    no_stylesheets = True

    # One rule per literal; adjacent literals are concatenated, so the
    # resulting stylesheet string is a single line with rules separated by
    # one space.
    extra_css = (
        '#wvbody {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 5px; margin-top: 5px; text-align: justify; text-indent: .2in} '
        '#wvbodyfl {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 5px; margin-top: 5px; text-align: justify} '
        '#wvquote {font-size: 10pt; margin-left: 20px; margin-right: 00px; text-align: justify; margin-top: 13px; margin-bottom: 0px} '
        '#wvcite {font-size: 10pt; margin-left: 20px; margin-right: 0px; margin-top: 0px; margin-bottom: 5px} '
        '#wvdatecite {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 13px; margin-top: 13px; text-align: right} '
        '#wvbodyctr {font-size: 11pt; margin-left: 0px; margin-right: 0px; margin-bottom: 13px; margin-top: 13px; text-align: center; text-indent: .2in} '
        '#headline {font-size: 20pt; font-weight: bolder; margin-bottom: 5px; margin-top: 0px; text-align: center} '
        '#kicker {font-size: 16pt; font-weight: bold; margin-bottom: 5px; margin-top: 0px; text-align: center} '
        '#nytimes {font-size: 12pt; font-weight: bold; margin-bottom: 5px; margin-top: 0px; text-align: center} '
        '#subhead {font-size: 11pt; font-weight: bold; text-align: left; margin-bottom: 14px; margin-top: 14px} '
        '#folio {font-size: 9pt} '
        '#smlheadline {font-size: 9pt; font-weight: bold; margin-bottom: 0px; margin-top: 0px} '
        '#smlkicker {font-size: 9pt; font-weight: bold; margin-bottom: 0px; margin-top: 0px} '
        '#smlfolio {font-size: 9pt; margin-bottom: 0px; margin-top: 0px} '
        '#smlarticletype {font-size: 7pt; margin-bottom: 0px; margin-top: 0px}'
    )

    # Page listing the current issue.
    INDEX_URL = 'http://spartacist.org/english/wv/index.html'
    # Prefix prepended to an article path to get its printer-friendly page.
    PRINT_BASE_URL = 'http://www.spartacist.org/print/english/wv/'
    # Matches the query string of a URL (everything from the first '?').
    QUERY_RE = re.compile(r'\?.*')
    # (element id, replacement tag name) pairs used by postprocess_html.
    ID_TO_TAG = (
        ('headline', 'h1'),
        ('kicker', 'h2'),
        ('subhead', 'h3'),
        ('wvquote', 'blockquote'),
        ('wvcite', 'blockquote'),
    )

    def print_version(self, url):
        """Return the printer-friendly URL for the article path *url*."""
        return self.PRINT_BASE_URL + url

    def parse_index(self):
        """Build the article list for the current issue.

        Returns a one-element list holding a ``(feed title, articles)``
        tuple.  The feed title is ``'WV'`` followed by the issue-number
        string found on the index page (empty when none is found); each
        article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys.  ``self.timefmt`` is set to the issue date when the
        page provides one.
        """
        soup = self.index_to_soup(self.INDEX_URL)

        date, issue = self._parse_folio(soup)
        if date:
            self.timefmt = date

        articles = []
        main_article = self._main_article(soup, date)
        if main_article is not None:
            articles.append(main_article)
        articles.extend(self._toc_articles(soup))

        return [('WV' + issue, articles)]

    def _parse_folio(self, soup):
        """Return ``(date, issue_number_string)`` from the 'folio' elements.

        A folio element whose content is a single string is taken as the
        date; otherwise the text following its ``<i>`` child is taken as the
        issue-number string.  Either value is ``''`` when not found.
        """
        date = ''
        issue = ''
        for div in soup.findAll(id='folio'):
            text = div.string
            if text:
                date = text
                self.log('Found date:', date)
                continue
            # A folio without an <i> child carries no issue number; skip it
            # instead of failing on attribute access of None.
            if div.i is None:
                continue
            trailing = div.i.findNextSibling(text=True)
            if trailing:
                issue = trailing
                self.log('Found issue number string:', issue)
        return date, issue

    def _main_article(self, soup, date):
        """Return the article dict for the main article, or None.

        The URL is the last two path components of the link wrapping the
        "Printable" text (query string removed); the title comes from the
        element with id 'headline'.  When several candidates exist the last
        one wins.  None is returned if either piece is missing.
        """
        url = None
        for node in soup.findAll(text=re.compile("Printable")):
            link = node.findParent('a', href=True)
            if not link:
                continue
            parts = self.QUERY_RE.sub('', link['href']).split('/')
            if len(parts) < 2:
                # Not of the expected "<issue>/<page>" shape.
                continue
            url = '/'.join(parts[-2:])

        headline = None
        for div in soup.findAll(id='headline'):
            # tag_to_string also copes with headlines containing nested
            # markup, for which .string would be None.
            headline = self.tag_to_string(div)

        if url is None or headline is None:
            self.log.warn('Main article not found on index page')
            return None

        self.log('Found article', headline, 'at url', url)
        return {'title': headline, 'url': url, 'description': '', 'date': date}

    def _toc_articles(self, soup):
        """Return article dicts for the table of contents on the index page.

        Each element with id 'smlheadline' that contains a link yields one
        article; the URL is the link target with its query string removed.
        """
        found = []
        for div in soup.findAll(id='smlheadline'):
            link = div.find('a', href=True)
            if not link:
                continue
            url = self.QUERY_RE.sub('', link['href'])
            headline = self.tag_to_string(link)
            self.log('Found article', headline, 'at url', url)
            found.append(
                {'title': headline, 'url': url, 'description': '', 'date': ''}
            )
        return found

    def postprocess_html(self, soup, first):
        """Replace id-based styling by tag-based standard styling.

        Every element whose id is listed in ``ID_TO_TAG`` is renamed to the
        corresponding tag.  The modified soup is returned, as the recipe
        framework expects.
        """
        for element_id, tag_name in self.ID_TO_TAG:
            for element in soup.findAll(id=element_id):
                element.name = tag_name
        return soup