I've just started using Calibre and found that it is pretty freakin' awesome!
I'm trying to get the Washington Post Comics recipe to work. If the comic is from Uclick (e.g.
http://www.uclick.com/client/wpc/dt/), the comic strip downloads properly. However, if it is a King Syndicate link (e.g.
http://www.washingtonpost.com/wp-srv...html?name=Zits), the comic strip fails to download.
I've looked at the recipe code. Mind you, my knowledge of Python was zero before today, so I'm struggling a little. From what I can tell, the recipe uses Beautiful Soup to find the select tag for the comic strip's date in the HTML of the comics page on the Washington Post website. It then looks at the name of the select tag to determine how to handle it.
For the Uclick comics, the name of the select tag is "url"; the recipe handles this fine and the comic strip is downloaded. It looks as though the Washington Post has changed the format of the links to the non-Uclick comics since the recipe was written. Instead of something like "http://www.creators.com/featurepages/11_editorialcartoons_mike-luckovich.html?name=lk", it is now something like "http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_zits.html?name=Zits".
This new page has a lot of JavaScript going on, and doing a "view source" reveals nothing of the form elements. However, in Chrome, "Inspect Element" shows that the name of the select tag is "dest", which the recipe should be able to handle. In addition, the values for the options are in the form "July 1, 2011", and the cartoonCandidatesCreatorsCom() method looks like it should be able to handle a date in that format. That, however, is about the limit of my Python skills, and I don't know how to use the debug mode to step through the recipe.
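The closest thing to a debug mode I've found in the User Manual is running the recipe through ebook-convert with the test flags. If I'm reading it right, the command would be something like the one below (the file name is just whatever I saved my copy of the recipe as), but I haven't been able to make much sense of the output:
Code:
ebook-convert wapo_cartoons.recipe output.epub --test -vv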
So, can anyone create a fix for this, or at least provide some guidance?
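For what it's worth, the only concrete idea I've come up with is to parse the option text with strptime instead of splitting it by hand, so that a plain "July 1, 2011" (with or without a weekday in front) would still produce the YYYYMMDD string the recipe compares against. This is just a rough, untested sketch, and the helper name is something I made up:
Code:
# Untested idea: let strptime turn the option text into the YYYYMMDD string
# the recipe compares against, instead of splitting the text by hand.
from datetime import datetime

def option_text_to_datenum(text):
    # Try both 'July 1, 2011' and 'Friday July 1, 2011' (the second format
    # is a guess at what the old creators.com pages used).
    for fmt in ('%B %d, %Y', '%A %B %d, %Y'):
        try:
            return datetime.strptime(text.strip(), fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    return None

print option_text_to_datenum('July 1, 2011')  # prints 20110701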
Thanks -
Rob
BTW, the recipe code is:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date, timedelta


class WaPoCartoonsRecipe(BasicNewsRecipe):
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 2

    title = u'Washington Post Cartoons'
    publisher = u'Washington Post'
    category = u'News, Cartoons'
    description = u'Cartoons from the Washington Post'

    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    feeds = []
    feeds.append((u'Dilbert', u'http://www.uclick.com/client/wpc/dt/'))
    feeds.append((u'Mutts', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_mutts.html?name=Mutts'))
    feeds.append((u'Sally Forth', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_sally_forth.html?name=Sally_Forth'))
    feeds.append((u'Shermans Lagoon', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_shermans_lagoon.html?name=Shermans_Lagoon'))
    feeds.append((u'Zits', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_zits.html?name=Zits'))
    feeds.append((u'Baby Blues', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_baby_blues.html?name=Baby_Blues'))
    feeds.append((u'Barney And Clyde', u'http://www.washingtonpost.com/wp-srv/artsandliving/comics/barney_clyde.html?name=Barney_Clyde'))

    extra_css = '''
        body {font-family: verdana, arial, helvetica, geneva, sans-serif;}
        h1 {font-size: medium; font-weight: bold; margin-bottom: -0.1em; padding: 0em; text-align: left;}
        #name {margin-bottom: 0.2em}
        #copyright {font-size: xx-small; color: #696969; text-align: right; margin-top: 0.2em;}
    '''
    def parse_index(self):
        index = []
        oldestDate = date.today() - timedelta(days=self.oldest_article)
        oldest = oldestDate.strftime('%Y%m%d')
        for feed in self.feeds:
            cartoons = []
            soup = self.index_to_soup(feed[1])

            # Always include the current strip from the feed page itself.
            cartoon = {'title': 'Current', 'date': None, 'url': feed[1], 'description': ''}
            cartoons.append(cartoon)

            # The name of the <select> holding the strip dates decides which
            # handler builds the list of older strips.
            select = soup.find('select', attrs={'name': ['url', 'dest']})
            if select:
                cartoonCandidates = []
                if select['name'] == 'url':
                    cartoonCandidates = self.cartoonCandidatesWaPo(select, oldest)
                else:
                    cartoonCandidates = self.cartoonCandidatesCreatorsCom(select, oldest)

                for cartoon in cartoonCandidates:
                    cartoons.append(cartoon)

            index.append([feed[0], cartoons])

        return index
    def preprocess_html(self, soup):
        freshSoup = self.getFreshSoup(soup)

        div = soup.find('div', attrs={'id': 'name'})
        if div:
            freshSoup.body.append(div)
            comic = soup.find('div', attrs={'id': 'comic_full'})
            img = comic.find('img')
            if '&' in img['src']:
                img['src'], sep, bad = img['src'].rpartition('&')
            freshSoup.body.append(comic)
            freshSoup.body.append(soup.find('div', attrs={'id': 'copyright'}))
        else:
            span = soup.find('span', attrs={'class': 'title'})
            if span:
                del span['class']
                span['id'] = 'name'
                span.name = 'div'
                freshSoup.body.append(span)

            img = soup.find('img', attrs={'class': 'pic_big'})
            if img:
                td = img.parent
                if td.has_key('style'):
                    del td['style']
                td.name = 'div'
                td['id'] = 'comic_full'
                freshSoup.body.append(td)

            td = soup.find('td', attrs={'class': 'copy'})
            if td:
                for a in td.find('a'):
                    a.extract()
                del td['class']
                td['id'] = 'copyright'
                td.name = 'div'
                freshSoup.body.append(td)

        return freshSoup
    def getFreshSoup(self, oldSoup):
        freshSoup = BeautifulSoup('<html><head><title></title></head><body></body></html>')
        if oldSoup.head.title:
            freshSoup.head.title.append(self.tag_to_string(oldSoup.head.title))

        return freshSoup
    def cartoonCandidatesWaPo(self, select, oldest):
        # The option values are URLs whose last three path segments are
        # year/month/day; compare them as a YYYYMMDD string against 'oldest'.
        opts = select.findAll('option')
        for i in range(1, len(opts)):
            url = opts[i]['value'].rstrip('/')
            dateparts = url.split('/')[-3:]
            datenum = str(dateparts[0]) + str(dateparts[1]) + str(dateparts[2])
            if datenum >= oldest:
                yield {'title': self.tag_to_string(opts[i]), 'date': None, 'url': url, 'description': ''}
            else:
                return
    def cartoonCandidatesCreatorsCom(self, select, oldest):
        monthNames = {'January': '01', 'February': '02', 'March': '03', 'April': '04',
                      'May': '05', 'June': '06', 'July': '07', 'August': '08',
                      'September': '09', 'October': '10', 'November': '11', 'December': '12'}

        opts = select.findAll('option')
        for i in range(1, len(opts)):
            if opts[i].has_key('selected'):
                continue

            # Build a YYYYMMDD string from the option text so it can be
            # compared against 'oldest'.
            dateString = self.tag_to_string(opts[i])
            rest, sep, year = dateString.rpartition(', ')
            parts = rest.split(' ')
            day = parts[2].rjust(2, '0')
            month = monthNames[parts[1]]
            datenum = str(year) + month + str(day)
            if datenum >= oldest:
                yield {'title': dateString, 'date': None, 'url': opts[i]['value'], 'description': ''}
            else:
                return
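P.S. In case it is useful, here is a quick standalone script I put together to check whether the "dest" select is even present in the HTML that calibre downloads (given that "view source" shows none of the form elements, I suspect it is not, and that the options are filled in by JavaScript). The URL is just my Zits test page, and it assumes BeautifulSoup 3 is installed outside calibre:
Code:
# Standalone check: is the 'dest'/'url' select present in the raw HTML of a
# King Syndicate page, or is it only added later by JavaScript?
import urllib2
from BeautifulSoup import BeautifulSoup

url = 'http://www.washingtonpost.com/wp-srv/artsandliving/comics/king_zits.html?name=Zits'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)

select = soup.find('select', attrs={'name': ['url', 'dest']})
if select is None:
    print 'No matching <select> in the downloaded HTML'
else:
    for option in select.findAll('option'):
        print option['value'], '->', option.string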