The following Goodreads metadata plugin was kindly contributed by Giacomo Lacava.
Code:
__license__ = 'GPL v3'
__copyright__ = '2010, Giacomo Lacava <g.lacava@gmail.com>'
# Goodreads developer API key -- this is a placeholder; a real key from
# https://www.goodreads.com/api must be substituted before use.
DEV_KEY = "xxxxxxxxxxxxxxxxx"
# Search endpoint: paginated XML results for a free-text query (%(param)s).
GR_API_SEARCH = "http://www.goodreads.com/search/search?" + "format=xml&key=%(key)s&q=%(param)s&page=%(page)i"
# Book-detail endpoint: full XML metadata for one book id.
GR_API_BOOK_SHOW = "http://www.goodreads.com/book/show/%(id)s?key=%(key)s"
import time
from urllib import quote_plus
from lxml import etree
from calibre import browser
from calibre.customize import Plugin
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.ebooks.metadata import MetaInformation
class GrSource(MetadataSource):
author = 'Giacomo Lacava'
metadata_type = 'basic'
name = 'Goodreads'
description = _('Downloads metadata from Goodreads')
version = (1, 0, 0)
supported_platforms = ['windows', 'osx', 'linux']
def __init__(self,*params):
super(MetadataSource,self).__init__(params)
def fetch(self):
param = ""
if self.isbn is not None: #isbn excludes any other parameter
param = self.isbn
else:
for attr in [self.title,self.book_author]:
if attr is not None:
# strip stuff that GR doesn't (or may not) like
for char in "()[]~@?/\\`&*!£$%*+={}#><":
attr = attr.encode("utf8").replace(char,"")
param = "+".join([quote_plus(attr),param])
if param is not None:
page = 1
results_num = 0
self.results = []
while True: # run until we got all results
url = GR_API_SEARCH % {"key":DEV_KEY,
"param": param,
"page": page
}
print "calling " + url
br = browser()
response_xml = br.open(url).read()
root = etree.fromstring(response_xml)
time.sleep(1) # to avoid throttling from GR
for book in root.findall(".//work"):
title = book.find("./best_book/title").text
authors = [ author.find("./name").text for author in book.findall("./best_book/author")]
print "got " + title + " by " + "/".join(authors)
mi = MetaInformation(title,authors)
# what we should do now is to get descriptions etc
# no point threading it because of GR throttling requests
bookurl = GR_API_BOOK_SHOW % { "key":DEV_KEY,
"id": book.find("./best_book/id").text }
print "Calling " + bookurl
book_xml = br.open(bookurl).read()
xmlroot = etree.fromstring(book_xml)
time.sleep(1) # to avoid throttling from GR
## things we can add here:
# 'author_sort', 'title_sort', 'comments', 'category',
# 'publisher', 'series', 'series_index', 'rating',
# 'isbn', 'tags', 'cover_data', 'application_id', 'guide',
# 'manifest', 'spine', 'toc', 'cover', 'language',
# 'book_producer', 'timestamp', 'lccn', 'lcc', 'ddc',
# 'pubdate', 'rights', 'publication_type', 'uuid'
mi.comments = xmlroot.find(".//book/description").text
mi.publisher = xmlroot.find(".//book/publisher").text
mi.isbn = xmlroot.find(".//book/isbn13").text
if mi.isbn is None: ## let's try with isbn10 then
xmlroot.find(".//book/isbn").text
cover_url = xmlroot.find(".//book/image_url").text
# er, shall I download this cover...? and save it how ...?
print mi
self.results.append(mi)
results_num += int(root.find(".//results-end").text)
total_results = int(root.find(".//total-results").text)
print "fetched %i of %i results" % (results_num, total_results)
# if enough results, get out of loop
if results_num >= total_results:
break
else:
# otherwise, fetch next page
page += 1