View Single Post
Old 09-24-2010, 08:44 AM   #6
DarkAbsynthe
Member
DarkAbsynthe began at the beginning.
 
Posts: 11
Karma: 10
Join Date: Sep 2010
Device: Kindle
This was kindly written by Giacomo Lacava.

Code:
# Module metadata.
__license__ = 'GPL v3'
__copyright__ = '2010, Giacomo Lacava <g.lacava@gmail.com>'

# Key sent as the ``key`` query parameter of every request below; the value
# here is a placeholder.
DEV_KEY = "xxxxxxxxxxxxxxxxx"

# URL templates, filled in with the ``%`` operator and a mapping.
# Search expects ``key``, ``param`` (the already-quoted query) and ``page``
# (an integer).
GR_API_SEARCH = (
    "http://www.goodreads.com/search/search?"
    "format=xml&key=%(key)s&q=%(param)s&page=%(page)i"
)
# Book lookup expects ``id`` and ``key``.
GR_API_BOOK_SHOW = "http://www.goodreads.com/book/show/%(id)s?key=%(key)s"

import time
from urllib import quote_plus
from lxml import etree

from calibre import browser
from calibre.customize import Plugin
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.ebooks.metadata import MetaInformation

class GrSource(MetadataSource):
    """Metadata source that looks books up on Goodreads.

    ``fetch`` reads ``self.isbn``, ``self.title`` and ``self.book_author``,
    runs a Goodreads search, requests the detail page of every hit and
    stores one ``MetaInformation`` per hit in ``self.results``.
    """

    author = 'Giacomo Lacava'
    metadata_type = 'basic'
    name = 'Goodreads'
    description = _('Downloads metadata from Goodreads')
    version = (1, 0, 0)
    supported_platforms = ['windows', 'osx', 'linux']

    # Characters removed from title/author before querying: stuff that GR
    # doesn't (or may not) like. Kept as a unicode literal with an escape
    # for the pound sign (u"\xa3") so that stripping works on whole
    # characters and does not depend on the source file's encoding.
    _STRIP_CHARS = u"()[]~@?/\\`&*!\xa3$%+={}#><"

    # Seconds to sleep after each request, to avoid throttling from GR.
    _THROTTLE_DELAY = 1

    def __init__(self, *params):
        # super() must name the current class, otherwise MetadataSource's own
        # place in the MRO is skipped; params is unpacked so the parent sees
        # the original positional arguments rather than a single tuple.
        super(GrSource, self).__init__(*params)

    @staticmethod
    def _text(node, path):
        """Return the text of the first element matching *path*, or None.

        None is returned both when no element matches and when the element
        has no text.
        """
        found = node.find(path)
        if found is None:
            return None
        return found.text

    @classmethod
    def _int_text(cls, node, path, default=0):
        """Return the element text at *path* as an int, or *default*."""
        try:
            return int(cls._text(node, path))
        except (TypeError, ValueError):
            return default

    @classmethod
    def _clean_term(cls, value):
        """Return *value* without the unwanted characters, URL-quoted.

        Stripping is done on unicode text and the result is encoded to
        UTF-8 exactly once, right before quoting.
        """
        # ``bytes`` is the plain byte-string type (an alias of ``str`` on
        # Python 2.6+); decode it so the replacements below act on
        # characters, not on individual UTF-8 bytes.
        if isinstance(value, bytes):
            value = value.decode("utf8", "replace")
        for char in cls._STRIP_CHARS:
            value = value.replace(char, u"")
        return quote_plus(value.encode("utf8"))

    def _build_query(self):
        """Return the search query string, or "" when there is nothing to search."""
        if self.isbn:  # isbn excludes any other parameter
            return self.isbn
        terms = []
        # Author first, then title: the order the search terms have always
        # been sent in.
        for value in (self.book_author, self.title):
            if not value:
                continue
            term = self._clean_term(value)
            if term:
                terms.append(term)
        return "+".join(terms)

    def _fetch_book(self, br, work):
        """Build a ``MetaInformation`` for one ``work`` element of a search page.

        *br* is the browser used for the follow-up request that retrieves
        description, publisher and ISBN. Missing XML elements leave the
        corresponding field set to None instead of raising.
        """
        title = self._text(work, "./best_book/title")
        names = [self._text(node, "./name")
                 for node in work.findall("./best_book/author")]
        authors = [name for name in names if name]

        print("got %s by %s" % (title, "/".join(authors)))
        mi = MetaInformation(title, authors)

        book_id = self._text(work, "./best_book/id")
        if book_id is None:
            # Without an id there is no detail page to request.
            return mi

        # what we should do now is to get descriptions etc
        # no point threading it because of GR throttling requests
        bookurl = GR_API_BOOK_SHOW % {"key": DEV_KEY, "id": book_id}
        print("Calling " + bookurl)
        xmlroot = etree.fromstring(br.open(bookurl).read())
        time.sleep(self._THROTTLE_DELAY)

        ## things we can add here:
        # 'author_sort', 'title_sort', 'comments', 'category',
        # 'publisher', 'series', 'series_index', 'rating',
        # 'isbn', 'tags', 'cover_data', 'application_id', 'guide',
        # 'manifest', 'spine', 'toc', 'cover', 'language',
        # 'book_producer', 'timestamp', 'lccn', 'lcc', 'ddc',
        # 'pubdate', 'rights', 'publication_type', 'uuid'
        mi.comments = self._text(xmlroot, ".//book/description")
        mi.publisher = self._text(xmlroot, ".//book/publisher")
        # Prefer isbn13; fall back to isbn10 when it is missing or empty.
        mi.isbn = (self._text(xmlroot, ".//book/isbn13")
                   or self._text(xmlroot, ".//book/isbn"))

        # TODO: the cover URL is available at ".//book/image_url"; it is not
        # downloaded yet.
        return mi

    def fetch(self):
        """Search Goodreads and fill ``self.results``.

        The query is the ISBN when one is set, otherwise the cleaned author
        and title. When neither yields a query no request is made and
        ``self.results`` is left empty. Result pages are requested one after
        another until the reported total has been reached or a page comes
        back without hits.
        """
        self.results = []
        param = self._build_query()
        if not param:
            return

        br = browser()  # one browser is enough for all requests
        page = 1
        while True:  # run until we got all results
            url = GR_API_SEARCH % {"key": DEV_KEY,
                                   "param": param,
                                   "page": page}
            print("calling " + url)
            root = etree.fromstring(br.open(url).read())
            time.sleep(self._THROTTLE_DELAY)

            works = root.findall(".//work")
            for work in works:
                mi = self._fetch_book(br, work)
                print(mi)
                self.results.append(mi)

            # results-end is treated as the running index of the last hit on
            # this page, so it is assigned, not accumulated across pages.
            results_num = self._int_text(root, ".//results-end")
            total_results = self._int_text(root, ".//total-results")
            print("fetched %i of %i results" % (results_num, total_results))

            # Stop when everything has been fetched. An empty page also ends
            # the loop, so a total that is never reached cannot make it spin
            # forever.
            if not works or results_num >= total_results:
                break
            # otherwise, fetch next page
            page += 1
DarkAbsynthe is offline   Reply With Quote