#! /usr/bin/python

import zipfile, Image, os, cStringIO
from xml.dom import minidom


def get_cover(book, opf, opf_dir='OEBPS'):
    """Extract the cover image and return it as a PIL image object.

    There does not appear to be a single standard for marking the cover,
    so two heuristics are tried in order:

    1. A ``reference`` element whose ``type`` attribute is
       ``other.ms-thumbimage-standard`` (this works for files made from
       .lit files using convertlit).
    2. The first ``item`` element whose ``media-type`` starts with
       ``image``.

    Arguments:
        book: open archive object; only ``book.read(member_name)`` is used.
        opf: parsed OPF DOM document (``getElementsByTagName`` is used).
        opf_dir: archive directory the hrefs are resolved against.
            Defaults to 'OEBPS', the value that used to be hard-coded.

    Returns the opened image, or None when no candidate is found or the
    image cannot be read or decoded (a message is printed in the latter
    case).
    """
    # Archive member names always use '/', so join with posixpath; os.path
    # would build the name with '\\' on Windows and the lookup would fail.
    import posixpath

    try:
        cover_href = None

        # getAttribute() returns '' for a missing attribute, so an element
        # without "type"/"media-type" is skipped instead of raising and
        # aborting the whole search.
        for tag in opf.getElementsByTagName('reference'):
            if tag.getAttribute('type') == "other.ms-thumbimage-standard":
                cover_href = tag.getAttribute('href')
                break

        if not cover_href:
            for tag in opf.getElementsByTagName('item'):
                if tag.getAttribute('media-type').startswith('image'):
                    cover_href = tag.getAttribute('href')
                    break

        if not cover_href:
            return None

        cover_path = posixpath.join(opf_dir, cover_href)
        image_data = book.read(cover_path)
        return Image.open(cStringIO.StringIO(image_data))
    except Exception:
        # Best effort: a missing or broken cover must not prevent the rest
        # of the metadata from being read. Unlike a bare "except:", this
        # lets KeyboardInterrupt/SystemExit propagate.
        print("Thumbnail could not be generated.")
        return None

def init(FileInfo):
    """Build and return the metadata class, derived from ``FileInfo``.

    Arguments:
        FileInfo: base class; its ``__init__(self, filename)`` is called
            before any metadata is read.

    Returns the class object (not an instance).
    """
    class PDF(FileInfo):
        """Metadata for a zipped book described by an OPF package file.

        NOTE(review): despite the name, this class reads
        META-INF/container.xml and the OPF file it points to (EPUB-style
        layout), not PDF data. The name is kept because callers receive
        this class from init().

        Attributes assigned by __init__:
            cover: result of get_cover() -- an image object or None.
            title: text of the first dc:title element, if one is found.
            author: text of the first dc:creator element, if one is found.
        """
        def __init__(self, filename=None):
            """Read cover, title and author from the archive at filename.

            Errors from opening the archive or from locating/parsing
            container.xml and the OPF file propagate to the caller. A
            missing title or author is not an error: the attribute is
            simply not assigned here.
            """
            FileInfo.__init__(self, filename)

            # Get and parse the OPF file. Everything that needs the
            # archive happens inside the try so the handle is always
            # released (try/finally also works where ZipFile is not a
            # context manager).
            book = zipfile.ZipFile(filename, 'r')
            try:
                container = minidom.parseString(
                    book.read('META-INF/container.xml'))
                rootfiles = container.getElementsByTagName('rootfile')
                opf_path = rootfiles[0].attributes["full-path"].value
                opf = minidom.parseString(book.read(opf_path))

                self.cover = get_cover(book, opf)
            finally:
                book.close()

            def first_text(*tag_names):
                """Return the text of the first usable tag, or None."""
                for tag_name in tag_names:
                    try:
                        node = opf.getElementsByTagName(tag_name)[0]
                        text = node.firstChild.data
                    except (IndexError, AttributeError):
                        # Tag absent, element empty, or first child is
                        # not a text node: try the next spelling.
                        continue
                    # Non-ASCII characters become XML character references.
                    return text.encode('ascii', 'xmlcharrefreplace').strip('\n')
                return None

            # ConvertLit incorrectly uses capitalized tags, so both
            # spellings are tried.
            title = first_text('dc:title', 'dc:Title')
            if title is not None:
                self.title = title

            author = first_text('dc:creator', 'dc:Creator')
            if author is not None:
                self.author = author
    return PDF


