#!/usr/bin/env python
import re, subprocess, Image, pyPdf, cStringIO

def get_cover(pdf_file):
    """Extract the cover image of pdf_file and return it as a PIL object.

    Runs the external ``pdftoppm`` tool on the file with the options
    ``-l 1 -gray`` and loads whatever it writes to stdout with
    ``Image.open``.

    Returns the opened image, or None (after printing a short message) when
    the tool cannot be started or its output cannot be opened as an image.
    """
    try:
        pdftoppm = subprocess.Popen(
            ['pdftoppm', '-l', '1', '-gray', pdf_file],
            stdout=subprocess.PIPE, shell=False)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no zombie process or open descriptor is left behind.
        image_data = pdftoppm.communicate()[0]
        return Image.open(cStringIO.StringIO(image_data))
    except Exception:
        # Best effort: a missing thumbnail is not fatal.  Catching Exception
        # (rather than everything) lets KeyboardInterrupt/SystemExit through.
        # The parenthesised form prints the same text as the print statement.
        print("Thumbnail could not be generated.")
        return None

def get_metadata(pdf_file):
    """Return a dict with title, author, date and page count of pdf_file.

    Title and author are taken from the PDF document information.  When
    either is missing, or the title looks like an auto-generated
    "<digits>.pdf" name, the first two non-blank text lines found on the
    first page (and, if needed, the second page) are used as title and
    author respectively.

    The result always has the keys 'title', 'author', 'date' and 'pages'.
    Anything that could not be determined is None; 'date' is never filled
    in.  Title and author are ASCII-encoded with non-ASCII characters
    replaced by XML character references.

    Extraction is best effort: errors raised while opening or parsing the
    file are swallowed and whatever was gathered so far is returned.
    """
    metadata = {'title': None, 'author': None, 'date': None, 'pages': None}
    parsed_title = None
    parsed_author = None

    try:
        pdf_handle = open(pdf_file, "rb")
        try:
            document = pyPdf.PdfFileReader(pdf_handle)

            # getDocumentInfo() returns None for a PDF without an Info
            # dictionary; treat that like empty title/author so the text
            # fallback below still runs.
            info = document.getDocumentInfo()
            title = ""
            author = ""
            if info is not None:
                title = info.title or ""
                author = info.author or ""

            # TOR (and maybe others) set the title to a string of numbers.
            # Force the script to look at the text instead.
            if re.search(r'\d+\.pdf', title):
                title = ""

            metadata["title"] = title.encode('ascii', 'xmlcharrefreplace')
            metadata["author"] = author.encode('ascii', 'xmlcharrefreplace')
            page_count = document.getNumPages()
            metadata["pages"] = page_count

            # If we don't have title or author, try to get them from the
            # file contents.
            if not (metadata["title"] and metadata["author"]):
                # Assume the first non-blank line of text is the title and
                # the second is the author.  Only look at the second page
                # when the first one does not supply both, and never ask
                # for a page the document does not have.
                lines = []
                for page_number in range(min(page_count, 2)):
                    page_text = document.getPage(page_number).extractText()
                    # Compare by value (not identity) so blank lines are
                    # dropped for unicode text too.
                    lines.extend([line for line in page_text.splitlines()
                                  if line.strip()])
                    if len(lines) >= 2:
                        break
                if len(lines) >= 1:
                    parsed_title = lines[0].encode('ascii',
                                                   'xmlcharrefreplace')
                if len(lines) >= 2:
                    parsed_author = lines[1].encode('ascii',
                                                    'xmlcharrefreplace')
        finally:
            # Always release the file handle, even when parsing fails.
            pdf_handle.close()
    except Exception:
        # Deliberately best effort: unreadable or malformed PDFs simply
        # yield less metadata.
        pass

    # Runs on every path so that empty strings never leak out as values.
    metadata["title"] = metadata["title"] or parsed_title or None
    metadata["author"] = metadata["author"] or parsed_author or None
    return metadata


def init(FileInfo):
    """Create and return the PDF class, derived from the given FileInfo.

    FileInfo is the base class supplied by the caller; the returned class
    extends it with a cover image and PDF metadata.
    """
    class PDF(FileInfo):
        """PDF metadata"""
        def __init__(self, filename=None):
            """Assign all properties.

            Reads self.file, which is presumably set by FileInfo.__init__
            from filename (the base class is not visible here).
            """
            FileInfo.__init__(self, filename)
            self.cover = get_cover(self.file)
            metadata = get_metadata(self.file)
            self.title = metadata["title"]
            self.author = metadata["author"]
            self.date = metadata["date"]
            self.pages = metadata["pages"]

        def fullscreen(self, manifest):
            """Add a fullscreen viewer setting to manifest and return it.

            Appends <viewer-settings><modefullscreen>1</modefullscreen>
            </viewer-settings> to the last direct child of manifest whose
            localName is 'package'.  manifest must offer the minidom
            document interface (createElement, createTextNode, childNodes).
            When no such child exists the manifest is returned unchanged
            instead of failing on an unbound variable.
            """
            package = None
            for node in manifest.childNodes:
                if node.localName == 'package':
                    package = node
            if package is None:
                # Nothing to attach the setting to; leave manifest as is.
                return manifest

            view_settings = manifest.createElement("viewer-settings")
            fullscreen = manifest.createElement("modefullscreen")
            fullscreen.appendChild(manifest.createTextNode("1"))
            view_settings.appendChild(fullscreen)
            package.appendChild(view_settings)
            return manifest
    return PDF


