#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

import sys, re, htmlentitydefs
from uuid import uuid4
from smartypants import smartyPants

# Numeric character references that unescape() converts into the actual
# Unicode character; any entity not listed here is left untouched.
ENTITIES_TO_UNESCAPE = [
    '&#8220;',  # left double quotation mark
    '&#8221;',  # right double quotation mark
    '&#8216;',  # left single quotation mark
    '&#8217;',  # right single quotation mark
    '&#8212;',  # em dash
    '&#8230;',  # horizontal ellipsis
]

# When true, main() runs unescape() over the processed markup before the
# file is written back.
UNESCAPE_ENTITIES = True

def unescape(text):
    """Replace whitelisted HTML/XML entities in a string with characters.

    Only entities listed in the module-level ENTITIES_TO_UNESCAPE are
    converted; every other entity or character reference found in the
    text is returned exactly as written.

    Adapted from Fredrik Lundh's recipe at
    http://effbot.org/zone/re-sub.htm#unescape-html
    (2008-01-03: input only unicode characters string).

    @param text The HTML (or XML) source text.
    @return The text with each whitelisted entity replaced by the
            corresponding Unicode character.
    """
    def fixup(m):
        entity = m.group(0)
        if entity not in ENTITIES_TO_UNESCAPE:
            return entity

        if entity[:2] == "&#":
            # Numeric character reference: hexadecimal (&#x...;) or decimal.
            try:
                if entity[:3] == "&#x":
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or not a valid code point: leave as is.
                return entity

        # Named entity.
        name = entity[1:-1]
        if name in ("amp", "gt", "lt"):
            # Re-escape the reserved characters instead of decoding them,
            # so the result never gains a bare '&', '<' or '>'.
            return "&amp;" + name + ";"
        try:
            return unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            # Unknown entity name: leave as is.
            return entity

    return re.sub(r"&#?\w+;", fixup, text)
   
def main(argv=sys.argv):
    """Run smartyPants over an HTML file and rewrite the file in place.

    Steps, in order:
      1. Read the file as bytes and pick an encoding: the one reported by
         BeautifulSoup when that package is importable and reports one,
         otherwise 'utf-8'.
      2. Swap the comment delimiters '<!--' and '-->' for unique
         placeholders so the dash handling below cannot alter them.
      3. Turn every '--' (plus at most one whitespace character on each
         side) into '&#8212;', run smartyPants, then restore the comment
         delimiters.
      4. Rewrite '&#8230;' that follows a word character (optionally
         separated by one whitespace character) as '&hellip;'.
      5. If UNESCAPE_ENTITIES is set, convert the whitelisted entities to
         Unicode characters with unescape().
      6. Write the result back to the same path.

    @param argv Argument vector; argv[1] must be the path of the file.
    @return 1 when the argument count is wrong, otherwise None.
    """
    if len(argv) != 2:
        print("Usage:")
        print("  smartwrapper.py infile")
        return 1

    infile = argv[1]

    with open(infile, 'rb') as source:
        html = source.read()

    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        enc = 'utf-8'
        print('Assumed Original Encoding: %s' % enc)
    else:
        enc = BeautifulSoup(html).originalEncoding
        print('Original Encoding from BeautifulSoup: %s' % enc)
        if not enc:
            # No encoding was reported; fall back to the same default
            # that is used when BeautifulSoup is unavailable.
            enc = 'utf-8'
            print('Assumed Original Encoding: %s' % enc)

    # Hide the comment delimiters behind unique placeholders while the
    # dashes are being rewritten.
    start = 'smartypants-' + str(uuid4())
    stop = 'smartypants-' + str(uuid4())
    html = html.replace('<!--', start)
    html = html.replace('-->', stop)
    # convert double dashes to em-dash
    # removing preceding and trailing spaces (if any).
    html = re.sub(r'\s?--\s?', '&#8212;', html)
    html = smartyPants(html)
    html = html.replace(start, '<!--')
    html = html.replace(stop, '-->')
    html = re.sub(r'(?u)(?<=\w)\s?&#8230;', '&hellip;', html)

    if UNESCAPE_ENTITIES:
        if not isinstance(html, unicode):
            html = unicode(html, enc)
        html = unescape(html)

    # Encode BEFORE opening the file for writing: opening with 'wb'
    # truncates it, so a failure after that point would destroy the input.
    # Byte strings are written unchanged (calling .encode() on them would
    # trigger an implicit ASCII decode).  Characters the target encoding
    # cannot represent are written as numeric character references.
    if isinstance(html, unicode):
        html = html.encode(enc, 'xmlcharrefreplace')

    with open(infile, 'wb') as target:
        target.write(html)

# Script entry point: run main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
