#!/usr/bin/env python
"""Bloglines 2 HTML

This is a command-line application that queries the Bloglines web service
<http://bloglines.com/services/> and downloads unread entries. For more
information and the latest version, visit

    http://fucoder.com/code/bloglines2html/

Required: Python 2.3 <http://python.org/>
Required: feedparser <http://feedparser.org/>

"""
############################################################################
# CONFIGURATION SECTION
# Change the username and password here if you do not wish to pass them to
# the program via command line parameters every time.
############################################################################

# Credentials used when -u / -p are not given on the command line.
default_username = ''
default_password = ''
# Initial template variables; each "-D var=val" option adds to or overrides
# them for one run.
default_variables = {
    'charset':  'utf-8',
    'title':    'Bloglines Feed',
}
# Regular expression (alternatives separated by "|"). <img> tags whose "src"
# matches it are removed from entry summaries.
default_img_excludes = 'ypn-rss.overture.com|www.assoc-amazon.com'

############################################################################
# MAIN CODE
############################################################################

# Module metadata. __version__ is shown by -V, sent as part of the HTTP
# user agent and exposed to templates as the "version" variable; __date__
# is shown by -V as well.
__author__      = 'Scott Yang <http://scott.yang.id.au/>'
__copyright__   = 'Copyright (c) 2004-2006 Scott Yang'
__date__        = '2006-05-25'
__version__     = '0.3'

import os
import re
import sys
import time
import urllib2
import sha

# BeautifulSoup is optional. When it cannot be imported the name is bound to
# None and entry summaries are emitted without the clean-up performed by
# TemplateEntry.transform_entry().
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    BeautifulSoup = None


# feedparser (the excellent Python module by Mark Pilgrim) is a hard
# requirement. If it is missing, remind the user to download and install it
# by printing instructions to standard error, then exit with status 1.
try:
    import feedparser
except ImportError:
    print >> sys.stderr, """\
Error: Cannot import Python module "feedparser".  Please download and install
this module from the following website:

    http://feedparser.sourceforge.net/
"""
    sys.exit(1)

# Configuring logger.
#
# The application logs through its own "bloglines2html" logger, which writes
# "LEVEL: message" lines to standard error and does not propagate records to
# the root logger.
import logging
logging.basicConfig()
logger = logging.getLogger('bloglines2html')
logger.setLevel(logging.INFO)
# Iterate over a copy: removeHandler() mutates logger.handlers, and removing
# from the list being iterated would skip every other handler.
for _handler in logger.handlers[:]:
    logger.removeHandler(_handler)
_handler = logging.StreamHandler(sys.stderr)
_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
logger.addHandler(_handler)
logger.propagate = False
# Keep the module namespace clean; only "logger" is meant to be used below.
del _handler
del logging


class ApplicationError(Exception):
    """Error whose message is meant to be shown to the user as-is.

    The command line front end logs the message of this exception instead
    of a traceback.
    """


class Template:
    """Minimal engine that expands ``<blines:NAME ...>`` tags in a text.

    Every tag found by parse() is dispatched to a method called
    ``tag_NAME(attr, data)`` on the instance; subclasses provide those
    methods. A tag without a matching method expands to an empty string.
    """

    # Matches <blines:NAME attrs/> and <blines:NAME attrs>body</blines:NAME>.
    # Group 1 is the tag name, group 2 the raw attribute text and group 3 the
    # body (None for the self-closing form).
    re_parse = re.compile(r'<blines:(\S+)\b(.*?)(?:/>|>(.*?)</blines:\1>)',
        re.U | re.S)

    # Matches one name="value" pair plus the whitespace that follows it.
    # The pattern is deliberately not anchored with "^": parse_attr() applies
    # it with match(text, pos), and "^" only matches at the real start of the
    # string, so an anchored pattern can never match a second attribute.
    re_parse_attr = re.compile(r'([a-z]+)\s*=\s*"([^"]*)"(\s+|$)', re.U|re.I)

    def __init__(self):
        pass

    def parse(self, text):
        """Return *text* with every <blines:...> tag replaced by its output."""
        return self.re_parse.sub(self.process_tag, text)

    def parse_attr(self, text):
        """Parse ``name="value"`` pairs into a dictionary.

        Parsing stops at the first piece of text that is not a well-formed
        pair; the pairs read up to that point are returned.
        """
        text = text.strip()
        attr = {}
        pos = 0
        while pos < len(text):
            match = self.re_parse_attr.match(text, pos)
            if not match:
                break
            attr[match.group(1)] = match.group(2)
            pos = match.end()

        return attr

    def process_tag(self, match):
        """Expand one tag; used as the substitution callback of parse().

        Unicode results are encoded as UTF-8 byte strings before they are
        returned.
        """
        tag  = match.group(1)
        attr = self.parse_attr(match.group(2))
        data = match.group(3) or u''

        try:
            handler = getattr(self, 'tag_%s' % tag)
        except AttributeError:
            # Unknown tags are dropped from the output.
            return ''
        else:
            result = handler(attr, data)
            if isinstance(result, unicode):
                result = result.encode('utf-8')
            return result


class TemplateCommon(Template):
    """Tags shared by the index template and the entry template.

    Variable lookups are delegated to the application object passed to the
    constructor, which must provide ``getvalue(name, format)``.
    """

    def __init__(self, blines):
        Template.__init__(self)
        self.blines = blines

    def getvalue(self, name, format):
        """Look up variable *name* through the application object."""
        return self.blines.getvalue(name, format)

    def tag_if_not_var(self, attr, data):
        # <blines:if_not_var name="foo"> ... </blines:if_not_var>
        # The body is rendered only when the variable is empty.
        if self.getvalue(attr.get('name', ''), attr.get('format', '')):
            return ''
        return self.parse(data)

    def tag_if_var(self, attr, data):
        # <blines:if_var name="foo"> ... </blines:if_var>
        # The body is rendered only when the variable is not empty.
        if self.getvalue(attr.get('name', ''), attr.get('format')):
            return self.parse(data)
        return ''

    def tag_now(self, attr, dummy):
        # <blines:now format="..." />
        # The current local time, in a default layout unless "format" is set.
        return time.strftime(attr.get('format') or '%d %b %Y %I:%M %P')

    def tag_var(self, attr, dummy):
        # <blines:var name="foo" />
        value = self.getvalue(attr.get('name', ''), attr.get('format'))
        if not isinstance(value, unicode):
            return value
        return value.encode('utf-8')


class TemplateEntry(TemplateCommon):
    """Template for pages that list the entries of one or more feeds.

    On top of the common tags it provides the <blines:feeds> and
    <blines:entries> loops, and resolves ``feed_*`` / ``entry_*`` variables
    against the feed and the entry currently being rendered.
    """

    def __init__(self, blines, feed=None):
        """Create the template.

        blines -- the application object.
        feed   -- when given, the page is rendered for this single feed and
                  <blines:feeds> does not walk the subscription list.
        """
        TemplateCommon.__init__(self, blines)
        self.feed = feed
        self.feed_page = feed is not None
        self.entry = None

    def getvalue(self, name, format):
        """Resolve a template variable.

        ``feed_NAME`` is read from the current feed and ``entry_NAME`` from
        the current entry; a warning is logged and '' is returned when there
        is no current feed/entry. Any other name is looked up through the
        application object.
        """
        val = ''
        if name.startswith('feed_'):
            if self.feed is not None:
                val = self.feed.get(name[5:], '')
            else:
                logger.warn('<blines:var name="%s"/> outside <blines:feeds/>',
                    name)
        elif name.startswith('entry_'):
            if self.entry is not None:
                val = self._entry_value(name[6:], format)
            else:
                logger.warn('<blines:var name="%s"/> outside <blines:entries/>',
                    name)
        else:
            val = TemplateCommon.getvalue(self, name, format)

        return val

    def _entry_value(self, name, format):
        # Value of attribute *name* (without the "entry_" prefix) of the
        # current entry; '' for names that are not supported.
        val = ''
        if name in ('author', 'id', 'link', 'summary', 'title'):
            val = self.entry.get(name, '')
            if name == 'id' and self.entry.get('link'):
                # Entries that have a link are identified by a digest of it.
                val = self._hexdigest(self.entry.get('link'))
            if name == 'summary' and BeautifulSoup:
                val = self.transform_entry(val)
        elif name in ('created', 'issued', 'modified'):
            val = self.entry.get(name + '_parsed', '')
            if val:
                if not format:
                    format = '%d %b %Y %I:%M %P'
                val = time.strftime(format, val)
        elif name == 'category':
            # We will only support one category per entry.
            categories = self.entry.get('category')
            if categories:
                if isinstance(categories, basestring):
                    val = categories
                elif isinstance(categories, list):
                    categories = [category for category in categories
                        if category]
                    if len(categories) > 0:
                        val = categories[0]
            else:
                val = 'Default'
        return val

    def _hexdigest(self, text):
        # SHA-1 hex digest of *text*. Unicode is encoded as UTF-8 first:
        # sha.sha() would otherwise encode it as ASCII implicitly and fail
        # on any non-ASCII character. ASCII input hashes as before.
        if isinstance(text, unicode):
            text = text.encode('utf-8')
        return sha.sha(text).hexdigest()

    def tag_entries(self, dummy, data):
        # Handle <blines:entries> ... </blines:entries>
        # Feed does not exist, or 'unread' equals to 0 - there is no point of
        # fetching the RSS feed as we would get a 304 anyway.
        if not self.feed or self.feed.get('BloglinesUnread', '0') == '0':
            return ''

        items = self.blines.getitems(self.feed)
        if not items:
            return ''

        result = []
        try:
            for entry in items.entries:
                self.entry = entry
                result.append(self.parse(data))
        finally:
            # Never leave a stale entry behind, even if rendering failed.
            self.entry = None

        return ''.join(result)

    def tag_feeds(self, attr, data):
        # Handle <blines:feeds> ... </blines:feeds>
        if self.feed_page:
            # Single-feed page: the body is rendered once for that feed.
            return self.parse(data)
        else:
            result = []
            try:
                for feed in self.blines.listsubs(attr.get('order', '')):
                    self.feed = feed
                    self.entry = None
                    result.append(self.parse(data))
            finally:
                self.feed = None

            return ''.join(result)

    def transform_entry(self, summary):
        """Clean up the HTML of an entry summary and return it as a string.

        Only called when BeautifulSoup is available.
        """
        entry = BeautifulSoup(summary)

        # remove FeedBurner feed flares
        for tag in entry.findAll("div", "feedflare"):
            tag.extract()
        for tag in entry.findAll("img",
                src=re.compile(r'feeds\.feedburner\.com')):
            tag.extract()

        # Exclude images using blacklist
        for tag in entry.findAll("img", src=re.compile(default_img_excludes)):
            tag.extract()

        # replace IMG with local hrefs
        for tag in entry.findAll('img', src=True):
            self.image_replace(tag, 'src')

        # remove HREFs to remote images
        for tag in entry.findAll('a',
                href=re.compile(r'\.(png|jpg|gif)$', re.I)):
            del(tag['href'])

        # make all text justified
        for tag in entry.findAll(['p', 'div', 'li'], align=None):
            tag['align'] = 'justify'

        return str(entry)

    def image_replace(self, tag, attr):
        """Download the image referenced by tag[attr] and point the tag at
        the local copy.

        The copy is stored in the output directory under the SHA-1 digest of
        the URL. Downloading is best effort: when the image cannot be
        fetched the tag is left untouched.
        """
        url = tag[attr]
        hashname = self._hexdigest(url)
        if hashname not in self.blines.img_cache:
            outimage = os.path.join(self.blines.output, hashname)
            if not os.path.exists(outimage):
                logger.info("  [IMG] %s", url)
                try:
                    img_data = urllib2.urlopen(url).read()
                except (urllib2.URLError, ValueError):
                    # URLError covers HTTP errors and unreachable hosts;
                    # urlopen() raises ValueError for URLs it cannot
                    # interpret (e.g. relative ones).
                    logger.warn('Cannot download image "%s"', url)
                    return
                # Image data is binary: text mode may corrupt it.
                f = open(outimage, 'wb')
                try:
                    f.write(img_data)
                finally:
                    f.close()
            self.blines.img_cache.append(hashname)
        tag[attr] = hashname




class TemplateIndex(TemplateCommon):
    """Template for the index page of the multi-file output.

    It walks the subscription list with <blines:feeds> and, for every feed,
    can emit a link to that feed's own page with <blines:feed_link/>.

    <blines:if_var>, <blines:if_not_var> and <blines:now> are inherited from
    TemplateCommon; they resolve variables through getvalue() below.
    """

    def __init__(self, blines):
        TemplateCommon.__init__(self, blines)
        self.feed = None

    def getvalue(self, name, format):
        """Resolve a template variable.

        ``feed_NAME`` is read from the feed currently being rendered (a
        warning is logged and '' is returned outside <blines:feeds>); any
        other name is looked up through the application object.
        """
        # Handle <blines:var name="foo" />
        if name.startswith('feed_'):
            if self.feed is not None:
                name = name[5:]
                return self.feed.get(name, '')
            else:
                logger.warn('<blines:var name="%s"/> outside <blines:feeds/>',
                    name)
        else:
            return TemplateCommon.getvalue(self, name, format)

        return ''

    def tag_feeds(self, attr, data):
        # Handle <blines:feeds> ... </blines:feeds>
        # The body is rendered once per subscribed feed, in the order given
        # by the optional "order" attribute.
        result = []
        try:
            for feed in self.blines.listsubs(attr.get('order', '')):
                self.feed = feed
                result.append(self.parse(data))
        finally:
            # Never leave a stale feed behind, even if rendering failed.
            self.feed = None

        return ''.join(result)

    def tag_feed_link(self, dummy, data):
        # Handle <blines:feed_link />
        # Asking the application for the link also makes it write the page
        # of the current feed; the returned value is that page's file name.
        if self.feed:
            return self.blines.getfeedlink(self.feed)
        else:
            logger.warn('<blines:feed_link/> outside <blines:feeds/>')
            return ''


class Bloglines2HTML:
    """Application object: renders Bloglines subscriptions into HTML.

    It owns the output location, the two template texts and the template
    variables, and caches what it obtains from the web-service wrapper so
    that the subscription list and every feed are requested at most once.
    """

    def __init__(self, bws):
        """Set up the defaults.

        bws -- web-service wrapper providing listsubs() and getitems().
        """
        self.bws = bws
        self.var = default_variables.copy()
        self.var['version'] = __version__

        self.output = 'blogs'
        self.tpl_entry = DEFAULT_ENTRY_TEMPLATE
        self.tpl_index = DEFAULT_INDEX_TEMPLATE
        self.multi_file = True
        self.subs_cache  = None     # subscription list, fetched on demand
        self.feed_cache  = {}       # BloglinesSubId -> downloaded feed
        self.img_cache   = []       # names of images already stored locally

    def execute(self):
        """Generate the output in multi-file or single-file mode."""
        if self.multi_file:
            self.execute_multi()
        else:
            self.execute_single()

    def execute_multi(self):
        """Write index.html plus one page per feed into the output directory.

        The directory is created when it does not exist.

        Raises ApplicationError when the output is not a path, or is a path
        to something other than a directory.
        """
        if not isinstance(self.output, str):
            raise ApplicationError(
                'Output has not been defined in multi-file mode')
        if os.path.exists(self.output):
            if not os.path.isdir(self.output):
                raise ApplicationError(
                    'Output "%s" is not a directory' % self.output)
        else:
            os.makedirs(self.output)

        # Render first (this also writes the per-feed pages), so that a
        # failure does not leave an empty index.html behind.
        tpl = TemplateIndex(self)
        self._write_page('index.html', tpl.parse(self.tpl_index))

    def execute_single(self):
        """Render all feeds with the entry template into a single output.

        The output may be a file name or an object with write() and close();
        it is closed afterwards in both cases.
        """
        if isinstance(self.output, str):
            self.output = open(self.output, 'w')
        try:
            tpl = TemplateEntry(self)
            self.output.write(tpl.parse(self.tpl_entry))
        finally:
            self.output.close()

    def getfeedlink(self, feed):
        """Write the page of *feed* and return its file name."""
        tpl = TemplateEntry(self, feed)
        outname = 'feed%s.html' % feed['BloglinesSubId']
        self._write_page(outname, tpl.parse(self.tpl_entry))
        return outname

    def getfile(self, filename):
        """Open a writable file in the multi-file output directory."""
        return open(os.path.join(self.output, filename), 'w')

    def _write_page(self, filename, content):
        # Store *content* as *filename* in the output directory, closing the
        # file even when writing fails.
        outfile = self.getfile(filename)
        try:
            outfile.write(content)
        finally:
            outfile.close()

    def getitems(self, feed):
        """Return the downloaded items of *feed*, or None on failure.

        Downloads are cached per BloglinesSubId. A result whose status is
        not 200, or that has no status at all, counts as a failure and is
        logged.
        """
        subid = feed.get('BloglinesSubId')
        title = feed.get('title')
        if subid not in self.feed_cache:
            logger.info('Downloading feed for "%s"', title)
            self.feed_cache[subid] = self.bws.getitems(subid)

        items = self.feed_cache[subid]
        try:
            if items.status != 200:
                logger.warn('Download feed for "%s" failed - Status %d', title,
                    items.status)
                return None
        except AttributeError:
            # "status" not available?!
            logger.error('Download feed for "%s" failed - Status unknown',
                title)
            return None

        logger.debug('Downloaded feed for "%s" - %d entries', title,
            len(items.entries))

        return items

    def getvalue(self, name, dummy):
        """Return template variable *name*, or '' when it is not defined."""
        return self.var.get(name, '')

    def listsubs(self, order=None):
        """Return the list of subscribed feeds.

        order -- 'folder', 'title' or 'unread', optionally prefixed with '-'
                 to reverse the result. Anything else leaves the list in the
                 order it was received.
        """
        # "is None" rather than a truth test, so that an empty subscription
        # list is cached as well.
        if self.subs_cache is None:
            logger.info('Downloading subscription list')
            self.subs_cache = self.bws.listsubs()

        # Work on a copy: sorting the cached list in place would reorder a
        # list that a caller may still be iterating over.
        feeds = list(self.subs_cache)
        logger.debug('Downloaded subscription list - %d feeds', len(feeds))

        # Doing some re-ordering.
        if order:
            if order.startswith('-'):
                order = order[1:]
                reverse = True
            else:
                reverse = False

            order = order.lower()
            if order == 'folder':
                sorter = lambda x, y: cmp(x['folder'] + x['title'],
                    y['folder'] + y['title'])
            elif order == 'title':
                sorter = lambda x, y: cmp(x['title'], y['title'])
            elif order == 'unread':
                sorter = lambda x, y: cmp(int(x['BloglinesUnread']),
                    int(y['BloglinesUnread']))
            else:
                sorter = None

            if sorter:
                feeds.sort(sorter)
                if reverse:
                    feeds.reverse()

        return feeds


class BloglinesWS:
    """A class that wraps around the Bloglines web service.

    PyBloglines <http://www.josephson.org/projects/pybloglines/> was the
    starting point for this class. Currently it can only do listsubs and
    getitems.

    """

    # Hostname of the Bloglines Web Service.
    hostname = 'rpc.bloglines.com'

    # Username used in authentication. By default it is 'default_username'
    # (an empty string unless configured, in which case listsubs() and
    # getitems() raise ApplicationError) but it can be changed using the
    # '-u [username]' command line option.
    username = default_username

    # Password used in authentication. By default it is 'default_password'
    # and can be changed using the '-p [password]' command line option.
    password = default_password

    # By default it would list feeds from all folders. It can be changed using
    # '-f [folder]' flag so that it would only list feeds inside a specific
    # folder.
    folder = None

    # Whether to list feeds that have already been read, i.e. BloglinesUnread
    # attribute equals to 0.
    list_read = False

    # Mark the entries on Bloglines as read.
    mark_as_read = False

    def __init__(self):
        # Filtered subscription list, filled in by the first listsubs() call.
        self.feeds = None

    def filter_feed(self, feed):
        """Return True when *feed* should be listed.

        Feeds without unread entries are dropped unless 'list_read' is set;
        when 'folder' is set, only feeds in that folder (compared without
        regard to case) are kept.
        """
        if not self.list_read and feed.get('BloglinesUnread', '0') == '0':
            return False

        if self.folder and \
                self.folder.lower() != feed.get('folder', '').lower():
            return False

        return True

    def getitems(self, bloglines_subid):
        """Download the items of one subscription and return the result of
        feedparser.parse().

        Raises ApplicationError when no username has been configured.
        """
        import urllib

        if not self.username:
            raise ApplicationError('Username is not given.')

        # Before version 4 the credentials are supplied through a urllib2
        # handler; from version 4 on they are embedded in the URL.
        if feedparser.__version__ < '4':
            url = 'http://%s/getitems' % self.hostname
            auth = urllib2.HTTPBasicAuthHandler()
            auth.add_password('Bloglines RPC', self.hostname, self.username,
                self.password)
            handlers = [auth]
        else:
            url = 'http://%s:%s@%s/getitems' % (urllib.quote(self.username),
                urllib.quote(self.password), self.hostname)
            handlers = []

        # 's' selects the subscription; 'n=1' asks Bloglines to mark the
        # returned entries as read.
        par = [('s', str(bloglines_subid))]
        if self.mark_as_read:
            par.append(('n', '1'))
        par = urllib.urlencode(par)

        return feedparser.parse(url+'?'+par,
            agent='bloglines2html/%s' % __version__, handlers=handlers)

    def listsubs(self):
        """Return the subscribed feeds that pass filter_feed().

        The list is downloaded once and cached on the instance.

        Raises ApplicationError when no username has been configured or when
        the server rejects the credentials (HTTP 401); other HTTP errors are
        propagated unchanged.
        """
        import base64

        if self.feeds is not None:
            return self.feeds

        if not self.username:
            raise ApplicationError('Username is not given.')

        url = 'http://%s/listsubs' % self.hostname
        # encodestring() appends a newline and wraps long output over several
        # lines; neither may appear inside an HTTP header value.
        b64 = base64.encodestring('%s:%s' % (self.username, self.password))
        b64 = b64.replace('\n', '')
        req = urllib2.Request(url)
        req.add_header('Authorization', 'Basic %s' % b64)

        try:
            data = urllib2.urlopen(req)
        except urllib2.HTTPError:
            # The exception is fetched through sys.exc_info() so that the
            # handler does not depend on version-specific "except" syntax.
            if sys.exc_info()[1].code == 401:
                raise ApplicationError(
                    'Username and password cannot be authenticated.')
            raise

        try:
            feeds = OpmlParser().parse(data)
        finally:
            data.close()

        self.feeds = [feed for feed in feeds if self.filter_feed(feed)]
        return self.feeds


class OpmlParser:
    """A simple OPML parser that probably only applies to Bloglines.

    It returns the list of feeds found in the output of Bloglines' listsubs
    function and works out the folder each feed belongs to. Feeds are Python
    dictionaries holding the attributes of the feed's <outline> element,
    encoded as UTF-8, plus a 'folder' entry.

    """
    def __init__(self):
        self.feeds = []
        # Titles of the <outline> elements currently open, outermost first.
        self.folder = []

    def parse(self, data):
        """Parse *data* and return the list of feeds.

        data -- a string, or an object with a read() method.

        Raises TypeError for any other kind of argument.
        """
        from xml.parsers.expat import ParserCreate

        parser = ParserCreate()
        parser.StartElementHandler = self.start_element
        parser.EndElementHandler = self.end_element
        if isinstance(data, basestring):
            # The whole document is passed at once, so this call is final.
            parser.Parse(data, 1)
        elif hasattr(data, 'read'):
            parser.ParseFile(data)
        else:
            raise TypeError('"data" must be a string or a readable object')

        return self.feeds

    def start_element(self, name, attrs):
        """Expat callback for the start of an element."""
        if name != 'outline':
            return

        if 'xmlUrl' in attrs:
            # An outline with an xmlUrl is a feed. Build a new dictionary
            # instead of adding keys to one that is being iterated over.
            feed = {}
            for key, val in attrs.items():
                feed[key.encode('utf-8')] = val.encode('utf-8')

            if self.folder:
                feed['folder'] = self.folder[-1]
            else:
                # A feed that is not nested in any outline has no folder;
                # file it under the name used for the top level.
                feed['folder'] = u'Top Level'.encode('utf-8')
            self.feeds.append(feed)
        elif attrs.get('title') == 'Subscriptions':
            # The top level of OPML is called "Subscription", but called
            # "Top Level" in Bloglines. Hack here for this exception.
            attrs['title'] = 'Top Level'

        # Every outline is pushed, feeds included, so that end_element() can
        # pop unconditionally.
        self.folder.append(attrs.get('title', u'Unknown').encode('utf-8'))

    def end_element(self, name):
        """Expat callback for the end of an element."""
        if name == 'outline':
            self.folder.pop()


def command_line(args):
    """Entry point if it runs from command line.

    It uses getopt to parse the command line parameters, initialises the
    application object accordingly and runs it.

    args -- the command line arguments, without the program name.

    The process exits with status 1 when the arguments cannot be parsed or
    when generating the output fails, and with status 0 after -h or -V.

    """

    import getopt

    def read_file(path):
        # Return the content of *path*, closing the file in every case.
        stream = open(path, 'r')
        try:
            return stream.read()
        finally:
            stream.close()

    try:
        opts, args = getopt.getopt(args, 'D:f:hi:mMo:p:rt:u:vV')
    except getopt.GetoptError:
        logger.error('Unable to parse command line argument: %s',
            sys.exc_info()[1])
        command_line_help()
        sys.exit(1)

    bws = BloglinesWS()
    blines = Bloglines2HTML(bws)

    for opt, arg in opts:
        if opt == '-D':
            match = re.match('([^=]+)=(.*)', arg)
            if match:
                blines.var[match.group(1)] = match.group(2)
            else:
                logger.warn('Ignoring "-D %s": expected var=val.', arg)
        elif opt == '-f':
            bws.folder = arg
        elif opt == '-h':
            command_line_help()
            sys.exit(0)
        elif opt == '-i':
            blines.tpl_index = read_file(arg)
        elif opt == '-M':
            blines.multi_file = True
        elif opt == '-m':
            bws.mark_as_read = True
        elif opt == '-o':
            blines.output = arg
        elif opt == '-p':
            bws.password = arg
        elif opt == '-r':
            bws.list_read = True
        elif opt == '-t':
            blines.tpl_entry = read_file(arg)
        elif opt == '-u':
            bws.username = arg
        elif opt == '-v':
            # Stop at 10 (DEBUG). Level 0 means "not set": the logger would
            # then fall back to the level of the root logger and become
            # quieter instead of more verbose.
            if logger.level > 10:
                logger.setLevel(logger.level - 10)
        elif opt == '-V':
            print >> sys.stderr, 'Version %s (%s)' % \
                (__version__, __date__)
            sys.exit(0)
        else:
            logger.warn('Option "%s" is not handled.', opt)

    try:
        blines.execute()
    except ApplicationError:
        # Expected failures: show the message only, without a traceback.
        logger.error(str(sys.exc_info()[1]))
        sys.exit(1)
    except Exception:
        logger.exception('Uncaught error on executing bloglines2html')
        sys.exit(1)


def command_line_help():
    """Write the usage summary to standard error."""
    program = os.path.basename(sys.argv[0])
    sys.stderr.write(CMDLINE_HELP % program + '\n')


# Usage summary printed by command_line_help(). The single %s is replaced
# with the name of the program.
CMDLINE_HELP = """\
Usage:
    %s [options]

Options:
    -D var=val  Define a template variable with the given value.
    -f folder   List only feeds inside this folder.
    -h          Display this help message.
    -i template Index template file to be used
    -m          Mark the feeds as read.
    -M          Multi-file output (this is already the default).
    -o dir      Write the generated pages into this directory
                (default: "blogs").
    -p password Password to log into Bloglines.
    -r          List feeds that have already been read.
    -t template Feed template file to be used
    -u username Username to log into Bloglines. Usually your email address.
    -v          Turn up verbose level (info is the default).
                -v = debug.
    -V          Show version information.
"""

# Index page template used unless another one is loaded with -i. It lists
# every feed, ordered by folder, with a link to the feed's own page and the
# feed's BloglinesUnread value.
DEFAULT_INDEX_TEMPLATE = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=<blines:var name="charset" />" />
    <meta NAME="title" CONTENT="Feeds <blines:now format="%Y-%m-%d %H:%M" />" />
    <META NAME="genre" CONTENT="Feeds" />
    <title>Feeds <blines:now format="%Y-%m-%d %H:%M" /></title>
  </head>
  <body>
    <div id="container">
      <div id="banner"><h1>Feeds <blines:now format="%Y-%m-%d %H:%M" /></h1></div>
      <div id="content">
        <ul>
          <blines:feeds order="folder">
          <li><blines:var name="feed_folder" />: <a href="<blines:feed_link />"><blines:var name="feed_title" /></a> (<blines:var name="feed_BloglinesUnread" />)</li>
          </blines:feeds>
        </ul>
      </div>
      <div id="footer">Generated on <blines:now format="%Y-%m-%d %H:%M"/> by <a href="http://fucoder.com/code/bloglines2html/">Bloglines2HTML</a> <blines:var name="version" /></div>
    </div>
  </body>
</html>
"""

# Entry page template used unless another one is loaded with -t. For each
# feed it renders a list of entry titles followed by the entries
# themselves; the "i<id>" / "e<id>" anchors link each title to its entry
# and back.
DEFAULT_ENTRY_TEMPLATE = """\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=<blines:var name="charset" />" />
    <blines:feeds>
    <title><blines:var name="feed_title" /></title>
    </blines:feeds>
  </head>
  <body>
    <div id="listing">
        <blines:feeds>
          <h1><a href="index.html"><blines:var name="feed_title" /></a></h1>
          <blines:entries>
          <ul>
            <li><a name="i<blines:var name="entry_id" />" href="#e<blines:var name="entry_id" />"><blines:var name="entry_title" /></a></li>
          </ul>
          </blines:entries>
          <hr />
        </blines:feeds>
    </div>
    <div id="entries">
        <blines:feeds>
          <blines:entries>
            <h2><a name="e<blines:var name="entry_id" />" href="#i<blines:var name="entry_id" />"><blines:var name="entry_title" /></a></h2>
            <blockquote><p class="posted">Posted on <blines:var name="entry_modified" /> in [<blines:var name="entry_category" />]</p></blockquote>
            <div class="entry_body"><blines:var name="entry_summary" /></div>
            <hr NEW-PAGE/>
          </blines:entries>
        </blines:feeds>
    </div>
    <div id="footer">Generated on <blines:now /> by <a href="http://fucoder.com/code/bloglines2html/">Bloglines2HTML</a> <blines:var name="version" /></div>
  </body>
</html>
"""

# Run as a script: pass the command line arguments (without the program
# name) to command_line().
if __name__ == '__main__':
    command_line(sys.argv[1:])
