#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__   = 'GPL v3'
__copyright__ = '2013, Tobias Zeumer'
__docformat__ = 'restructuredtext en'

import sys, time, datetime, re
from urllib.parse import quote
from queue import Queue, Empty

# XML parsing
from lxml import etree
from lxml.html import fromstring, tostring

# Useful
from calibre import as_unicode
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.icu import lower
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.localization import get_udc

from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html
from calibre.utils.localization import canonicalize_lang

'''
OK, let's start!
'''
class SRU(Source):
    # Basic plugin information 
    name                    = 'SRU'
    description             = _('Downloads metadata via SRU from DNB and LOC (Search/Retrieve via URL) sources')
    supported_platforms     = ['windows', 'osx', 'linux']
    author                  = 'Tobias Zeumer, updated by DJG'
    version                 = (1, 0, 0)
    minimum_calibre_version = (5, 0, 0)

    # Class attributes describing what this metadata source offers. See
    # http://manual.calibre-ebook.com/plugins.html#module-calibre.ebooks.metadata.sources.base
    capabilities = frozenset(['identify'])
    touched_fields = frozenset(['title', 'authors', 'series', 'comments', 'publisher', 'pubdate', 'tags', 'series', 'languages', 'identifier:isbn', 'identifier:gbv', 'identifier:loc'])
    has_html_comments = False
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True


    '''
    Set to false to force user to configure the plugins settings.
    Maybe better?
    '''
    def is_configured(self):
        return True


    '''
    Overriding the default configuration screen for our own custom configuration
    #def is_customizable(self):
    #    return True
    '''
    def config_widget(self):
        """Create the plugin's own configuration widget.

        The config module is imported lazily, inside the method, exactly as
        the rest of this class does it.
        """
        import calibre_plugins.SRU.config as plugin_config
        return plugin_config.ConfigWidget(self)


    '''
    This must be complete nonsense, but I don't know better yet. Want variables 
    to be defined in config, but used here as well as in worker.
    '''
    def load_config(self, log, result_queue, title, authors):
        """Copy stored preferences and the current search state onto ``self``.

        Reads the plugin preferences (record limit, source name/URLs, PICA
        flag and tag prefixes) and remembers the logger, the result queue and
        the originally searched title/authors so that the parse_* helpers can
        use them without extra arguments. Always returns True.
        """
        import calibre_plugins.SRU.config as cfg
        prefs = cfg.plugin_prefs[cfg.STORE_NAME]

        # How many records to request at most
        self.MAX_EDITIONS = prefs[cfg.KEY_MAX_DOWNLOADS]

        # Which catalogue to ask and how
        self.SOURCE_NAME = prefs[cfg.KEY_SOURCE_NAME]
        self.SOURCE_SRU = prefs[cfg.KEY_SOURCE_SRU]
        self.SOURCE_WEB = prefs[cfg.KEY_SOURCE_WEB]
        self.SOURCE_PICA = bool(prefs[cfg.KEY_SOURCE_PICA])

        # Prefixes prepended to the generated tags
        self.TAGPREFIX_GENRE = prefs[cfg.KEY_TAG_GENRE]
        self.TAGPREFIX_TOPIC1 = prefs[cfg.KEY_TAG_TOPIC1]
        self.TAGPREFIX_TOPIC2 = prefs[cfg.KEY_TAG_TOPIC2]

        # Kept on the instance instead of being passed around as arguments
        self.log = log
        self.result_queue = result_queue

        # State of the current search
        self.orig_title = title
        self.orig_title_short = title
        self.orig_authors = authors
        self.resultByISBN = False

        return True


    '''
    Identify a book by its title/author/isbn/etc.
    ratings   = []  #hmm?  identify_results_keygen(title=None, authors=None,
                    identifiers={})
    '''
    def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30):
        self.load_config(log, result_queue, title, authors)
        
        isbn = check_isbn(identifiers.get('isbn', None))
        br = self.browser
        matches   = False

        # Don't take subtitles that may vary (e.g. 'Animal farm: a fairy story')
        # Hope the regex is not to restrictive
        match = re.search('[\w\d\s\']*', title)
        self.orig_title_short = match.group(0) if match else None

        # Create the query url
        query = self.create_query(title=self.orig_title_short, authors=authors, identifiers=identifiers)
        if query is None:
            self.resultByISBN = False #must be false, first try is always isbn
            log.error('Insufficient metadata to construct query')
            return

        log.info('Querying: %s' % query)
        
        # Call the query url ("browse") 
        response = br.open_novisit(query, timeout=timeout)
        try:				    
            raw = response.read().strip()
            if not raw:
                log.error('Failed to get raw result for query: %r' % query)
                return
            # make the raw (XML) response processable 
            parser = etree.XMLParser(ns_clean=True, recover=True)
            root = etree.fromstring(raw, parser)
        except:
            msg = 'Failed to parse SRU source for query: %r' % query
            log.exception(msg)
            return msg

        # Check the result with own rules to check, if it is a match. 
        matches = self._parse_search_results(root, timeout)

        # I guess this is, so a hit on cancel in calibre actually kills this 
        # plugins process
        if abort.is_set():
            return

        # Did _parse_search_results not return a match?
        if matches == False:
            # This already has been done "transparently" above if ISBN is set
            if identifiers and title and authors:
                log.info('No matches found with identifiers, retrying using only title and authors')
                # Now call this method again, only without argument identifier
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r' % query)
            return


    '''
    Create a SRU query, based on the source set in the options
    '''
    def create_query(self, title=None, authors=None, identifiers=None):
        """Build the SRU 1.1 searchRetrieve URL for the configured source.

        A valid ISBN in ``identifiers`` wins: the query then uses the
        identifier index only and ``self.resultByISBN`` is set to True.
        Otherwise an author clause and a title clause are built from the
        available tokens and joined with ``and``; a clause is only emitted
        when it has tokens, so a title-only or author-only search stays a
        well-formed query.

        :return: the complete query URL, or None when neither a valid ISBN
                 nor any title/author token is available
        """
        identifiers = identifiers or {}

        # Index names in the order [identifier, author, title].
        # Pica is more powerful (e.g. num might find wrong or alternative
        # ISBNs); the dc.* indexes are the SRU standard ones.
        pica = ['pica.num', 'pica.aut', 'pica.tit']
        loc = ['bath.isbn', 'dc.creator', 'dc.title']
        query_fld = pica if self.SOURCE_PICA is True else loc

        # See about.txt for help on how a SRU 1.1 query is built.
        # str() keeps this working whether the limit is stored as text or int.
        base_query = ('version=1.1&operation=searchRetrieve&recordSchema=marcxml'
                      '&maximumRecords=' + str(self.MAX_EDITIONS) + '&query=')
        # Sorting is switched off. Candidates that were considered:
        #   '%20sortby+date%2Fdescending'  (SRU 1.2)
        #   '&sortKeys=date,,1'            (SRU 1.1)
        sorting = ''

        # Ebook has ISBN set already?
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            self.resultByISBN = True
            return self.SOURCE_SRU + '?' + base_query + query_fld[0] + '=' + isbn

        # Explained: http://manual.calibre-ebook.com/plugins.html#calibre.ebooks.metadata.sources.base.Source.get_title_tokens
        title_str = ''
        if title:
            title_tokens = list(self.get_title_tokens(
                get_udc().decode(title), strip_joiners=False, strip_subtitle=True))
            if title_tokens:
                title_str = (query_fld[2] + '%3D"'
                             + '%20'.join(quote(t) for t in title_tokens) + '"')

        # Explained: http://manual.calibre-ebook.com/plugins.html#calibre.ebooks.metadata.sources.base.Source.get_author_tokens
        author_str = ''
        if authors:
            decoded_authors = [get_udc().decode(a) for a in authors]
            # list() because a lazily produced token sequence would always be
            # truthy, even when it yields nothing.
            author_tokens = list(self.get_author_tokens(decoded_authors, only_first_author=True))
            if author_tokens:
                # 'and' is a bit dumb, but otherwise forename/name-order might
                # be a problem
                author_str = (query_fld[1] + '%3D('
                              + '%20and%20'.join(quote(t) for t in author_tokens) + ')')

        clauses = [clause for clause in (author_str, title_str) if clause]
        if not clauses:
            return None
        return self.SOURCE_SRU + '?' + base_query + '%20and%20'.join(clauses) + sorting

                
    '''
    The results must be relevant. Because we search for ISBN or for Author AND 
    Title and this must be a perfect hit in a library catalog (as perfect as it 
    gets with the right data).
    Yet catalogs might have multiple edition. Best would be if it were possible 
    to reliably query resource types (ebook, book; not audiobook, periodical). 
    SRU offers ways with xpath, but that is - for now - no option.
    But there might be point to check various results (different editions and/or
    media types) for quality hints. Like having isbn, comments and "tags".
    But on the other hand, if I check everything here, worker is pointless resp. 
    should only return an array being created here
    '''
    def _parse_search_results(self, root, timeout):
        #self.log.error (etree.tostring(root))
        #result = root.xpath('/zs:searchRetrieveResponse/zs:records/zs:record/zs:recordData/dft:record', namespaces=self.ns)
        result = root.xpath('//*[local-name()="numberOfRecords"]')
        total_res_num = int(root.xpath('//*[local-name()="numberOfRecords"]')[0].text)
        
        matches = True

        # Ok, we got nothing for sure
        if total_res_num == None:
            sru_diagnostics = root.xpath('//*[local-name()="message"]')[1].text
            if sru_diagnostics:
                self.log.error(sru_diagnostics)
                matches = False
                return matches
        elif total_res_num == 0:
            self.log.info ('Empty result...')
            matches = False
            return matches
        # More (potential) results than allowed in plugin config by user
        # total_res_num takes the value from a node in the result. This node
        # says how many you COULD get if you wouldn't apply maximumRecords
        # to the query
        # Alternative: just count the recordData/record-nodes ...
        elif total_res_num > int(self.MAX_EDITIONS):
            res_num = int(self.MAX_EDITIONS)
        # Else just use the number that is given
        else:
            res_num = total_res_num

        # loop the result
        for result_id in range(1, res_num+1):
            self.parse_details(root, result_id)

        return matches


    '''
    Get the details for a single MARC result data node and put them into the
    result queue
    '''
    def parse_details(self, root, node_id):
        """Convert one MARC record of the response into a queued Metadata.

        :param root: parsed SRU response
        :param node_id: 1-based position of the record inside the response

        The record is skipped unless a title, at least one author and a
        source identifier can be extracted. All other fields are best effort:
        a failing parser is logged and the field is simply left unset. Every
        field that is found adds its weight to the relevance score stored in
        ``mi.source_relevance``.
        """
        self.log.info('\nMY ID: '+str(node_id))
        # XPath prefix selecting the MARC record at position node_id; the
        # parse_* helpers append their datafield/subfield selectors to it.
        self.RECORDSET = '//*[local-name()="records"]/*[local-name()="record"]['+str(node_id)+']//*[local-name()="record"]'

        # Relevance scoring for results; the weights add up to 95
        weight_isbn     = 50  # with isbn we can get data every where else
        weight_author   = 5   # low, could there even really be a case without?
        weight_title    = 5   # low, could there even really be a case without?
        weight_series   = 10  # having right series data is nice!
        weight_comment  = 15  # comments are nice
        weight_tags     = 10  # nice to have, but not to expect
        relevance_score = 0

        # The 'HTML'-Url, not the xml source
        url = ''
        try:
            source_id = self.parse_unique_id(root)
            if source_id:
                url = self.SOURCE_WEB + str(source_id)
        except Exception:
            self.log.exception('Error parsing unique id of title at provider for record: %r' % node_id)
            source_id = None

        try:
            title = self.parse_title(root)
        except Exception:
            self.log.exception('Error parsing title for url: %r' % url)
            title = None

        try:
            authors = self.parse_authors(root)
        except Exception:
            self.log.exception('Error parsing authors for url: %r' % url)
            authors = []

        # Weighting could catch this case in a more sophisticated way (e.g.
        # if an author is not to be expected...)
        if not title or not authors or not source_id:
            self.log.info('Could not find title/authors/source_id for %r' % url)
            return
        relevance_score += weight_title + weight_author

        # Set something for mi only if Author AND Title AND Identifier are found
        mi = Metadata(title, authors)
        mi.set_identifier(lower(self.SOURCE_NAME), source_id)
        self.source_id = source_id

        # A local value, so a failing parser can never leave the ISBN of a
        # previously handled record behind.
        isbn = None
        try:
            isbn = self.parse_isbn(root)
        except Exception:
            self.log.exception('Error parsing ISBN for url: %r' % url)
        self.isbn = isbn
        if isbn:
            mi.isbn = isbn
            relevance_score += weight_isbn

        try:
            (series, series_index) = self.parse_series(root)
            if series:
                mi.series = series
                try:
                    # parse_series hands back text such as '3' or '12,5'
                    mi.series_index = float(str(series_index).replace(',', '.'))
                except ValueError:
                    self.log.info('No usable series index: %r' % series_index)
                relevance_score += weight_series
        except Exception:
            self.log.exception('Error parsing series for url: %r' % url)

        try:
            comments = self.parse_comments(root)
            # Don't grab really short comments. Most likely not helpful.
            if comments and len(str(comments)) > 120:
                mi.comments = comments
                relevance_score += weight_comment
        except Exception:
            self.log.exception('Error parsing comments for url: %r' % url)

        # Cover is possible (see parse_cover), but library catalogs are just
        # not the best source for this, so no cover is fetched here.

        try:
            mi.publisher = self.parse_publisher(root)
        except Exception:
            self.log.exception('Error parsing publisher for url: %r' % url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except Exception:
            self.log.exception('Error parsing published date for url: %r' % url)

        try:
            mi.language = self.parse_language(root)
        except Exception:
            self.log.exception('Error parsing language for url: %r' % url)

        try:
            tags = self.parse_tags(root)
            if tags:
                mi.tags = tags
                relevance_score += weight_tags
        except Exception:
            self.log.exception('Error parsing tags for url: %r' % url)

        mi.source_relevance = relevance_score

        # source_id is known to be set at this point
        if isbn:
            self.cache_isbn_to_identifier(isbn, source_id)

        self.log.info('Relevance for '+str(node_id)+': '+str(relevance_score))

        # Ah! If results "get lost" it's because calibre automatically merges
        # results http://www.mobileread.com/forums/showthread.php?t=177779
        # Does identify_results_keygen do the trick?
        self.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)


    '''
    Return a 3-tuple or None. The 3-tuple is of the form: (identifier_type,
    identifier_value, URL). The URL is the URL for the book identified by
    identifiers at this source. identifier_type, identifier_value specify the
    identifier corresponding to the URL. This URL must be browseable to by a
    human using a browser
    TODO: Seems not to work with variables?
    ''
    def get_book_url(self, identifiers):
        source_identifier = identifiers.get(lower(self.SOURCE_NAME), None)
        if source_identifier:
            return ('SRU', source_identifier, self.SOURCE_WEB+source_identifier)
    '''
    def get_book_url(self, identifiers):
        id_gbv = identifiers.get('gbv', None)
        id_loc = identifiers.get('loc', None)

        url_gbv = 'http://kxp.k10plus.de/DB=2.1/PPNSET?PPN='
        url_loc = 'http://lccn.loc.gov/'
        if id_gbv:
            return ('gbv', id_gbv, url_gbv + id_gbv)
        elif id_loc:
            return ('loc', id_loc,
                    '%s%s' % (url_loc, id_loc))


    '''
    Return a human readable name from the return value of get_book_url().
    '''
    def get_book_url_name(self, idtype, idval, url):
        return idtype.capitalize()


    '''
    Return a function that is used to generate a key that can sort Metadata
    objects by their relevance given a search query (title, authors,
    identifiers).
    These keys are used to sort the results of a call to :meth:`identify`.
    For details on the default algorithm see
    :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
    your plugin if the default algorithm is not suitable.
    '''
     #remove
    def identify_results_keygen(self, title=None, authors=None, identifiers={}):
        def keygen(mi):
            self.log.info ('My Relevance-key: '+str(mi.source_relevance))
            return mi.source_relevance
            #return InternalMetadataCompareKeyGen(mi, self, title, authors, identifiers)
        return keygen


    '''
    GBV: the unique id is the "PPN" (Pica Production Number)
    LOC: the LCCN is needed for the LOC permalink. If found, it is used
    instead of controlfield 001
    '''
    def parse_unique_id(self, root):
        llcn = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="010"]/*[local-name()="subfield"][@code="a"]')

        if llcn:
            source_identifier = llcn
        else:
            source_identifier = root.xpath(self.RECORDSET + '/*[local-name()="controlfield"][@tag="001"]')

        if source_identifier:
            self.log.info('FOUND identifier: ' + source_identifier[0].text.strip())
            return source_identifier[0].text.strip()


    '''
    NOTE: Whoa, complicated. Should be 490, but if it exists 800 might serve
          better
    Example: http://sru.gbv.de/gvk?version=1.1&operation=searchRetrieve&sortKeys=year,,1&maximumRecords=1&query=pica.num=9783789132223&recordSchema=marcxml
    '''
    def parse_series(self, root):
        # direct approach
        series_nodeName  = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="800"]/*[local-name()="subfield"][@code="t"]')
        series_nodeIndex  = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="800"]/*[local-name()="subfield"][@code="v"]')
        
        # less nice approach
        if not series_nodeName:
            series_nodeName  = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="490"]/*[local-name()="subfield"][@code="a"]')
            series_nodeIndex  = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="490"]/*[local-name()="subfield"][@code="v"]')

        if not series_nodeName:
            return (None, None)
        # RegEx extract numbers like "12" or "12.3242" Or "12,2323"
        # Cleans any strings from series number (like "Vol" or "Bd")
        # Or nastier stuff like:
        # <subfield code="v"> / Erin Hunter. Übers. von Klaus Weimann ; 1
        # </subfield>
        else:
            if not series_nodeIndex:
                series_index = '0'
            else:
                series_index = series_nodeIndex[0].text
                match = re.search('(\d+)([\.|,]{1}\d+)?', series_index)
                series_index = match.group(0) if match else None

        series_name = self.clean_string(series_nodeName[0].text)
        
        self.log.info ('Series checked. (' + series_name + ' : ' + series_index +')')
        return (series_name, series_index)

    
    '''
    TODO: Maybe check for "Varying Form of Title" (246) and Former Title (246)
    TODO: Think about default title/subtitle divider
    '''
    def parse_title(self,root):
        title = None
        marc_title    = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="245"]/*[local-name()="subfield"][@code="a"]')
        marc_subtitle = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="245"]/*[local-name()="subfield"][@code="b"]')

        if marc_title:
            title = self.clean_string(marc_title[0].text)
            if marc_subtitle:
                title = title + '. ' + self.clean_string(marc_subtitle[0].text)
        
            # Just to make sure pica.title or dc.title didn't match some other
            # MARC field and we end up with a completley different title
            # Hope there is no codetable problem
            if self.resultByISBN == False and not lower(self.orig_title_short) in lower(title):
                self.log.info ('Unmatched original titel: '+self.orig_title+' NOT CLOSE ENOUGHT TO: '+title)
                title = None
            elif self.resultByISBN == True:
                self.log.info ('Assuming title is correct, because it\'s an ISBN hit. Anyway... original titel: '+self.orig_title+' and found title: '+title)
            else:
                self.log.info ('Matched original titel: '+self.orig_title+' EQUALS: '+title)

        '''
        Trick calibre's merging algorithm?
        from random import randint
        title = str(randint(1,100)) + '_' + title + '_' + str(randint(1,100))
        '''

        return title


    '''
    NOTE: Nearly always found as "name, forename", but there are other options...
    NOTE: Theoretically disambiguation should be possible
    NOTE: The author name might in some cases be found in 700 or 800;
          but that is only interesting if you don't care that the data source
          might be an audiobook or something like that. Stick to 100 here for
          now
          Yet... some titles might not have an author, but some other "entity"
    '''
    def parse_authors(self, root):
        marc_personalName = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="100"][@ind1="1"]/*[local-name()="subfield"][@code="a"]')

        # Check org. author - audiobook etc.?
        if not marc_personalName:
            marc_personalName = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="700"][@ind1="1"]/*[local-name()="subfield"][@code="a"]')

        authors = []
        loginfo = ''
        # Take the first original author, split forename and name, take the name
        org_name = self.orig_authors[0].split(' ')[-1]
        if marc_personalName:
            for author in marc_personalName:
                #Split comma), reverse resulting list for calibre standard
                author_name = author.text.strip().split(',',2)
                author_name.reverse()
                author = ' '.join(author_name)
                if self.resultByISBN == False and not lower(org_name) in lower(author):
                    loginfo = loginfo + author + ' (no match for ' + self.orig_authors[0]+'), '
                else:
                    loginfo = loginfo + author + ' (match for ' + self.orig_authors[0]+'), '
                    authors.append(author)

        self.log.info ('Authors checked: ' + loginfo)
        return authors


    '''
    NOTE: There might be multiple ISBNs (paperback, hardcover etc.),
          taking first
    Could check for invalid/canceled isbn
    RegEx because stuff like "9783446235960 (GB.)"
    '''
    def parse_isbn(self, root):
        marc_nodeIsbn = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="020"]/*[local-name()="subfield"][@code="a"]')
        isbn = ''
        if marc_nodeIsbn:
            isbn = marc_nodeIsbn[0].text
            match = re.search('(97(8|9))?[\d|\-]*(\d|X|x)', isbn)
            isbn = match.group(0) if match else None

        self.log.info ('ISBN checked: ' + isbn)
        return isbn

#    def parse_LCC(self, root):
#        LCC = ''
#        marc_nodeLCC = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="050"]/*[local-name()="subfield"][@code="a"][@code="b"]')
#        if marc_nodeLCC:
#            LCC = self.clean_string(marc_nodeLCC[0].text)
#
#        self.log.info ('LCC checked: ' + LCC)
#        return LCC


    '''
    Get Publisher
    '''
    def parse_publisher(self, root):
        publisher = ''
        marc_nodePublication = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="260"]/*[local-name()="subfield"][@code="b"]')
        if marc_nodePublication:
            publisher = self.clean_string(marc_nodePublication[0].text)

        self.log.info ('Publisher checked: ' + publisher)
        return publisher


    '''
    TODO: finetune?
    '''
    def parse_published_date(self, root):
        year = None
        marc_nodePublicationDate = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="260"]/*[local-name()="subfield"][@code="c"]')
        if marc_nodePublicationDate:
            pub_date = marc_nodePublicationDate[0].text
            match = re.search('\d{4}', pub_date)
            year_str = int(match.group(0)) if match else None
            if year_str:
                from calibre.utils.date import utc_tz
                year = datetime.datetime(year_str, 1, 1, tzinfo=utc_tz)

        self.log.info ('Pubyear checked: ' + str(year))
        return year


    '''
    Bit tricky
    TODO: What does: canonicalize_lang('English')
    '''
    def parse_language(self, root):
        language = None
        marc_nodeLanguage = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="041"]/*[local-name()="subfield"][@code="a"]')

        if marc_nodeLanguage:
            # GBV uses its own codes. LOC? Maybe check for more sophisticated
            # solution later.
            lang_check = marc_nodeLanguage[0].text
            if lang_check == 'ger':
                language = 'German'
            elif lang_check == 'eng':
                language = 'English'
        # Library of Congress seems not to provide language usually. Guess it
        # defaults to english if not set
        elif not language and lower(self.SOURCE_NAME) == 'loc':
            language = 'English'

        self.log.info ('Language checked. ('+str(language)+')')
        return language


    '''
    Quite a lot possible...
    '''
    def parse_comments(self, root):
        comments = None
        marc_notes = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="520"]/*[local-name()="subfield"][@code="a"]')

        if marc_notes:
            comments = marc_notes[0].text

        self.log.info ('Comments checked.')
        return comments


    '''
    Very much possible; restricting to genre and topic
    '''
    def parse_tags(self, root):
        tags = []
        marc_subjectGenre = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="655"]/*[local-name()="subfield"][@code="a"]')

        if marc_subjectGenre:
            for tag in marc_subjectGenre:
                single_tag = self.TAGPREFIX_GENRE + tag.text
                tags.append(single_tag)

        marc_subjectTopic = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="650"]/*[local-name()="subfield"][@code="a"]')

        if marc_subjectTopic:
            for tag in marc_subjectTopic:
                single_tag = self.TAGPREFIX_TOPIC1 + tag.text
                tags.append(single_tag)
        
        # Special GBV; ind2=7 isn't defined in marc21, but obviously used
        # for topic? Should be 0
        marc_uncontrolledTopic = root.xpath(self.RECORDSET + '/*[local-name()="datafield"][@tag="653"][@ind2="7"]/*[local-name()="subfield"][@code="a"]')
        if marc_uncontrolledTopic:
            for tag in marc_uncontrolledTopic:
                single_tag = self.TAGPREFIX_TOPIC2 + tag.text
                tags.append(single_tag)
                
        self.log.info ('Tags checked.' + ', '.join(tags))
        return tags


    '''
    Not yet
    '''
    def parse_cover(self, root):
        result_node=root.xpath('img_url', namespaces=self.ns)
        if result_node:
            img_url = result_node[0].text
            pos=img_url.count('sinportada')
            if pos==0:
                self.plugin.cache_identifier_to_cover_url(self.source_id, img_url)
                return img_url


    '''
    Remove nasty parts in a string. GBV often uses ' :' at the end; LOC " /"
    TODO: Should only remove at end of string...
    '''
    def clean_string(self, string):
        string = string.replace(' :', '')
        string = string.replace(' /', '')
        string = string.replace(' ;', '')
        string = string.replace(':.', '.')
        string = string.strip()
        return string
