MobileRead Forums - View Single Post - Extract text from selected books, convert them to tags, and add them to metadata.

lizzie1170 · 08-21-2022, 02:25 AM

Quote:

Originally Posted by davidfor

Well, you get that error because you didn't actually call the method. "_extract_body_text" appears to be a method that takes a string of some sort. But, when you used it, you treated it as something else.

And that doesn't look anything like what Page Count does. It will open the epub as an iterator, then iterate through the files in the spine, extract the text from each of them and combine them into a big long chunk of text. Then it process that. You have passed "path_to_epub" into your method, but, never actually used it. From the Count Pages plugin, you need to look at statistic.py and follow the flow starting with "get_word_count"

I added get_word_count definition but it depends on other definitions. Running the code results in TypeError: TagsFromEpub.run() takes 3 positional arguments but 4 were given.

Code:

from calibre.ebooks.oeb.iterator import EbookIterator
from calibre_plugins.action_chains.actions.base import ChainAction

with open("test_dict.txt", "r") as f:
    tags_dict = f.read()

class TagsFromEpub(ChainAction):
    name = 'Tags_F_Epub'
    support_scopes = True

    def get_word_count(iterator, book_path, icu_wordcount):
        '''Given an iterator for the epub (if already opened/converted), estimate a word count'''
        from calibre.utils.localization import get_lang
        if iterator is None:
            iterator = _open_epub_file(book_path)
            lang = iterator.opf.language
            lang = get_lang() if not lang else lang
            DEFAULT_STORE_VALUES = {}
            KEY_USE_ICU_WORDCOUNT = 'useIcuWordcount'
            icu_wordcount = c.get(cfg.KEY_USE_ICU_WORDCOUNT, cfg.DEFAULT_STORE_VALUES[cfg.KEY_USE_ICU_WORDCOUNT])
            count = _get_epub_standard_word_count(iterator, lang, icu_wordcount)
            print('\tWord count:', count)
            return iterator, count

    def _open_epub_file(book_path, strip_html=False):
        '''Given a path to an EPUB file, read the contents into a giant block of text'''
        iterator = EbookIterator(book_path)
        iterator.__enter__(only_input_plugin=True, run_char_count=True, read_anchor_map=False)
        return iterator
    
    def _get_epub_standard_word_count(iterator, lang='en', icu_wordcount=False):
        '''This algorithm counts individual words instead of pages'''
        book_text = _read_epub_contents(iterator, strip_html=True)
        wordcount = None
        if icu_wordcount:
            try:
                from calibre.spell.break_iterator import count_words
                print('\tWord count using icu_wordcount - trying to count_words')
                wordcount = count_words(book_text, lang)
                print('\tWord count - used count_words:', wordcount)
            except:
                try: # The above method is new and no-one will have it as of 08/01/2016.
                    print('\tWord count using icu_wordcount - trying to import split_into_words_and_positions')
                    from calibre.spell.break_iterator import split_into_words_and_positions
                    print('\tWord count - trying split_into_words_and_positions:')
                    wordcount = len(split_into_words_and_positions(book_text, lang))
                    print('\tWord count - used split_into_words_and_positions:', wordcount)
                except:
                    pass
        if not wordcount: # If not using icu wordcount, or it failed, use the old method.
            from calibre.utils.wordcount import get_wordcount_obj
            print('\tWord count using older method - trying get_wordcount_obj')
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
        return wordcount 
    
    def tags_from_epub(path_to_epub):
        temp = []
        res = dict()
        for line in wordcount:
            for key,value in tags_dict.items():
                if re.search(rf'{value}', line):
                    if value not in temp:
                        temp.append(value)
                        res[key] = value                
                        regex = re.compile(value) 
                        match_array = regex.finditer(line) 
                        match_list = list(match_array)
                        for m in match_list:
                            print(key, ":",m.group())
    
    def run(gui, settings, chain):
        db = gui.current_db
        for book_id in chain.scope().get_book_ids():
            fmts = [ fmt.strip() for fmt in db.formats(book_id, index_is_id=True).split(',') ]
            if 'EPUB' in fmts:
                path_to_epub = db.format_abspath(book_id, 'EPUB', index_is_id=True)
                tags_from_epub(path_to_epub)