Quote:
Originally Posted by davidfor
Well, you get that error because you didn't actually call the method. "_extract_body_text" appears to be a method that takes a string of some sort. But, when you used it, you treated it as something else.
And that doesn't look anything like what Page Count does. It will open the epub as an iterator, then iterate through the files in the spine, extract the text from each of them and combine them into a big long chunk of text. Then it process that. You have passed "path_to_epub" into your method, but, never actually used it. From the Count Pages plugin, you need to look at statistic.py and follow the flow starting with "get_word_count"
|

I added get_word_count definition but it depends on other definitions. Running the code results in
TypeError: TagsFromEpub.run() takes 3 positional arguments but 4 were given.
Code:
from calibre.ebooks.oeb.iterator import EbookIterator
from calibre_plugins.action_chains.actions.base import ChainAction
with open("test_dict.txt", "r") as f:
tags_dict = f.read()
class TagsFromEpub(ChainAction):
name = 'Tags_F_Epub'
support_scopes = True
def get_word_count(iterator, book_path, icu_wordcount):
'''Given an iterator for the epub (if already opened/converted), estimate a word count'''
from calibre.utils.localization import get_lang
if iterator is None:
iterator = _open_epub_file(book_path)
lang = iterator.opf.language
lang = get_lang() if not lang else lang
DEFAULT_STORE_VALUES = {}
KEY_USE_ICU_WORDCOUNT = 'useIcuWordcount'
icu_wordcount = c.get(cfg.KEY_USE_ICU_WORDCOUNT, cfg.DEFAULT_STORE_VALUES[cfg.KEY_USE_ICU_WORDCOUNT])
count = _get_epub_standard_word_count(iterator, lang, icu_wordcount)
print('\tWord count:', count)
return iterator, count
def _open_epub_file(book_path, strip_html=False):
'''Given a path to an EPUB file, read the contents into a giant block of text'''
iterator = EbookIterator(book_path)
iterator.__enter__(only_input_plugin=True, run_char_count=True, read_anchor_map=False)
return iterator
def _get_epub_standard_word_count(iterator, lang='en', icu_wordcount=False):
'''This algorithm counts individual words instead of pages'''
book_text = _read_epub_contents(iterator, strip_html=True)
wordcount = None
if icu_wordcount:
try:
from calibre.spell.break_iterator import count_words
print('\tWord count using icu_wordcount - trying to count_words')
wordcount = count_words(book_text, lang)
print('\tWord count - used count_words:', wordcount)
except:
try: # The above method is new and no-one will have it as of 08/01/2016.
print('\tWord count using icu_wordcount - trying to import split_into_words_and_positions')
from calibre.spell.break_iterator import split_into_words_and_positions
print('\tWord count - trying split_into_words_and_positions:')
wordcount = len(split_into_words_and_positions(book_text, lang))
print('\tWord count - used split_into_words_and_positions:', wordcount)
except:
pass
if not wordcount: # If not using icu wordcount, or it failed, use the old method.
from calibre.utils.wordcount import get_wordcount_obj
print('\tWord count using older method - trying get_wordcount_obj')
wordcount = get_wordcount_obj(book_text)
wordcount = wordcount.words
return wordcount
def tags_from_epub(path_to_epub):
temp = []
res = dict()
for line in wordcount:
for key,value in tags_dict.items():
if re.search(rf'{value}', line):
if value not in temp:
temp.append(value)
res[key] = value
regex = re.compile(value)
match_array = regex.finditer(line)
match_list = list(match_array)
for m in match_list:
print(key, ":",m.group())
def run(gui, settings, chain):
db = gui.current_db
for book_id in chain.scope().get_book_ids():
fmts = [ fmt.strip() for fmt in db.formats(book_id, index_is_id=True).split(',') ]
if 'EPUB' in fmts:
path_to_epub = db.format_abspath(book_id, 'EPUB', index_is_id=True)
tags_from_epub(path_to_epub)