# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2016,2017,2018,2019,2020,2021,2022,2023 DaltonST'
__my_version__ = "1.0.191"  #Qt6

from polyglot.builtins import as_unicode, unicode_type

from calibre_plugins.job_spy.config import prefs

from html.parser import HTMLParser as myhtmlparser    #Python 3

#--------------------------------------------------------------------------------------------
class JSHTMLGetContent(myhtmlparser):
	"""HTML parser that collects text once a configured keyword has been seen.

	Two "|"-separated keyword lists are read from ``prefs`` at construction
	time ('GUI_TOOLS_EXTRACT_ORIGINAL_TITLE_KEYWORD' and
	'GUI_TOOLS_EXTRACT_TRANSLATOR_KEYWORD').  While feeding HTML to the
	parser, the first text node containing any of those keywords switches on
	accumulation; from then on every text node is appended to ``self.content``
	wrapped as ``<$STARTDATA$>text<$ENDDATA$>``.  Accumulation is switched off
	again by any start or end tag whose name contains "body".

	Attributes:
		content: the accumulated, marker-wrapped text (a str).
		original_title_list: keywords from the "original title" preference.
		translator_list: keywords from the "translator" preference.
		begin_accumulating_data: True while text nodes are being collected.
	"""

	def __init__(self):
		"""Initialise parser state and load the keyword lists from prefs."""
		# HTMLParser.__init__ calls reset() itself, so no explicit reset()
		# calls are needed before or after it.
		myhtmlparser.__init__(self)
		self.content = ""
		self.original_title_list = self._split_keywords(prefs['GUI_TOOLS_EXTRACT_ORIGINAL_TITLE_KEYWORD'])
		self.translator_list = self._split_keywords(prefs['GUI_TOOLS_EXTRACT_TRANSLATOR_KEYWORD'])
		self.begin_accumulating_data = False

	@staticmethod
	def _split_keywords(raw):
		"""Return the stripped, usable entries of a "|"-separated string."""
		stripped = (part.strip() for part in raw.split("|"))
		# Keep the historical filter: an entry is kept only if it sorts after
		# a single space, which rejects empty entries left by "||" or by a
		# leading/trailing "|".
		return [keyword for keyword in stripped if keyword > " "]

	def _stop_on_body(self, tag):
		"""Switch accumulation off when *tag* contains "body"."""
		# Substring test on purpose (unchanged behavior): "tbody" also matches.
		if self.begin_accumulating_data and "body" in tag:
			self.begin_accumulating_data = False

	def handle_starttag(self, tag, attrs):
		"""Stop accumulating when a start tag containing "body" is seen."""
		self._stop_on_body(tag)

	def handle_endtag(self, tag):
		"""Stop accumulating when an end tag containing "body" is seen."""
		self._stop_on_body(tag)

	def handle_data(self, data):
		"""Start accumulating on a keyword hit and record *data* if active.

		The text node that contains the keyword is itself recorded.
		"""
		if not isinstance(data, unicode_type):
			data = as_unicode(data)
		if not self.begin_accumulating_data:
			keywords = self.original_title_list + self.translator_list
			if any(keyword in data for keyword in keywords):
				self.begin_accumulating_data = True
		if self.begin_accumulating_data:
			self.content += "<$STARTDATA$>" + data + "<$ENDDATA$>"

	# The handlers below used to be nested inside handle_data(), where they
	# were re-created on every text node and never reachable by the parser.
	# They are now real methods that explicitly ignore their input.

	def handle_entityref(self, name):
		"""Ignore named character references."""
		return

	def handle_charref(self, name):
		"""Ignore numeric character references."""
		return

	def handle_comment(self, data):
		"""Ignore HTML comments."""
		return

	def handle_pi(self, data):
		"""Ignore processing instructions."""
		return

	def handle_unknown_decl(self, data):
		"""Ignore unknown declarations.

		NOTE(review): the standard-library hook is named ``unknown_decl``;
		this name is kept as originally written and is only invoked if some
		caller uses it directly.
		"""
		return
#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------
#END OF text_extraction_utils.py
