SimpleChatTC:SimpleProxy:HtmlParser more generic and flexible
also now track header, footer and nav so that their text isn't captured
This commit is contained in:
parent
cd226e8dae
commit
73a144c44d
|
|
@@ -97,29 +97,34 @@ class TextHtmlParser(html.parser.HTMLParser):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.bBody = False
|
self.inside = {
|
||||||
|
'body': False,
|
||||||
|
'script': False,
|
||||||
|
'style': False,
|
||||||
|
'header': False,
|
||||||
|
'footer': False,
|
||||||
|
'nav': False
|
||||||
|
}
|
||||||
|
self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
|
||||||
self.bCapture = False
|
self.bCapture = False
|
||||||
self.text = ""
|
self.text = ""
|
||||||
self.textStripped = ""
|
self.textStripped = ""
|
||||||
|
|
||||||
|
def do_capture(self):
|
||||||
|
if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
|
||||||
if tag == 'body':
|
if tag in self.monitored:
|
||||||
self.bBody = True
|
self.inside[tag] = True
|
||||||
self.bCapture = True
|
|
||||||
if tag == 'script':
|
|
||||||
self.bCapture = False
|
|
||||||
if tag == 'style':
|
|
||||||
self.bCapture = False
|
|
||||||
|
|
||||||
def handle_endtag(self, tag: str):
|
def handle_endtag(self, tag: str):
|
||||||
if tag == 'body':
|
if tag in self.monitored:
|
||||||
self.bBody = False
|
self.inside[tag] = False
|
||||||
if tag == 'script' or tag == 'style':
|
|
||||||
if self.bBody:
|
|
||||||
self.bCapture = True
|
|
||||||
|
|
||||||
def handle_data(self, data: str):
|
def handle_data(self, data: str):
|
||||||
if self.bCapture:
|
if self.do_capture():
|
||||||
self.text += f"{data}\n"
|
self.text += f"{data}\n"
|
||||||
|
|
||||||
def syncup(self):
|
def syncup(self):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue