SimpleChatTC:WebTools: urltext-tag-drops python side - skel
Rename search-drops to urltext-tag-drops to reflect its more generic semantics: the search drops that the user specifies in the UI are now mapped to the urltext-tag-drops header entry of a urltext web fetch request. Implement a crude urltext-tag-drops logic in TextHtmlParser. If the opening and closing tags in the html being parsed are mismatched, in particular for the type of tag being targeted for dropping, the extracted text can get messed up.
parent f75bdb0e00
commit 06fd41a88e
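A minimal standalone sketch of the drop-counting idea follows (illustrative only, not the repo's code; the class name, the {"tag": ..., "id": ...} entry shape and the sample html are assumptions). It starts a counter when a targeted tag opens with the targeted id, tracks nesting of that tag type, and captures text only while nothing is being dropped:

# Minimal standalone sketch of the tag-drop idea (illustrative, not the committed code).
# Assumes urltext-tag-drops carries JSON entries of the form {"tag": ..., "id": ...}.
import html.parser
import json

class DropSketchParser(html.parser.HTMLParser):

    def __init__(self, tagDrops: list[dict]):
        super().__init__()
        self.tagDrops = tagDrops
        self.dropTag = None     # type of tag currently being dropped
        self.dropCount = 0      # nesting depth of that tag while dropping
        self.text = ""

    def handle_starttag(self, tag, attrs):
        # While inside a dropped region, any nested tag of the same type deepens it.
        if self.dropCount > 0:
            if tag == self.dropTag:
                self.dropCount += 1
            return
        # Start dropping when a targeted tag opens with the targeted id.
        attrsDict = dict(attrs)
        for tagMeta in self.tagDrops:
            if tag == tagMeta['tag'] and attrsDict.get('id') == tagMeta['id']:
                self.dropTag = tag
                self.dropCount = 1
                return

    def handle_endtag(self, tag):
        if self.dropCount > 0 and tag == self.dropTag:
            self.dropCount -= 1
            if self.dropCount == 0:
                self.dropTag = None

    def handle_data(self, data):
        if self.dropCount == 0:
            self.text += data

tagDrops = json.loads('[{"tag": "div", "id": "sidebar"}]')
p = DropSketchParser(tagDrops)
p.feed('<body><div id="sidebar">skip this <div>nested</div></div><p>keep this</p></body>')
print(p.text)   # -> "keep this"

As the commit message warns, this style of counting assumes reasonably balanced markup for the targeted tag type; mismatched opening and closing tags can make the parser drop more or less than intended.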
@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import html.parser
 import debug
 import filemagic as mFile
+import json
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
@@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser):
     This helps return a relatively clean textual representation of the html file/content being parsed.
     """
 
-    def __init__(self):
+    def __init__(self, tagDrops: dict):
         super().__init__()
+        self.tagDrops = tagDrops
         self.inside = {
             'body': False,
             'script': False,
             'style': False,
             'header': False,
             'footer': False,
-            'nav': False
+            'nav': False,
         }
         self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
         self.bCapture = False
         self.text = ""
         self.textStripped = ""
+        self.droptagType = None
+        self.droptagCount = 0
 
     def do_capture(self):
         """
         Helps decide whether to capture contents or discard them.
         """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
+        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)):
             return True
         return False
 
     def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
         if tag in self.monitored:
             self.inside[tag] = True
+        for tagMeta in self.tagDrops:
+            if tag != tagMeta.tag:
+                continue
+            for attr in attrs:
+                if attr[0] != 'id':
+                    continue
+                if attr[1] == tagMeta.id:
+                    self.droptagCount += 1
+                    self.droptagType = tag
 
     def handle_endtag(self, tag: str):
         if tag in self.monitored:
             self.inside[tag] = False
+        if tag == self.droptagType:
+            self.droptagCount -= 1
+            if self.droptagCount < 0:
+                self.droptagCount = 0
 
     def handle_data(self, data: str):
         if self.do_capture():
@@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
         ph.send_error(got.httpStatus, got.httpStatusMsg)
         return
     # Extract Text
-    textHtml = TextHtmlParser()
+    tagDrops = ph.headers.get('urltext-tag-drops')
+    if not tagDrops:
+        tagDrops = {}
+    else:
+        tagDrops = json.loads(tagDrops)
+    textHtml = TextHtmlParser(tagDrops)
     textHtml.feed(got.contentData)
     # Send back to client
     ph.send_response(got.httpStatus)
@@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
         searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
         delete(obj.words)
         obj['url'] = searchUrl
-        let headers = { 'Search-Drops': get_gme().tools.searchDrops }
+        let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
         return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
     }
 }
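For reference, a client outside the bundled UI could drive the same path by sending the tag drops as a JSON string in the urltext-tag-drops header of a urltext fetch through the proxy. The sketch below is hypothetical; the proxy address, the urltext query format and the payload shape are assumptions, not taken from the repo:

# Hypothetical client-side illustration (assumed proxy address and query format).
import json
import urllib.request

tagDrops = [ {"tag": "div", "id": "sidebar"}, {"tag": "aside", "id": "related"} ]
req = urllib.request.Request(
    "http://127.0.0.1:3128/urltext?url=https://example.com/article",   # assumed endpoint
    headers={ "urltext-tag-drops": json.dumps(tagDrops) },
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode())

On the python side, handle_urltext simply json.loads whatever arrives in the header, so the UI's searchDrops setting and any hand-rolled client only need to agree on the entry shape that the (still skeletal) TextHtmlParser drop logic expects.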