SimpleChatTC:WebTools: urltext-tag-drops python side - skel

Rename search-drops to urltext-tag-drops, to indicate its more
generic semantics. The search drops specified in the UI by the user
are now mapped to the urltext-tag-drops header entry of a urltext
web fetch request.

Implement crude urltext-tag-drops logic in TextHtmlParser.
If the opening and closing tags in the html being parsed are
mismatched, in particular for the type of tag being targeted for
dropping, the drop counter can get out of sync, so text may be
wrongly dropped or wrongly captured.
This commit is contained in:
hanishkvc 2025-11-03 20:59:18 +05:30
parent f75bdb0e00
commit 06fd41a88e
2 changed files with 27 additions and 5 deletions

View File

@ -8,6 +8,7 @@ from dataclasses import dataclass
import html.parser import html.parser
import debug import debug
import filemagic as mFile import filemagic as mFile
import json
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
if TYPE_CHECKING: if TYPE_CHECKING:
@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser):
This helps return a relatively clean textual representation of the html file/content being parsed. This helps return a relatively clean textual representation of the html file/content being parsed.
""" """
def __init__(self): def __init__(self, tagDrops: dict):
super().__init__() super().__init__()
self.tagDrops = tagDrops
self.inside = { self.inside = {
'body': False, 'body': False,
'script': False, 'script': False,
'style': False, 'style': False,
'header': False, 'header': False,
'footer': False, 'footer': False,
'nav': False 'nav': False,
} }
self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ] self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
self.bCapture = False self.bCapture = False
self.text = "" self.text = ""
self.textStripped = "" self.textStripped = ""
self.droptagType = None
self.droptagCount = 0
def do_capture(self): def do_capture(self):
""" """
Helps decide whether to capture contents or discard them. Helps decide whether to capture contents or discard them.
""" """
if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']): if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)):
return True return True
return False return False
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
if tag in self.monitored: if tag in self.monitored:
self.inside[tag] = True self.inside[tag] = True
for tagMeta in self.tagDrops:
if tag != tagMeta.tag:
continue
for attr in attrs:
if attr[0] != 'id':
continue
if attr[1] == tagMeta.id:
self.droptagCount += 1
self.droptagType = tag
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
if tag in self.monitored: if tag in self.monitored:
self.inside[tag] = False self.inside[tag] = False
if tag == self.droptagType:
self.droptagCount -= 1
if self.droptagCount < 0:
self.droptagCount = 0
def handle_data(self, data: str): def handle_data(self, data: str):
if self.do_capture(): if self.do_capture():
@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
ph.send_error(got.httpStatus, got.httpStatusMsg) ph.send_error(got.httpStatus, got.httpStatusMsg)
return return
# Extract Text # Extract Text
textHtml = TextHtmlParser() tagDrops = ph.headers.get('urltext-tag-drops')
if not tagDrops:
tagDrops = {}
else:
tagDrops = json.loads(tagDrops)
textHtml = TextHtmlParser(tagDrops)
textHtml.feed(got.contentData) textHtml.feed(got.contentData)
# Send back to client # Send back to client
ph.send_response(got.httpStatus) ph.send_response(got.httpStatus)

View File

@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words)); searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
delete(obj.words) delete(obj.words)
obj['url'] = searchUrl obj['url'] = searchUrl
let headers = { 'Search-Drops': get_gme().tools.searchDrops } let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers); return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
} }
} }