From 06fd41a88eeae7de26e7bb60d816d5a648de5924 Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Mon, 3 Nov 2025 20:59:18 +0530 Subject: [PATCH] SimpleChatTC:WebTools: urltext-tag-drops python side - skel Rename search-drops to urltext-tag-drops, to indicate its more generic semantics. Rather, the search drops specified in the UI by the user will be mapped to the urltext-tag-drops header entry of a urltext web fetch request. Implement a crude urltext-tag-drops logic in TextHtmlParser. If there is any mismatch between opening and closing tags in the html being parsed, and in turn wrt the type of tag being targeted for dropping, things can mess up. --- .../public_simplechat/local.tools/webmagic.py | 30 ++++++++++++++++--- tools/server/public_simplechat/toolweb.mjs | 2 +- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index 9d910a02a5..d371fa736e 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -8,6 +8,7 @@ from dataclasses import dataclass import html.parser import debug import filemagic as mFile +import json from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser): This helps return a relatively clean textual representation of the html file/content being parsed. """ - def __init__(self): + def __init__(self, tagDrops: dict): super().__init__() + self.tagDrops = tagDrops self.inside = { 'body': False, 'script': False, 'style': False, 'header': False, 'footer': False, - 'nav': False + 'nav': False, } self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ] self.bCapture = False self.text = "" self.textStripped = "" + self.droptagType = None + self.droptagCount = 0 def do_capture(self): """ Helps decide whether to capture contents or discard them. 
""" - if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']): + if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)): return True return False def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): if tag in self.monitored: self.inside[tag] = True + for tagMeta in self.tagDrops: + if tag != tagMeta.tag: + continue + for attr in attrs: + if attr[0] != 'id': + continue + if attr[1] == tagMeta.id: + self.droptagCount += 1 + self.droptagType = tag def handle_endtag(self, tag: str): if tag in self.monitored: self.inside[tag] = False + if tag == self.droptagType: + self.droptagCount -= 1 + if self.droptagCount < 0: + self.droptagCount = 0 def handle_data(self, data: str): if self.do_capture(): @@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): ph.send_error(got.httpStatus, got.httpStatusMsg) return # Extract Text - textHtml = TextHtmlParser() + tagDrops = ph.headers.get('urltext-tag-drops') + if not tagDrops: + tagDrops = {} + else: + tagDrops = json.loads(tagDrops) + textHtml = TextHtmlParser(tagDrops) textHtml.feed(got.contentData) # Send back to client ph.send_response(got.httpStatus) diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index e473e250dd..f52aca4357 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) { searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words)); delete(obj.words) obj['url'] = searchUrl - let headers = { 'Search-Drops': get_gme().tools.searchDrops } + let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops } return proxyserver_get_anyargs(chatid, toolcallid, 
toolname, obj, 'urltext', headers); } }