SimpleChatTC:WebTools: urltext-tag-drops python side - skel

Rename search-drops to urltext-tag-drops, to indicate its more generic semantics. The search drops specified by the user in the UI are now mapped to the urltext-tag-drops header entry of a urltext web fetch request. Implement a crude urltext-tag-drops logic in TextHtmlParser. If the opening and closing tags in the html being parsed are mismatched, and in turn those of the tag type being targeted for dropping, things can mess up.
2025-11-03 20:59:18 +05:30 · 2025-11-03 20:59:18 +05:30 · 06fd41a88e
parent f75bdb0e00
commit 06fd41a88e
2 changed files with 27 additions and 5 deletions
--- a/tools/server/public_simplechat/local.tools/webmagic.py
+++ b/tools/server/public_simplechat/local.tools/webmagic.py
@ -8,6 +8,7 @@ from dataclasses import dataclass
 import html.parser
 import debug
 import filemagic as mFile
 import json
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@ -95,36 +96,52 @@ class TextHtmlParser(html.parser.HTMLParser):
    This helps return a relatively clean textual representation of the html file/content being parsed.
    """
-    def __init__(self):
+    def __init__(self, tagDrops: dict):
        super().__init__()
        self.tagDrops = tagDrops
        self.inside = {
            'body': False,
            'script': False,
            'style': False,
            'header': False,
            'footer': False,
-            'nav': False
+            'nav': False,
        }
        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
        self.bCapture = False
        self.text = ""
        self.textStripped = ""
        self.droptagType = None
        self.droptagCount = 0
    def do_capture(self):
        """
        Helps decide whether to capture contents or discard them.
        """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
+        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)):
            return True
        return False
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
        if tag in self.monitored:
            self.inside[tag] = True
        for tagMeta in self.tagDrops:
            if tag != tagMeta.tag:
                continue
            for attr in attrs:
                if attr[0] != 'id':
                    continue
                if attr[1] == tagMeta.id:
                    self.droptagCount += 1
                    self.droptagType = tag
    def handle_endtag(self, tag: str):
        """
        Track exit from monitored tags and from the tag type being dropped.

        tag: name of the tag being closed.

        Closing a tag of the type currently recorded in self.droptagType
        reduces the drop nesting count by one, never letting it go below zero.
        """
        if tag in self.monitored:
            self.inside[tag] = False
        if tag != self.droptagType:
            return
        # Clamp at zero so that stray close tags cant drive the count negative
        self.droptagCount = max(self.droptagCount - 1, 0)
    def handle_data(self, data: str):
        if self.do_capture():
@ -167,7 +184,12 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Extract Text
-        textHtml = TextHtmlParser()
+        tagDrops = ph.headers.get('urltext-tag-drops')
        if not tagDrops:
            tagDrops = {}
        else:
            tagDrops = json.loads(tagDrops)
        textHtml = TextHtmlParser(tagDrops)
        textHtml.feed(got.contentData)
        # Send back to client
        ph.send_response(got.httpStatus)
--- a/tools/server/public_simplechat/toolweb.mjs
+++ b/tools/server/public_simplechat/toolweb.mjs
@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
        searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
        delete(obj.words)
        obj['url'] = searchUrl
-        let headers = { 'Search-Drops': get_gme().tools.searchDrops }
+        let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops }
        return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
    }
 }