SimpleChatTC:XmlText: Cleanup initial go

At simpleproxy end * Add the tag names hierarchy before the contents of a tag * Remember to convert the tagDrops to lower case, as the HTMLParser base class seems to do that to tag names by default. At the client ui end * If undefined, remember to pass an empty list wrt tagDrops. * Cleanup the func description and also mention possible tagDrops for RSS feeds in the tool meta
2025-11-07 03:53:34 +05:30 · 2025-11-07 03:53:34 +05:30 · b8bb258dd5
parent fbe9b2369f
commit b8bb258dd5
3 changed files with 19 additions and 8 deletions
--- a/tools/server/public_simplechat/local.tools/webmagic.py
+++ b/tools/server/public_simplechat/local.tools/webmagic.py
@ -232,7 +232,7 @@ class TextXMLParser(html.parser.HTMLParser):
    def __init__(self, tagDrops: list[str]):
        super().__init__()
-        self.tagDrops = tagDrops
+        self.tagDrops = list(map(str.lower, tagDrops))
        print(f"DBUG:TextXMLParser:{self.tagDrops}")
        self.insideTagDrops = {
        }
@ -240,7 +240,7 @@ class TextXMLParser(html.parser.HTMLParser):
            self.insideTagDrops[tag] = False
        self.bCapture = False
        self.text = ""
-        self.prefix = ""
+        self.prefix = []
    def do_capture(self):
        """
@ -252,18 +252,18 @@ class TextXMLParser(html.parser.HTMLParser):
        return True
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        self.prefix += " "
+        self.prefix.append(tag)
        if tag in self.tagDrops:
            self.insideTagDrops[tag] = True
    def handle_endtag(self, tag: str):
-        self.prefix = self.prefix[:-1]
+        self.prefix.pop()
        if tag in self.tagDrops:
            self.insideTagDrops[tag] = False
    def handle_data(self, data: str):
        if self.do_capture():
-            self.text += f"{self.prefix}{data}\n"
+            self.text += f"{':'.join(self.prefix)}:{data}\n"
 def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
--- a/tools/server/public_simplechat/readme.md
+++ b/tools/server/public_simplechat/readme.md
@ -463,6 +463,10 @@ plain textual content from the search result page.
 * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
  * this depends on the pypdf python based open source library
 * fetch_xml_as_text - fetch/read specified xml file and extract its textual content
  * prefixes each leaf content with its tag hierarchy
  * allows one to specify a list of tags that are to be dropped fully.
 the above set of web related tool calls work by handshaking with a bundled simple local web proxy
 (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
 directly fetch from the browser js runtime environment.
@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
  or if there is no response within the configured timeout period.
  NOTE: Currently the logic supports only 1 pending tool call per chat session.
 * add support for fetch_xml_as_text tool call, fix importmaps in index.html
 #### ToDo
--- a/tools/server/public_simplechat/toolweb.mjs
+++ b/tools/server/public_simplechat/toolweb.mjs
@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
 //
 let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
 let fetchxmltext_meta = {
        "type": "function",
        "function": {
            "name": "fetch_xml_as_text",
-            "description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds",
+            "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
            "parameters": {
                "type": "object",
                "properties": {
@ -348,7 +350,7 @@ let fetchxmltext_meta = {
                    },
                    "tagDrops":{
                        "type":"string",
-                        "description":"specify a json stringified form of list of xml tags to drop"
+                        "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
                    }
                },
                "required": ["url"]
@ -367,7 +369,11 @@ let fetchxmltext_meta = {
 * @param {any} obj
 */
 function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
-    let headers = { 'xmltext-tag-drops': obj.tagDrops }
+    let tagDrops = obj.tagDrops
    if (tagDrops == undefined) {
        tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
    }
    let headers = { 'xmltext-tag-drops': tagDrops }
    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
 }