SimpleChatTC:XmlFiltered: Use re with hierarchy of tags to filter

Rename xmltext to xmlfiltered.

This simplifies the filtering related logic as well as gives more
fine-grained flexibility with regard to filtering because of re.
This commit is contained in:
hanishkvc 2025-11-07 16:03:00 +05:30
parent 9ed1cf9886
commit 9f5c3d7776
4 changed files with 76 additions and 54 deletions

View File

@ -49,7 +49,7 @@ gConfigType = {
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ] gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
gAllowedCalls = { gAllowedCalls = {
"xmltext": [], "xmlfiltered": [],
"urltext": [], "urltext": [],
"urlraw": [], "urlraw": [],
"pdftext": [ "pypdf" ] "pdftext": [ "pypdf" ]
@ -140,8 +140,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
self.auth_and_run(pr, mWeb.handle_urlraw) self.auth_and_run(pr, mWeb.handle_urlraw)
case '/urltext': case '/urltext':
self.auth_and_run(pr, mWeb.handle_urltext) self.auth_and_run(pr, mWeb.handle_urltext)
case '/xmltext': case '/xmlfiltered':
self.auth_and_run(pr, mWeb.handle_xmltext) self.auth_and_run(pr, mWeb.handle_xmlfiltered)
case '/pdftext': case '/pdftext':
self.auth_and_run(pr, mPdf.handle_pdftext) self.auth_and_run(pr, mPdf.handle_pdftext)
case '/aum': case '/aum':

View File

@ -8,6 +8,7 @@ import html.parser
import debug import debug
import filemagic as mFile import filemagic as mFile
import json import json
import re
from typing import TYPE_CHECKING, Any, cast from typing import TYPE_CHECKING, Any, cast
if TYPE_CHECKING: if TYPE_CHECKING:
@ -218,25 +219,30 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlTextFailed:{exc}") ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
class TextXMLParser(html.parser.HTMLParser): class XMLFilterParser(html.parser.HTMLParser):
""" """
A simple minded logic used to strip xml content of A simple minded logic used to strip xml content of
* unwanted tags and their contents. * unwanted tags and their contents, using re
* this works properly only if the xml being processed has proper opening and ending tags * this works properly only if the xml being processed has
around the area of interest. proper opening and ending tags around the area of interest.
This can help return a cleaned up xml file. This can help return a cleaned up xml file.
""" """
def __init__(self, tagDrops: list[str]): def __init__(self, tagDropREs: list[str]):
"""
tagDropREs - allows one to specify a list of tags related REs,
to help drop the corresponding tags and their contents fully.
To drop a tag, specify regular expression
* that matches the corresponding hierarchy of tags involved
* where the tag names should be in lower case and suffixed with :
* if interested in dropping a tag independent of where it appears use
".*:tagname:.*" re template
"""
super().__init__() super().__init__()
self.tagDrops = list(map(str.lower, tagDrops)) self.tagDropREs = list(map(str.lower, tagDropREs))
print(f"DBUG:TextXMLParser:{self.tagDrops}") print(f"DBUG:XMLFilterParser:{self.tagDropREs}")
self.insideTagDrops = {
}
for tag in tagDrops:
self.insideTagDrops[tag] = False
self.bCapture = False
self.text = "" self.text = ""
self.prefixTags = [] self.prefixTags = []
self.prefix = "" self.prefix = ""
@ -246,8 +252,9 @@ class TextXMLParser(html.parser.HTMLParser):
""" """
Helps decide whether to capture contents or discard them. Helps decide whether to capture contents or discard them.
""" """
for tag in self.tagDrops: curTagH = f'{":".join(self.prefixTags)}:'
if self.insideTagDrops[tag]: for dropRE in self.tagDropREs:
if re.match(dropRE, curTagH):
return False return False
return True return True
@ -256,8 +263,6 @@ class TextXMLParser(html.parser.HTMLParser):
self.prefixTags.append(tag) self.prefixTags.append(tag)
self.prefix += "\t" self.prefix += "\t"
self.text += f"\n{self.prefix}<{tag}>" self.text += f"\n{self.prefix}<{tag}>"
if tag in self.tagDrops:
self.insideTagDrops[tag] = True
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
if (self.lastTrackedCB == "endtag"): if (self.lastTrackedCB == "endtag"):
@ -267,36 +272,34 @@ class TextXMLParser(html.parser.HTMLParser):
self.lastTrackedCB = "endtag" self.lastTrackedCB = "endtag"
self.prefixTags.pop() self.prefixTags.pop()
self.prefix = self.prefix[:-1] self.prefix = self.prefix[:-1]
if tag in self.tagDrops:
self.insideTagDrops[tag] = False
def handle_data(self, data: str): def handle_data(self, data: str):
if self.do_capture(): if self.do_capture():
self.text += f"{data}" self.text += f"{data}"
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
try: try:
# Get requested url # Get requested url
got = handle_urlreq(ph, pr, "HandleXMLText") got = handle_urlreq(ph, pr, "HandleXMLFiltered")
if not got.callOk: if not got.callOk:
ph.send_error(got.httpStatus, got.httpStatusMsg) ph.send_error(got.httpStatus, got.httpStatusMsg)
return return
# Extract Text # Extract Text
tagDrops = ph.headers.get('xmltext-tag-drops') tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res')
if not tagDrops: if not tagDropREs:
tagDrops = [] tagDropREs = []
else: else:
tagDrops = cast(list[str], json.loads(tagDrops)) tagDropREs = cast(list[str], json.loads(tagDropREs))
textXML = TextXMLParser(tagDrops) xmlFiltered = XMLFilterParser(tagDropREs)
textXML.feed(got.contentData) xmlFiltered.feed(got.contentData)
# Send back to client # Send back to client
ph.send_response(got.httpStatus) ph.send_response(got.httpStatus)
ph.send_header('Content-Type', got.contentType) ph.send_header('Content-Type', got.contentType)
# Add CORS for browser fetch, just in case # Add CORS for browser fetch, just in case
ph.send_header('Access-Control-Allow-Origin', '*') ph.send_header('Access-Control-Allow-Origin', '*')
ph.end_headers() ph.end_headers()
ph.wfile.write(textXML.text.encode('utf-8')) ph.wfile.write(xmlFiltered.text.encode('utf-8'))
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text }) debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text })
except Exception as exc: except Exception as exc:
ph.send_error(502, f"WARN:XMLTextFailed:{exc}") ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}")

View File

@ -463,9 +463,15 @@ plain textual content from the search result page.
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
* this depends on the pypdf python based open source library * this depends on the pypdf python based open source library
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content * fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
* prefixes the tag heirarchy with each leaf content * allows one to specify a list of tags related REs,
* allows one to specify a list of tags that are to be dropped fully. to help drop the corresponding tags and their contents fully.
* to drop a tag, specify regular expression
* that matches the corresponding hierarchy of tags involved
* where the tag names should be in lower case and suffixed with :
* if interested in dropping a tag independent of where it appears use
* .*:tagname:.*
* rather the tool call meta data passed to ai model explains the same and provides a sample.
the above set of web related tool calls work by handshaking with a bundled simple local web proxy the above set of web related tool calls work by handshaking with a bundled simple local web proxy
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
@ -656,6 +662,10 @@ sliding window based drop off or even before they kick in, this can help in many
* add support for fetch_xml_as_text tool call, fix importmaps in index.html * add support for fetch_xml_as_text tool call, fix importmaps in index.html
* renamed and updated logic wrt xml fetching to be fetch_xml_filtered. allow one to use re to identify
the tags to be filtered in a fine-grained manner, including filtering based on tag hierarchy
#### ToDo #### ToDo
Is the tool call promise land trap deep enough, need to think through and explore around this once later. Is the tool call promise land trap deep enough, need to think through and explore around this once later.

View File

@ -330,16 +330,22 @@ async function fetchpdftext_setup(tcs) {
// //
// Fetch XML Text // Fetch XML Filtered
// //
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ] let gRSSTagDropsDefault = [
"^rss:channel:item:guid:.*",
"^rss:channel:item:link:.*",
"^rss:channel:item:description:.*",
".*:image:.*",
".*:enclosure:.*"
];
let fetchxmltext_meta = { let fetchxmlfiltered_meta = {
"type": "function", "type": "function",
"function": { "function": {
"name": "fetch_xml_as_text", "name": "fetch_xml_filtered",
"description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds", "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
"parameters": { "parameters": {
"type": "object", "type": "object",
@ -348,9 +354,12 @@ let fetchxmltext_meta = {
"type":"string", "type":"string",
"description":"url of the xml file that will be fetched" "description":"url of the xml file that will be fetched"
}, },
"tagDrops":{ "tagDropREs":{
"type":"string", "type":"string",
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` "description":`Optionally specify a json stringified list of xml tag heirarchies to drop.
For each tag that needs to be dropped, one needs to specify a regular expression of the hierarchy of tags involved,
where the tag names are always mentioned in lower case along with a : as suffix.
For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
} }
}, },
"required": ["url"] "required": ["url"]
@ -360,7 +369,7 @@ let fetchxmltext_meta = {
/** /**
* Implementation of the fetch xml as text logic. * Implementation of the fetch xml filtered logic.
* Expects simpleproxy to be running at specified url and providing xmltext service * Expects simpleproxy to be running at specified url and providing xmltext service
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
* @param {string} chatid * @param {string} chatid
@ -368,25 +377,25 @@ let fetchxmltext_meta = {
* @param {string} toolname * @param {string} toolname
* @param {any} obj * @param {any} obj
*/ */
function fetchxmltext_run(chatid, toolcallid, toolname, obj) { function fetchxmlfiltered_run(chatid, toolcallid, toolname, obj) {
let tagDrops = obj.tagDrops let tagDropREs = obj.tagDropREs
if (tagDrops == undefined) { if (tagDropREs == undefined) {
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) tagDropREs = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
} }
let headers = { 'xmltext-tag-drops': tagDrops } let headers = { 'xmlfiltered-tagdrop-res': tagDropREs }
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmlfiltered', headers);
} }
/** /**
* Setup fetch_xml_as_text for tool calling * Setup fetch_xml_filtered for tool calling
* NOTE: Currently the logic is setup for the bundled simpleproxy.py * NOTE: Currently the logic is setup for the bundled simpleproxy.py
* @param {Object<string, Object<string, any>>} tcs * @param {Object<string, Object<string, any>>} tcs
*/ */
async function fetchxmltext_setup(tcs) { async function fetchxmlfiltered_setup(tcs) {
return proxyserver_tc_setup('FetchXmlAsText', 'xmltext', 'fetch_xml_as_text', { return proxyserver_tc_setup('FetchXmlFiltered', 'xmlfiltered', 'fetch_xml_filtered', {
"handler": fetchxmltext_run, "handler": fetchxmlfiltered_run,
"meta": fetchxmltext_meta, "meta": fetchxmlfiltered_meta,
"result": "" "result": ""
}, tcs); }, tcs);
} }
@ -412,6 +421,6 @@ export async function init(me) {
await fetchweburltext_setup(tc_switch) await fetchweburltext_setup(tc_switch)
await searchwebtext_setup(tc_switch) await searchwebtext_setup(tc_switch)
await fetchpdftext_setup(tc_switch) await fetchpdftext_setup(tc_switch)
await fetchxmltext_setup(tc_switch) await fetchxmlfiltered_setup(tc_switch)
return tc_switch return tc_switch
} }