SimpleChatTC:XMLFiltered: Retain xml tags with selective dropping

Instead of prefixing each piece of content with its tag hierarchy, retain the xml structure, while at the same time allowing unwanted tags and their contents to be dropped.
2025-11-07 15:22:10 +05:30 · 2025-11-07 15:22:10 +05:30 · 9ed1cf9886
parent b8bb258dd5
commit 9ed1cf9886
2 changed files with 18 additions and 9 deletions
--- a/tools/server/public_simplechat/local.tools/webmagic.py
+++ b/tools/server/public_simplechat/local.tools/webmagic.py
@ -221,13 +221,11 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
 class TextXMLParser(html.parser.HTMLParser):
    """
    A simple minded logic used to strip xml content of
-    * all the xml tags as well as
-    * all the contents belonging to below predefined tags like guid, enclosure, ...
-
+    * unwanted tags and their contents.
    * this works properly only if the xml being processed has proper opening and ending tags
    around the area of interest.

-    This helps return a relatively clean textual representation of the xml file/content being parsed.
+    This can help return a cleaned up xml file.
    """

    def __init__(self, tagDrops: list[str]):
@ -240,7 +238,9 @@ class TextXMLParser(html.parser.HTMLParser):
            self.insideTagDrops[tag] = False
        self.bCapture = False
        self.text = ""
-        self.prefix = []
+        self.prefixTags = []
+        self.prefix = ""
+        self.lastTrackedCB = ""

    def do_capture(self):
        """
@ -252,18 +252,27 @@ class TextXMLParser(html.parser.HTMLParser):
        return True

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        self.prefix.append(tag)
+        self.lastTrackedCB = "starttag"
+        self.prefixTags.append(tag)
+        self.prefix += "\t"
+        self.text += f"\n{self.prefix}<{tag}>"
        if tag in self.tagDrops:
            self.insideTagDrops[tag] = True

    def handle_endtag(self, tag: str):
-        self.prefix.pop()
+        if (self.lastTrackedCB == "endtag"):
+            self.text += f"\n{self.prefix}</{tag}>"
+        else:
+            self.text += f"</{tag}>"
+        self.lastTrackedCB = "endtag"
+        self.prefixTags.pop()
+        self.prefix = self.prefix[:-1]
        if tag in self.tagDrops:
            self.insideTagDrops[tag] = False

    def handle_data(self, data: str):
        if self.do_capture():
-            self.text += f"{':'.join(self.prefix)}:{data}\n"
+            self.text += f"{data}"


 def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
--- a/tools/server/public_simplechat/toolweb.mjs
+++ b/tools/server/public_simplechat/toolweb.mjs
@ -340,7 +340,7 @@ let fetchxmltext_meta = {
        "type": "function",
        "function": {
            "name": "fetch_xml_as_text",
-            "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
+            "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
            "parameters": {
                "type": "object",
                "properties": {