From 9f5c3d77765a04a512b4486a0980443d89b76168 Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Fri, 7 Nov 2025 16:03:00 +0530 Subject: [PATCH] SimpleChatTC:XmlFiltered: Use re with hierarchy of tags to filter Rename xmltext to xmlfiltered. This simplifies the filtering related logic as well as gives more fine grained flexibility wrt filtering because of re. --- .../local.tools/simpleproxy.py | 6 +- .../public_simplechat/local.tools/webmagic.py | 61 ++++++++++--------- tools/server/public_simplechat/readme.md | 16 ++++- tools/server/public_simplechat/toolweb.mjs | 47 ++++++++------ 4 files changed, 76 insertions(+), 54 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index 8348373e40..d36a2443f0 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -49,7 +49,7 @@ gConfigType = { gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ] gAllowedCalls = { - "xmltext": [], + "xmlfiltered": [], "urltext": [], "urlraw": [], "pdftext": [ "pypdf" ] @@ -140,8 +140,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler): self.auth_and_run(pr, mWeb.handle_urlraw) case '/urltext': self.auth_and_run(pr, mWeb.handle_urltext) - case '/xmltext': - self.auth_and_run(pr, mWeb.handle_xmltext) + case '/xmlfiltered': + self.auth_and_run(pr, mWeb.handle_xmlfiltered) case '/pdftext': self.auth_and_run(pr, mPdf.handle_pdftext) case '/aum': diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index 588e562359..aaf7d5f332 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -8,6 +8,7 @@ import html.parser import debug import filemagic as mFile import json +import re from typing import TYPE_CHECKING, Any, cast if TYPE_CHECKING: @@ -218,25 
+219,30 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): ph.send_error(502, f"WARN:UrlTextFailed:{exc}") -class TextXMLParser(html.parser.HTMLParser): +class XMLFilterParser(html.parser.HTMLParser): """ A simple minded logic used to strip xml content of - * unwanted tags and their contents. - * this works properly only if the xml being processed has proper opening and ending tags - around the area of interest. + * unwanted tags and their contents, using re + * this works properly only if the xml being processed has + proper opening and ending tags around the area of interest. This can help return a cleaned up xml file. """ - def __init__(self, tagDrops: list[str]): + def __init__(self, tagDropREs: list[str]): + """ + tagDropREs - allows one to specify a list of tags related REs, + to help drop the corresponding tags and their contents fully. + + To drop a tag, specify a regular expression + * that matches the corresponding hierarchy of tags involved + * where the tag names should be in lower case and suffixed with : + * if interested in dropping a tag independent of where it appears use + ".*:tagname:.*" re template + """ super().__init__() - self.tagDrops = list(map(str.lower, tagDrops)) - print(f"DBUG:TextXMLParser:{self.tagDrops}") - self.insideTagDrops = { - } - for tag in tagDrops: - self.insideTagDrops[tag] = False - self.bCapture = False + self.tagDropREs = list(map(str.lower, tagDropREs)) + print(f"DBUG:XMLFilterParser:{self.tagDropREs}") self.text = "" self.prefixTags = [] self.prefix = "" @@ -246,8 +252,9 @@ class TextXMLParser(html.parser.HTMLParser): """ Helps decide whether to capture contents or discard them. 
""" - for tag in self.tagDrops: - if self.insideTagDrops[tag]: + curTagH = f'{":".join(self.prefixTags)}:' + for dropRE in self.tagDropREs: + if re.match(dropRE, curTagH): return False return True @@ -256,8 +263,6 @@ class TextXMLParser(html.parser.HTMLParser): self.prefixTags.append(tag) self.prefix += "\t" self.text += f"\n{self.prefix}<{tag}>" - if tag in self.tagDrops: - self.insideTagDrops[tag] = True def handle_endtag(self, tag: str): if (self.lastTrackedCB == "endtag"): @@ -267,36 +272,34 @@ class TextXMLParser(html.parser.HTMLParser): self.lastTrackedCB = "endtag" self.prefixTags.pop() self.prefix = self.prefix[:-1] - if tag in self.tagDrops: - self.insideTagDrops[tag] = False def handle_data(self, data: str): if self.do_capture(): self.text += f"{data}" -def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): +def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): try: # Get requested url - got = handle_urlreq(ph, pr, "HandleXMLText") + got = handle_urlreq(ph, pr, "HandleXMLFiltered") if not got.callOk: ph.send_error(got.httpStatus, got.httpStatusMsg) return # Extract Text - tagDrops = ph.headers.get('xmltext-tag-drops') - if not tagDrops: - tagDrops = [] + tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res') + if not tagDropREs: + tagDropREs = [] else: - tagDrops = cast(list[str], json.loads(tagDrops)) - textXML = TextXMLParser(tagDrops) - textXML.feed(got.contentData) + tagDropREs = cast(list[str], json.loads(tagDropREs)) + xmlFiltered = XMLFilterParser(tagDropREs) + xmlFiltered.feed(got.contentData) # Send back to client ph.send_response(got.httpStatus) ph.send_header('Content-Type', got.contentType) # Add CORS for browser fetch, just in case ph.send_header('Access-Control-Allow-Origin', '*') ph.end_headers() - ph.wfile.write(textXML.text.encode('utf-8')) - debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text }) + ph.wfile.write(xmlFiltered.text.encode('utf-8')) + debug.dump({ 
'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text }) except Exception as exc: - ph.send_error(502, f"WARN:XMLTextFailed:{exc}") + ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}") diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 2936102fdb..8bfd287701 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -463,9 +463,15 @@ plain textual content from the search result page. * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * this depends on the pypdf python based open source library -* fetch_xml_as_text - fetch/read specified xml file and extract its textual content - * prefixes the tag heirarchy with each leaf content - * allows one to specify a list of tags that are to be dropped fully. +* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags + * allows one to specify a list of tags related REs, to help drop the corresponding tags and their contents fully. + * to drop a tag, specify a regular expression + * that matches the corresponding hierarchy of tags involved + * where the tag names should be in lower case and suffixed with : + * if interested in dropping a tag independent of where it appears use + * .*:tagname:.* + * rather the tool call meta data passed to ai model explains the same and provides a sample. the above set of web related tool calls work by handshaking with a bundled simple local web proxy (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to @@ -656,6 +662,10 @@ sliding window based drop off or even before they kick in, this can help in many * add support for fetch_xml_as_text tool call, fix importmaps in index.html +* renamed and updated logic wrt xml fetching to be fetch_xml_filtered. 
allow one to use re to identify + the tags to be filtered in a fine grained manner including filtering based on tag hierarchy + + #### ToDo Is the tool call promise land trap deep enough, need to think through and explore around this once later. diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index 7dbbe51a05..c871cdbd0e 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -330,16 +330,22 @@ async function fetchpdftext_setup(tcs) { // -// Fetch XML Text +// Fetch XML Filtered // -let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ] +let gRSSTagDropsDefault = [ + "^rss:channel:item:guid:.*", + "^rss:channel:item:link:.*", + "^rss:channel:item:description:.*", + ".*:image:.*", + ".*:enclosure:.*" ]; -let fetchxmltext_meta = { +let fetchxmlfiltered_meta = { "type": "function", "function": { - "name": "fetch_xml_as_text", + "name": "fetch_xml_filtered", "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds", "parameters": { "type": "object", @@ -348,9 +354,12 @@ let fetchxmltext_meta = { "type":"string", "description":"url of the xml file that will be fetched" }, - "tagDrops":{ + "tagDropREs":{ "type":"string", - "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` + "description":`Optionally specify a json stringified list of xml tag heirarchies to drop. + For each tag that needs to be dropped, one needs to specify regular expression of the heirarchy of tags involved, + where the tag names are always mentioned in lower case along with a : as suffix. 
+ For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` } }, "required": ["url"] @@ -360,7 +369,7 @@ let fetchxmltext_meta = { /** - * Implementation of the fetch xml as text logic. + * Implementation of the fetch xml filtered logic. * Expects simpleproxy to be running at specified url and providing xmltext service * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * @param {string} chatid @@ -368,25 +377,25 @@ let fetchxmltext_meta = { * @param {string} toolname * @param {any} obj */ -function fetchxmltext_run(chatid, toolcallid, toolname, obj) { - let tagDrops = obj.tagDrops - if (tagDrops == undefined) { - tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) +function fetchxmlfiltered_run(chatid, toolcallid, toolname, obj) { + let tagDropREs = obj.tagDropREs + if (tagDropREs == undefined) { + tagDropREs = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) } - let headers = { 'xmltext-tag-drops': tagDrops } - return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); + let headers = { 'xmlfiltered-tagdrop-res': tagDropREs } + return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmlfiltered', headers); } /** - * Setup fetch_xml_as_text for tool calling + * Setup fetch_xml_filtered for tool calling * NOTE: Currently the logic is setup for the bundled simpleproxy.py * @param {Object>} tcs */ -async function fetchxmltext_setup(tcs) { - return proxyserver_tc_setup('FetchXmlAsText', 'xmltext', 'fetch_xml_as_text', { - "handler": fetchxmltext_run, - "meta": fetchxmltext_meta, +async function fetchxmlfiltered_setup(tcs) { + return proxyserver_tc_setup('FetchXmlFiltered', 'xmlfiltered', 'fetch_xml_filtered', { + "handler": fetchxmlfiltered_run, + "meta": fetchxmlfiltered_meta, "result": "" }, tcs); } @@ -412,6 +421,6 @@ export async function init(me) { await fetchweburltext_setup(tc_switch) await searchwebtext_setup(tc_switch) await 
fetchpdftext_setup(tc_switch) - await fetchxmltext_setup(tc_switch) + await fetchxmlfiltered_setup(tc_switch) return tc_switch }