SimpleChatTC:XmlFiltered: Use re with hierarchy of tags to filter

Rename xmltext to xmlfiltered.

This simplifies the filtering related logic as well as gives more
fine-grained flexibility with regard to filtering because of re.
This commit is contained in:
hanishkvc 2025-11-07 16:03:00 +05:30
parent 9ed1cf9886
commit 9f5c3d7776
4 changed files with 76 additions and 54 deletions

View File

@ -49,7 +49,7 @@ gConfigType = {
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ] gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
gAllowedCalls = { gAllowedCalls = {
"xmltext": [], "xmlfiltered": [],
"urltext": [], "urltext": [],
"urlraw": [], "urlraw": [],
"pdftext": [ "pypdf" ] "pdftext": [ "pypdf" ]
@ -140,8 +140,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
self.auth_and_run(pr, mWeb.handle_urlraw) self.auth_and_run(pr, mWeb.handle_urlraw)
case '/urltext': case '/urltext':
self.auth_and_run(pr, mWeb.handle_urltext) self.auth_and_run(pr, mWeb.handle_urltext)
case '/xmltext': case '/xmlfiltered':
self.auth_and_run(pr, mWeb.handle_xmltext) self.auth_and_run(pr, mWeb.handle_xmlfiltered)
case '/pdftext': case '/pdftext':
self.auth_and_run(pr, mPdf.handle_pdftext) self.auth_and_run(pr, mPdf.handle_pdftext)
case '/aum': case '/aum':

View File

@ -8,6 +8,7 @@ import html.parser
import debug import debug
import filemagic as mFile import filemagic as mFile
import json import json
import re
from typing import TYPE_CHECKING, Any, cast from typing import TYPE_CHECKING, Any, cast
if TYPE_CHECKING: if TYPE_CHECKING:
@ -218,25 +219,30 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlTextFailed:{exc}") ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
class TextXMLParser(html.parser.HTMLParser): class XMLFilterParser(html.parser.HTMLParser):
""" """
A simple minded logic used to strip xml content of A simple minded logic used to strip xml content of
* unwanted tags and their contents. * unwanted tags and their contents, using re
* this works properly only if the xml being processed has proper opening and ending tags * this works properly only if the xml being processed has
around the area of interest. proper opening and ending tags around the area of interest.
This can help return a cleaned up xml file. This can help return a cleaned up xml file.
""" """
def __init__(self, tagDrops: list[str]): def __init__(self, tagDropREs: list[str]):
"""
tagDropREs - allows one to specify a list of tags related REs,
to help drop the corresponding tags and their contents fully.
To drop a tag, specify regular expression
* that matches the corresponding hierarchy of tags involved
* where the tag names should be in lower case and suffixed with :
* if interested in dropping a tag independent of where it appears use
".*:tagname:.*" re template
"""
super().__init__() super().__init__()
self.tagDrops = list(map(str.lower, tagDrops)) self.tagDropREs = list(map(str.lower, tagDropREs))
print(f"DBUG:TextXMLParser:{self.tagDrops}") print(f"DBUG:XMLFilterParser:{self.tagDropREs}")
self.insideTagDrops = {
}
for tag in tagDrops:
self.insideTagDrops[tag] = False
self.bCapture = False
self.text = "" self.text = ""
self.prefixTags = [] self.prefixTags = []
self.prefix = "" self.prefix = ""
@ -246,8 +252,9 @@ class TextXMLParser(html.parser.HTMLParser):
""" """
Helps decide whether to capture contents or discard them. Helps decide whether to capture contents or discard them.
""" """
for tag in self.tagDrops: curTagH = f'{":".join(self.prefixTags)}:'
if self.insideTagDrops[tag]: for dropRE in self.tagDropREs:
if re.match(dropRE, curTagH):
return False return False
return True return True
@ -256,8 +263,6 @@ class TextXMLParser(html.parser.HTMLParser):
self.prefixTags.append(tag) self.prefixTags.append(tag)
self.prefix += "\t" self.prefix += "\t"
self.text += f"\n{self.prefix}<{tag}>" self.text += f"\n{self.prefix}<{tag}>"
if tag in self.tagDrops:
self.insideTagDrops[tag] = True
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
if (self.lastTrackedCB == "endtag"): if (self.lastTrackedCB == "endtag"):
@ -267,36 +272,34 @@ class TextXMLParser(html.parser.HTMLParser):
self.lastTrackedCB = "endtag" self.lastTrackedCB = "endtag"
self.prefixTags.pop() self.prefixTags.pop()
self.prefix = self.prefix[:-1] self.prefix = self.prefix[:-1]
if tag in self.tagDrops:
self.insideTagDrops[tag] = False
def handle_data(self, data: str): def handle_data(self, data: str):
if self.do_capture(): if self.do_capture():
self.text += f"{data}" self.text += f"{data}"
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
try: try:
# Get requested url # Get requested url
got = handle_urlreq(ph, pr, "HandleXMLText") got = handle_urlreq(ph, pr, "HandleXMLFiltered")
if not got.callOk: if not got.callOk:
ph.send_error(got.httpStatus, got.httpStatusMsg) ph.send_error(got.httpStatus, got.httpStatusMsg)
return return
# Extract Text # Extract Text
tagDrops = ph.headers.get('xmltext-tag-drops') tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res')
if not tagDrops: if not tagDropREs:
tagDrops = [] tagDropREs = []
else: else:
tagDrops = cast(list[str], json.loads(tagDrops)) tagDropREs = cast(list[str], json.loads(tagDropREs))
textXML = TextXMLParser(tagDrops) xmlFiltered = XMLFilterParser(tagDropREs)
textXML.feed(got.contentData) xmlFiltered.feed(got.contentData)
# Send back to client # Send back to client
ph.send_response(got.httpStatus) ph.send_response(got.httpStatus)
ph.send_header('Content-Type', got.contentType) ph.send_header('Content-Type', got.contentType)
# Add CORS for browser fetch, just in case # Add CORS for browser fetch, just in case
ph.send_header('Access-Control-Allow-Origin', '*') ph.send_header('Access-Control-Allow-Origin', '*')
ph.end_headers() ph.end_headers()
ph.wfile.write(textXML.text.encode('utf-8')) ph.wfile.write(xmlFiltered.text.encode('utf-8'))
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text }) debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text })
except Exception as exc: except Exception as exc:
ph.send_error(502, f"WARN:XMLTextFailed:{exc}") ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}")

View File

@ -463,9 +463,15 @@ plain textual content from the search result page.
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
* this depends on the pypdf python based open source library * this depends on the pypdf python based open source library
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content * fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
* prefixes the tag heirarchy with each leaf content * allows one to specify a list of tags related REs,
* allows one to specify a list of tags that are to be dropped fully. to help drop the corresponding tags and their contents fully.
* to drop a tag, specify regular expression
* that matches the corresponding hierarchy of tags involved
* where the tag names should be in lower case and suffixed with :
* if interested in dropping a tag independent of where it appears use
* .*:tagname:.*
* rather the tool call meta data passed to ai model explains the same and provides a sample.
the above set of web related tool calls work by handshaking with a bundled simple local web proxy the above set of web related tool calls work by handshaking with a bundled simple local web proxy
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
@ -656,6 +662,10 @@ sliding window based drop off or even before they kick in, this can help in many
* add support for fetch_xml_as_text tool call, fix importmaps in index.html * add support for fetch_xml_as_text tool call, fix importmaps in index.html
* renamed and updated logic wrt xml fetching to be fetch_xml_filtered. allow one to use re to identify
the tags to be filtered in a fine-grained manner, including filtering based on tag hierarchy
#### ToDo #### ToDo
Is the tool call promise land trap deep enough, need to think through and explore around this once later. Is the tool call promise land trap deep enough, need to think through and explore around this once later.

View File

@ -330,16 +330,22 @@ async function fetchpdftext_setup(tcs) {
// //
// Fetch XML Text // Fetch XML Filtered
// //
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ] let gRSSTagDropsDefault = [
"^rss:channel:item:guid:.*",
"^rss:channel:item:link:.*",
"^rss:channel:item:description:.*",
".*:image:.*",
".*:enclosure:.*"
];
let fetchxmltext_meta = { let fetchxmlfiltered_meta = {
"type": "function", "type": "function",
"function": { "function": {
"name": "fetch_xml_as_text", "name": "fetch_xml_filtered",
"description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds", "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
"parameters": { "parameters": {
"type": "object", "type": "object",
@ -348,9 +354,12 @@ let fetchxmltext_meta = {
"type":"string", "type":"string",
"description":"url of the xml file that will be fetched" "description":"url of the xml file that will be fetched"
}, },
"tagDrops":{ "tagDropREs":{
"type":"string", "type":"string",
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` "description":`Optionally specify a json stringified list of xml tag heirarchies to drop.
For each tag that needs to be dropped, one needs to specify a regular expression of the hierarchy of tags involved,
where the tag names are always mentioned in lower case along with a : as suffix.
For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
} }
}, },
"required": ["url"] "required": ["url"]
@ -360,7 +369,7 @@ let fetchxmltext_meta = {
/** /**
* Implementation of the fetch xml as text logic. * Implementation of the fetch xml filtered logic.
* Expects simpleproxy to be running at specified url and providing xmltext service * Expects simpleproxy to be running at specified url and providing xmltext service
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
* @param {string} chatid * @param {string} chatid
@ -368,25 +377,25 @@ let fetchxmltext_meta = {
* @param {string} toolname * @param {string} toolname
* @param {any} obj * @param {any} obj
*/ */
function fetchxmltext_run(chatid, toolcallid, toolname, obj) { function fetchxmlfiltered_run(chatid, toolcallid, toolname, obj) {
let tagDrops = obj.tagDrops let tagDropREs = obj.tagDropREs
if (tagDrops == undefined) { if (tagDropREs == undefined) {
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) tagDropREs = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
} }
let headers = { 'xmltext-tag-drops': tagDrops } let headers = { 'xmlfiltered-tagdrop-res': tagDropREs }
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmlfiltered', headers);
} }
/** /**
* Setup fetch_xml_as_text for tool calling * Setup fetch_xml_filtered for tool calling
* NOTE: Currently the logic is setup for the bundled simpleproxy.py * NOTE: Currently the logic is setup for the bundled simpleproxy.py
* @param {Object<string, Object<string, any>>} tcs * @param {Object<string, Object<string, any>>} tcs
*/ */
async function fetchxmltext_setup(tcs) { async function fetchxmlfiltered_setup(tcs) {
return proxyserver_tc_setup('FetchXmlAsText', 'xmltext', 'fetch_xml_as_text', { return proxyserver_tc_setup('FetchXmlFiltered', 'xmlfiltered', 'fetch_xml_filtered', {
"handler": fetchxmltext_run, "handler": fetchxmlfiltered_run,
"meta": fetchxmltext_meta, "meta": fetchxmlfiltered_meta,
"result": "" "result": ""
}, tcs); }, tcs);
} }
@ -412,6 +421,6 @@ export async function init(me) {
await fetchweburltext_setup(tc_switch) await fetchweburltext_setup(tc_switch)
await searchwebtext_setup(tc_switch) await searchwebtext_setup(tc_switch)
await fetchpdftext_setup(tc_switch) await fetchpdftext_setup(tc_switch)
await fetchxmltext_setup(tc_switch) await fetchxmlfiltered_setup(tc_switch)
return tc_switch return tc_switch
} }