diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index 8348373e40..d36a2443f0 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -49,7 +49,7 @@ gConfigType = { gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ] gAllowedCalls = { - "xmltext": [], + "xmlfiltered": [], "urltext": [], "urlraw": [], "pdftext": [ "pypdf" ] @@ -140,8 +140,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler): self.auth_and_run(pr, mWeb.handle_urlraw) case '/urltext': self.auth_and_run(pr, mWeb.handle_urltext) - case '/xmltext': - self.auth_and_run(pr, mWeb.handle_xmltext) + case '/xmlfiltered': + self.auth_and_run(pr, mWeb.handle_xmlfiltered) case '/pdftext': self.auth_and_run(pr, mPdf.handle_pdftext) case '/aum': diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index 588e562359..aaf7d5f332 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -8,6 +8,7 @@ import html.parser import debug import filemagic as mFile import json +import re from typing import TYPE_CHECKING, Any, cast if TYPE_CHECKING: @@ -218,25 +219,30 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): ph.send_error(502, f"WARN:UrlTextFailed:{exc}") -class TextXMLParser(html.parser.HTMLParser): +class XMLFilterParser(html.parser.HTMLParser): """ A simple minded logic used to strip xml content of - * unwanted tags and their contents. - * this works properly only if the xml being processed has proper opening and ending tags - around the area of interest. + * unwanted tags and their contents, using re + * this works properly only if the xml being processed has + proper opening and ending tags around the area of interest. This can help return a cleaned up xml file. """ - def __init__(self, tagDrops: list[str]): + def __init__(self, tagDropREs: list[str]): + """ + tagDropREs - allows one to specify a list of tags related REs, + to help drop the corresponding tags and their contents fully. + + To drop a tag, specify regular expression + * that matches the corresponding heirarchy of tags involved + * where the tag names should be in lower case and suffixed with : + * if interested in dropping a tag independent of where it appears use + ".*:tagname:.*" re template + """ super().__init__() - self.tagDrops = list(map(str.lower, tagDrops)) - print(f"DBUG:TextXMLParser:{self.tagDrops}") - self.insideTagDrops = { - } - for tag in tagDrops: - self.insideTagDrops[tag] = False - self.bCapture = False + self.tagDropREs = list(map(str.lower, tagDropREs)) + print(f"DBUG:XMLFilterParser:{self.tagDropREs}") self.text = "" self.prefixTags = [] self.prefix = "" @@ -246,8 +252,9 @@ class TextXMLParser(html.parser.HTMLParser): """ Helps decide whether to capture contents or discard them. """ - for tag in self.tagDrops: - if self.insideTagDrops[tag]: + curTagH = f'{":".join(self.prefixTags)}:' + for dropRE in self.tagDropREs: + if re.match(dropRE, curTagH): return False return True @@ -256,8 +263,6 @@ class TextXMLParser(html.parser.HTMLParser): self.prefixTags.append(tag) self.prefix += "\t" self.text += f"\n{self.prefix}<{tag}>" - if tag in self.tagDrops: - self.insideTagDrops[tag] = True def handle_endtag(self, tag: str): if (self.lastTrackedCB == "endtag"): @@ -267,36 +272,34 @@ class TextXMLParser(html.parser.HTMLParser): self.lastTrackedCB = "endtag" self.prefixTags.pop() self.prefix = self.prefix[:-1] - if tag in self.tagDrops: - self.insideTagDrops[tag] = False def handle_data(self, data: str): if self.do_capture(): self.text += f"{data}" -def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): +def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): try: # Get requested url - got = handle_urlreq(ph, pr, "HandleXMLText") + got = handle_urlreq(ph, pr, "HandleXMLFiltered") if not got.callOk: ph.send_error(got.httpStatus, got.httpStatusMsg) return # Extract Text - tagDrops = ph.headers.get('xmltext-tag-drops') - if not tagDrops: - tagDrops = [] + tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res') + if not tagDropREs: + tagDropREs = [] else: - tagDrops = cast(list[str], json.loads(tagDrops)) - textXML = TextXMLParser(tagDrops) - textXML.feed(got.contentData) + tagDropREs = cast(list[str], json.loads(tagDropREs)) + xmlFiltered = XMLFilterParser(tagDropREs) + xmlFiltered.feed(got.contentData) # Send back to client ph.send_response(got.httpStatus) ph.send_header('Content-Type', got.contentType) # Add CORS for browser fetch, just in case ph.send_header('Access-Control-Allow-Origin', '*') ph.end_headers() - ph.wfile.write(textXML.text.encode('utf-8')) - debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text }) + ph.wfile.write(xmlFiltered.text.encode('utf-8')) + debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text }) except Exception as exc: - ph.send_error(502, f"WARN:XMLTextFailed:{exc}") + ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}") diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 2936102fdb..8bfd287701 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -463,9 +463,15 @@ plain textual content from the search result page. * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * this depends on the pypdf python based open source library -* fetch_xml_as_text - fetch/read specified xml file and extract its textual content - * prefixes the tag heirarchy with each leaf content - * allows one to specify a list of tags that are to be dropped fully. +* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags + * allows one to specify a list of tags related REs, + to help drop the corresponding tags and their contents fully. + * to drop a tag, specify regular expression + * that matches the corresponding heirarchy of tags involved + * where the tag names should be in lower case and suffixed with : + * if interested in dropping a tag independent of where it appears use + * .*:tagname:.* + * rather the tool call meta data passed to ai model explains the same and provides a sample. the above set of web related tool calls work by handshaking with a bundled simple local web proxy (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to @@ -656,6 +662,10 @@ sliding window based drop off or even before they kick in, this can help in many * add support for fetch_xml_as_text tool call, fix importmaps in index.html +* renamed and updated logic wrt xml fetching to be fetch_xml_filtered. allow one to use re to identify + the tags to be filtered in a fine grained manner including filtering based on tag heirarchy + + #### ToDo Is the tool call promise land trap deep enough, need to think through and explore around this once later. diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index 7dbbe51a05..c871cdbd0e 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -330,16 +330,22 @@ async function fetchpdftext_setup(tcs) { // -// Fetch XML Text +// Fetch XML Filtered // -let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ] +let gRSSTagDropsDefault = [ + "^rss:channel:item:guid:.*", + "^rss:channel:item:link:.*", + "^rss:channel:item:description:.*", + ".*:image:.*", + ".*:enclosure:.*" +]; -let fetchxmltext_meta = { +let fetchxmlfiltered_meta = { "type": "function", "function": { - "name": "fetch_xml_as_text", + "name": "fetch_xml_filtered", "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds", "parameters": { "type": "object", @@ -348,9 +354,12 @@ let fetchxmltext_meta = { "type":"string", "description":"url of the xml file that will be fetched" }, - "tagDrops":{ + "tagDropREs":{ "type":"string", - "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` + "description":`Optionally specify a json stringified list of xml tag heirarchies to drop. + For each tag that needs to be dropped, one needs to specify regular expression of the heirarchy of tags involved, + where the tag names are always mentioned in lower case along with a : as suffix. + For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` } }, "required": ["url"] @@ -360,7 +369,7 @@ let fetchxmltext_meta = { /** - * Implementation of the fetch xml as text logic. + * Implementation of the fetch xml filtered logic. * Expects simpleproxy to be running at specified url and providing xmltext service * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * @param {string} chatid @@ -368,25 +377,25 @@ let fetchxmltext_meta = { * @param {string} toolname * @param {any} obj */ -function fetchxmltext_run(chatid, toolcallid, toolname, obj) { - let tagDrops = obj.tagDrops - if (tagDrops == undefined) { - tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) +function fetchxmlfiltered_run(chatid, toolcallid, toolname, obj) { + let tagDropREs = obj.tagDropREs + if (tagDropREs == undefined) { + tagDropREs = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) } - let headers = { 'xmltext-tag-drops': tagDrops } - return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); + let headers = { 'xmlfiltered-tagdrop-res': tagDropREs } + return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmlfiltered', headers); } /** - * Setup fetch_xml_as_text for tool calling + * Setup fetch_xml_filtered for tool calling * NOTE: Currently the logic is setup for the bundled simpleproxy.py * @param {Object>} tcs */ -async function fetchxmltext_setup(tcs) { - return proxyserver_tc_setup('FetchXmlAsText', 'xmltext', 'fetch_xml_as_text', { - "handler": fetchxmltext_run, - "meta": fetchxmltext_meta, +async function fetchxmlfiltered_setup(tcs) { + return proxyserver_tc_setup('FetchXmlFiltered', 'xmlfiltered', 'fetch_xml_filtered', { + "handler": fetchxmlfiltered_run, + "meta": fetchxmlfiltered_meta, "result": "" }, tcs); } @@ -412,6 +421,6 @@ export async function init(me) { await fetchweburltext_setup(tc_switch) await searchwebtext_setup(tc_switch) await fetchpdftext_setup(tc_switch) - await fetchxmltext_setup(tc_switch) + await fetchxmlfiltered_setup(tc_switch) return tc_switch }