diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index a6f025cc60..3d0c1bc48d 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -232,7 +232,7 @@ class TextXMLParser(html.parser.HTMLParser): def __init__(self, tagDrops: list[str]): super().__init__() - self.tagDrops = tagDrops + self.tagDrops = list(map(str.lower, tagDrops)) print(f"DBUG:TextXMLParser:{self.tagDrops}") self.insideTagDrops = { } @@ -240,7 +240,7 @@ class TextXMLParser(html.parser.HTMLParser): self.insideTagDrops[tag] = False self.bCapture = False self.text = "" - self.prefix = "" + self.prefix = [] def do_capture(self): """ @@ -252,18 +252,18 @@ class TextXMLParser(html.parser.HTMLParser): return True def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): - self.prefix += " " + self.prefix.append(tag) if tag in self.tagDrops: self.insideTagDrops[tag] = True def handle_endtag(self, tag: str): - self.prefix = self.prefix[:-1] + self.prefix.pop() if tag in self.tagDrops: self.insideTagDrops[tag] = False def handle_data(self, data: str): if self.do_capture(): - self.text += f"{self.prefix}{data}\n" + self.text += f"{':'.join(self.prefix)}:{data}\n" def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index a1764b740a..2936102fdb 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -463,6 +463,10 @@ plain textual content from the search result page. 
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * this depends on the pypdf python based open source library +* fetch_xml_as_text - fetch/read specified xml file and extract its textual content + * prefixes each leaf content with its tag hierarchy + * allows one to specify a list of tags that are to be dropped fully. + the above set of web related tool calls work by handshaking with a bundled simple local web proxy (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to directly fetch from the browser js runtime environment. @@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many or if there is no response within the configured timeout period. NOTE: Currently the logic supports only 1 pending tool call per chat session. +* add support for fetch_xml_as_text tool call, fix importmaps in index.html #### ToDo diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index 266e564dc4..8a8405c5ba 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) { // +let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ] + let fetchxmltext_meta = { "type": "function", "function": { "name": "fetch_xml_as_text", - "description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds", + "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. 
Will take few seconds", "parameters": { "type": "object", "properties": { @@ -348,7 +350,7 @@ let fetchxmltext_meta = { }, "tagDrops":{ "type":"string", - "description":"specify a json stringified form of list of xml tags to drop" + "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...` } }, "required": ["url"] @@ -367,7 +369,11 @@ let fetchxmltext_meta = { * @param {any} obj */ function fetchxmltext_run(chatid, toolcallid, toolname, obj) { - let headers = { 'xmltext-tag-drops': obj.tagDrops } + let tagDrops = obj.tagDrops + if (tagDrops == undefined) { + tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault) + } + let headers = { 'xmltext-tag-drops': tagDrops } return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); }