SimpleChatTC:XmlText: Cleanup initial go

At simpleproxy end

* Add the tag names hierarchy before contents of a tag

* Remember to convert the tagDrops to lowercase, as the HTMLParser base
  class seems to lowercase tag names by default.

At the client ui end

* If tagDrops is undefined, remember to pass an empty list.

* cleanup the func description and also mention possible tagDrops
  for RSS feeds in the tool meta
This commit is contained in:
hanishkvc 2025-11-07 03:53:34 +05:30
parent fbe9b2369f
commit b8bb258dd5
3 changed files with 19 additions and 8 deletions

View File

@ -232,7 +232,7 @@ class TextXMLParser(html.parser.HTMLParser):
def __init__(self, tagDrops: list[str]):
super().__init__()
self.tagDrops = tagDrops
self.tagDrops = list(map(str.lower, tagDrops))
print(f"DBUG:TextXMLParser:{self.tagDrops}")
self.insideTagDrops = {
}
@ -240,7 +240,7 @@ class TextXMLParser(html.parser.HTMLParser):
self.insideTagDrops[tag] = False
self.bCapture = False
self.text = ""
self.prefix = ""
self.prefix = []
def do_capture(self):
"""
@ -252,18 +252,18 @@ class TextXMLParser(html.parser.HTMLParser):
return True
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
self.prefix += " "
self.prefix.append(tag)
if tag in self.tagDrops:
self.insideTagDrops[tag] = True
def handle_endtag(self, tag: str):
self.prefix = self.prefix[:-1]
self.prefix.pop()
if tag in self.tagDrops:
self.insideTagDrops[tag] = False
def handle_data(self, data: str):
if self.do_capture():
self.text += f"{self.prefix}{data}\n"
self.text += f"{':'.join(self.prefix)}:{data}\n"
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):

View File

@ -463,6 +463,10 @@ plain textual content from the search result page.
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
* this depends on the pypdf python based open source library
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
* prefixes each leaf content with its tag hierarchy
* allows one to specify a list of tags that are to be dropped fully.
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
directly fetch from the browser js runtime environment.
@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
or if there is no response within the configured timeout period.
NOTE: Currently the logic supports only 1 pending tool call per chat session.
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
#### ToDo

View File

@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
//
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
let fetchxmltext_meta = {
"type": "function",
"function": {
"name": "fetch_xml_as_text",
"description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds",
"description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
"parameters": {
"type": "object",
"properties": {
@ -348,7 +350,7 @@ let fetchxmltext_meta = {
},
"tagDrops":{
"type":"string",
"description":"specify a json stringified form of list of xml tags to drop"
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
}
},
"required": ["url"]
@ -367,7 +369,11 @@ let fetchxmltext_meta = {
* @param {any} obj
*/
function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
let headers = { 'xmltext-tag-drops': obj.tagDrops }
let tagDrops = obj.tagDrops
if (tagDrops == undefined) {
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
}
let headers = { 'xmltext-tag-drops': tagDrops }
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
}