SimpleChatTC:XmlText: Cleanup initial go

At simpleproxy end

* Prefix the contents of each tag with its tag name hierarchy

* Remember to convert the tagDrops to lowercase, as the HTMLParser base
  class converts tag names to lowercase by default.

At the client ui end

* If tagDrops is undefined, remember to pass an empty list for it.

* Clean up the function description and also mention possible tagDrops
  for RSS feeds in the tool meta
This commit is contained in:
hanishkvc 2025-11-07 03:53:34 +05:30
parent fbe9b2369f
commit b8bb258dd5
3 changed files with 19 additions and 8 deletions

View File

@ -232,7 +232,7 @@ class TextXMLParser(html.parser.HTMLParser):
def __init__(self, tagDrops: list[str]): def __init__(self, tagDrops: list[str]):
super().__init__() super().__init__()
self.tagDrops = tagDrops self.tagDrops = list(map(str.lower, tagDrops))
print(f"DBUG:TextXMLParser:{self.tagDrops}") print(f"DBUG:TextXMLParser:{self.tagDrops}")
self.insideTagDrops = { self.insideTagDrops = {
} }
@ -240,7 +240,7 @@ class TextXMLParser(html.parser.HTMLParser):
self.insideTagDrops[tag] = False self.insideTagDrops[tag] = False
self.bCapture = False self.bCapture = False
self.text = "" self.text = ""
self.prefix = "" self.prefix = []
def do_capture(self): def do_capture(self):
""" """
@ -252,18 +252,18 @@ class TextXMLParser(html.parser.HTMLParser):
return True return True
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
self.prefix += " " self.prefix.append(tag)
if tag in self.tagDrops: if tag in self.tagDrops:
self.insideTagDrops[tag] = True self.insideTagDrops[tag] = True
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
self.prefix = self.prefix[:-1] self.prefix.pop()
if tag in self.tagDrops: if tag in self.tagDrops:
self.insideTagDrops[tag] = False self.insideTagDrops[tag] = False
def handle_data(self, data: str): def handle_data(self, data: str):
if self.do_capture(): if self.do_capture():
self.text += f"{self.prefix}{data}\n" self.text += f"{':'.join(self.prefix)}:{data}\n"
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):

View File

@ -463,6 +463,10 @@ plain textual content from the search result page.
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
* this depends on the pypdf python based open source library * this depends on the pypdf python based open source library
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
* prefixes each leaf content with its tag hierarchy
* allows one to specify a list of tags that are to be dropped fully.
the above set of web related tool calls work by handshaking with a bundled simple local web proxy the above set of web related tool calls work by handshaking with a bundled simple local web proxy
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to (/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
directly fetch from the browser js runtime environment. directly fetch from the browser js runtime environment.
@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
or if there is no response within the configured timeout period. or if there is no response within the configured timeout period.
NOTE: Currently the logic supports only 1 pending tool call per chat session. NOTE: Currently the logic supports only 1 pending tool call per chat session.
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
#### ToDo #### ToDo

View File

@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
// //
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
let fetchxmltext_meta = { let fetchxmltext_meta = {
"type": "function", "type": "function",
"function": { "function": {
"name": "fetch_xml_as_text", "name": "fetch_xml_as_text",
"description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds", "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {
@ -348,7 +350,7 @@ let fetchxmltext_meta = {
}, },
"tagDrops":{ "tagDrops":{
"type":"string", "type":"string",
"description":"specify a json stringified form of list of xml tags to drop" "description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
} }
}, },
"required": ["url"] "required": ["url"]
@ -367,7 +369,11 @@ let fetchxmltext_meta = {
* @param {any} obj * @param {any} obj
*/ */
function fetchxmltext_run(chatid, toolcallid, toolname, obj) { function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
let headers = { 'xmltext-tag-drops': obj.tagDrops } let tagDrops = obj.tagDrops
if (tagDrops == undefined) {
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
}
let headers = { 'xmltext-tag-drops': tagDrops }
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers); return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
} }