SimpleChatTC:XmlText: Cleanup initial go
At simpleproxy end * Add the tag names hierarchy before contents of a tag * Remember to convert the tagDrops to lowercase, as the HTMLParser base class seems to do that for tag names by default. At the client ui end * if undefined, remember to pass an empty list wrt tagDrops. * cleanup the func description and also mention possible tagDrops for RSS feeds in the tool meta
This commit is contained in:
parent
fbe9b2369f
commit
b8bb258dd5
|
|
@ -232,7 +232,7 @@ class TextXMLParser(html.parser.HTMLParser):
|
|||
|
||||
def __init__(self, tagDrops: list[str]):
|
||||
super().__init__()
|
||||
self.tagDrops = tagDrops
|
||||
self.tagDrops = list(map(str.lower, tagDrops))
|
||||
print(f"DBUG:TextXMLParser:{self.tagDrops}")
|
||||
self.insideTagDrops = {
|
||||
}
|
||||
|
|
@ -240,7 +240,7 @@ class TextXMLParser(html.parser.HTMLParser):
|
|||
self.insideTagDrops[tag] = False
|
||||
self.bCapture = False
|
||||
self.text = ""
|
||||
self.prefix = ""
|
||||
self.prefix = []
|
||||
|
||||
def do_capture(self):
|
||||
"""
|
||||
|
|
@ -252,18 +252,18 @@ class TextXMLParser(html.parser.HTMLParser):
|
|||
return True
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
|
||||
self.prefix += " "
|
||||
self.prefix.append(tag)
|
||||
if tag in self.tagDrops:
|
||||
self.insideTagDrops[tag] = True
|
||||
|
||||
def handle_endtag(self, tag: str):
|
||||
self.prefix = self.prefix[:-1]
|
||||
self.prefix.pop()
|
||||
if tag in self.tagDrops:
|
||||
self.insideTagDrops[tag] = False
|
||||
|
||||
def handle_data(self, data: str):
|
||||
if self.do_capture():
|
||||
self.text += f"{self.prefix}{data}\n"
|
||||
self.text += f"{':'.join(self.prefix)}:{data}\n"
|
||||
|
||||
|
||||
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||
|
|
|
|||
|
|
@ -463,6 +463,10 @@ plain textual content from the search result page.
|
|||
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
||||
* this depends on the pypdf python based open source library
|
||||
|
||||
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
|
||||
* prefixes each leaf content with its tag hierarchy
|
||||
* allows one to specify a list of tags that are to be dropped fully.
|
||||
|
||||
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
||||
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
|
||||
directly fetch from the browser js runtime environment.
|
||||
|
|
@ -650,6 +654,7 @@ sliding window based drop off or even before they kick in, this can help in many
|
|||
or if there is no response within the configured timeout period.
|
||||
NOTE: Currently the logic supports only 1 pending tool call per chat session.
|
||||
|
||||
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
|
||||
|
||||
#### ToDo
|
||||
|
||||
|
|
|
|||
|
|
@ -334,11 +334,13 @@ async function fetchpdftext_setup(tcs) {
|
|||
//
|
||||
|
||||
|
||||
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
|
||||
|
||||
let fetchxmltext_meta = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "fetch_xml_as_text",
|
||||
"description": "Fetch the requested xml url through a proxy server and return its text content after stripping away the xml tags, in few seconds",
|
||||
"description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
|
@ -348,7 +350,7 @@ let fetchxmltext_meta = {
|
|||
},
|
||||
"tagDrops":{
|
||||
"type":"string",
|
||||
"description":"specify a json stringified form of list of xml tags to drop"
|
||||
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
|
|
@ -367,7 +369,11 @@ let fetchxmltext_meta = {
|
|||
* @param {any} obj
|
||||
*/
|
||||
function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
|
||||
let headers = { 'xmltext-tag-drops': obj.tagDrops }
|
||||
let tagDrops = obj.tagDrops
|
||||
if (tagDrops == undefined) {
|
||||
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
|
||||
}
|
||||
let headers = { 'xmltext-tag-drops': tagDrops }
|
||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue