SimpleChatTC:XmlFiltered: Use re with hierarchy of tags to filter
Rename xmltext to xmlfiltered. This simplifies the filtering-related logic and also gives more fine-grained flexibility in filtering, because regular expressions (re) are now used.
This commit is contained in:
parent
9ed1cf9886
commit
9f5c3d7776
|
|
@ -49,7 +49,7 @@ gConfigType = {
|
||||||
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
|
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
|
||||||
|
|
||||||
gAllowedCalls = {
|
gAllowedCalls = {
|
||||||
"xmltext": [],
|
"xmlfiltered": [],
|
||||||
"urltext": [],
|
"urltext": [],
|
||||||
"urlraw": [],
|
"urlraw": [],
|
||||||
"pdftext": [ "pypdf" ]
|
"pdftext": [ "pypdf" ]
|
||||||
|
|
@ -140,8 +140,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
||||||
self.auth_and_run(pr, mWeb.handle_urlraw)
|
self.auth_and_run(pr, mWeb.handle_urlraw)
|
||||||
case '/urltext':
|
case '/urltext':
|
||||||
self.auth_and_run(pr, mWeb.handle_urltext)
|
self.auth_and_run(pr, mWeb.handle_urltext)
|
||||||
case '/xmltext':
|
case '/xmlfiltered':
|
||||||
self.auth_and_run(pr, mWeb.handle_xmltext)
|
self.auth_and_run(pr, mWeb.handle_xmlfiltered)
|
||||||
case '/pdftext':
|
case '/pdftext':
|
||||||
self.auth_and_run(pr, mPdf.handle_pdftext)
|
self.auth_and_run(pr, mPdf.handle_pdftext)
|
||||||
case '/aum':
|
case '/aum':
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@ import html.parser
|
||||||
import debug
|
import debug
|
||||||
import filemagic as mFile
|
import filemagic as mFile
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
from typing import TYPE_CHECKING, Any, cast
|
from typing import TYPE_CHECKING, Any, cast
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
|
@ -218,25 +219,30 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
class TextXMLParser(html.parser.HTMLParser):
|
class XMLFilterParser(html.parser.HTMLParser):
|
||||||
"""
|
"""
|
||||||
A simple minded logic used to strip xml content of
|
A simple minded logic used to strip xml content of
|
||||||
* unwanted tags and their contents.
|
* unwanted tags and their contents, using re
|
||||||
* this works properly only if the xml being processed has proper opening and ending tags
|
* this works properly only if the xml being processed has
|
||||||
around the area of interest.
|
proper opening and ending tags around the area of interest.
|
||||||
|
|
||||||
This can help return a cleaned up xml file.
|
This can help return a cleaned up xml file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tagDrops: list[str]):
|
def __init__(self, tagDropREs: list[str]):
|
||||||
|
"""
|
||||||
|
tagDropREs - allows one to specify a list of tags related REs,
|
||||||
|
to help drop the corresponding tags and their contents fully.
|
||||||
|
|
||||||
|
To drop a tag, specify regular expression
|
||||||
|
* that matches the corresponding heirarchy of tags involved
|
||||||
|
* where the tag names should be in lower case and suffixed with :
|
||||||
|
* if interested in dropping a tag independent of where it appears use
|
||||||
|
".*:tagname:.*" re template
|
||||||
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.tagDrops = list(map(str.lower, tagDrops))
|
self.tagDropREs = list(map(str.lower, tagDropREs))
|
||||||
print(f"DBUG:TextXMLParser:{self.tagDrops}")
|
print(f"DBUG:XMLFilterParser:{self.tagDropREs}")
|
||||||
self.insideTagDrops = {
|
|
||||||
}
|
|
||||||
for tag in tagDrops:
|
|
||||||
self.insideTagDrops[tag] = False
|
|
||||||
self.bCapture = False
|
|
||||||
self.text = ""
|
self.text = ""
|
||||||
self.prefixTags = []
|
self.prefixTags = []
|
||||||
self.prefix = ""
|
self.prefix = ""
|
||||||
|
|
@ -246,8 +252,9 @@ class TextXMLParser(html.parser.HTMLParser):
|
||||||
"""
|
"""
|
||||||
Helps decide whether to capture contents or discard them.
|
Helps decide whether to capture contents or discard them.
|
||||||
"""
|
"""
|
||||||
for tag in self.tagDrops:
|
curTagH = f'{":".join(self.prefixTags)}:'
|
||||||
if self.insideTagDrops[tag]:
|
for dropRE in self.tagDropREs:
|
||||||
|
if re.match(dropRE, curTagH):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
@ -256,8 +263,6 @@ class TextXMLParser(html.parser.HTMLParser):
|
||||||
self.prefixTags.append(tag)
|
self.prefixTags.append(tag)
|
||||||
self.prefix += "\t"
|
self.prefix += "\t"
|
||||||
self.text += f"\n{self.prefix}<{tag}>"
|
self.text += f"\n{self.prefix}<{tag}>"
|
||||||
if tag in self.tagDrops:
|
|
||||||
self.insideTagDrops[tag] = True
|
|
||||||
|
|
||||||
def handle_endtag(self, tag: str):
|
def handle_endtag(self, tag: str):
|
||||||
if (self.lastTrackedCB == "endtag"):
|
if (self.lastTrackedCB == "endtag"):
|
||||||
|
|
@ -267,36 +272,34 @@ class TextXMLParser(html.parser.HTMLParser):
|
||||||
self.lastTrackedCB = "endtag"
|
self.lastTrackedCB = "endtag"
|
||||||
self.prefixTags.pop()
|
self.prefixTags.pop()
|
||||||
self.prefix = self.prefix[:-1]
|
self.prefix = self.prefix[:-1]
|
||||||
if tag in self.tagDrops:
|
|
||||||
self.insideTagDrops[tag] = False
|
|
||||||
|
|
||||||
def handle_data(self, data: str):
|
def handle_data(self, data: str):
|
||||||
if self.do_capture():
|
if self.do_capture():
|
||||||
self.text += f"{data}"
|
self.text += f"{data}"
|
||||||
|
|
||||||
|
|
||||||
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
try:
|
try:
|
||||||
# Get requested url
|
# Get requested url
|
||||||
got = handle_urlreq(ph, pr, "HandleXMLText")
|
got = handle_urlreq(ph, pr, "HandleXMLFiltered")
|
||||||
if not got.callOk:
|
if not got.callOk:
|
||||||
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
||||||
return
|
return
|
||||||
# Extract Text
|
# Extract Text
|
||||||
tagDrops = ph.headers.get('xmltext-tag-drops')
|
tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res')
|
||||||
if not tagDrops:
|
if not tagDropREs:
|
||||||
tagDrops = []
|
tagDropREs = []
|
||||||
else:
|
else:
|
||||||
tagDrops = cast(list[str], json.loads(tagDrops))
|
tagDropREs = cast(list[str], json.loads(tagDropREs))
|
||||||
textXML = TextXMLParser(tagDrops)
|
xmlFiltered = XMLFilterParser(tagDropREs)
|
||||||
textXML.feed(got.contentData)
|
xmlFiltered.feed(got.contentData)
|
||||||
# Send back to client
|
# Send back to client
|
||||||
ph.send_response(got.httpStatus)
|
ph.send_response(got.httpStatus)
|
||||||
ph.send_header('Content-Type', got.contentType)
|
ph.send_header('Content-Type', got.contentType)
|
||||||
# Add CORS for browser fetch, just in case
|
# Add CORS for browser fetch, just in case
|
||||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
ph.send_header('Access-Control-Allow-Origin', '*')
|
||||||
ph.end_headers()
|
ph.end_headers()
|
||||||
ph.wfile.write(textXML.text.encode('utf-8'))
|
ph.wfile.write(xmlFiltered.text.encode('utf-8'))
|
||||||
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textXML.text })
|
debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text })
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
ph.send_error(502, f"WARN:XMLTextFailed:{exc}")
|
ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}")
|
||||||
|
|
|
||||||
|
|
@ -463,9 +463,15 @@ plain textual content from the search result page.
|
||||||
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
||||||
* this depends on the pypdf python based open source library
|
* this depends on the pypdf python based open source library
|
||||||
|
|
||||||
* fetch_xml_as_text - fetch/read specified xml file and extract its textual content
|
* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
|
||||||
* prefixes the tag heirarchy with each leaf content
|
* allows one to specify a list of tags related REs,
|
||||||
* allows one to specify a list of tags that are to be dropped fully.
|
to help drop the corresponding tags and their contents fully.
|
||||||
|
* to drop a tag, specify regular expression
|
||||||
|
* that matches the corresponding hierarchy of tags involved
|
||||||
|
* where the tag names should be in lower case and suffixed with :
|
||||||
|
* if interested in dropping a tag independent of where it appears use
|
||||||
|
* .*:tagname:.*
|
||||||
|
* the tool call metadata passed to the ai model also explains the same and provides a sample.
|
||||||
|
|
||||||
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
||||||
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
|
(/caching in future) server logic, this helps bypass the CORS restrictions applied if trying to
|
||||||
|
|
@ -656,6 +662,10 @@ sliding window based drop off or even before they kick in, this can help in many
|
||||||
|
|
||||||
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
|
* add support for fetch_xml_as_text tool call, fix importmaps in index.html
|
||||||
|
|
||||||
|
* renamed and updated the xml fetching logic to be fetch_xml_filtered. This allows one to use re to identify
|
||||||
|
the tags to be filtered in a fine-grained manner, including filtering based on tag hierarchy
|
||||||
|
|
||||||
|
|
||||||
#### ToDo
|
#### ToDo
|
||||||
|
|
||||||
Is the tool call promise land trap deep enough, need to think through and explore around this once later.
|
Is the tool call promise land trap deep enough, need to think through and explore around this once later.
|
||||||
|
|
|
||||||
|
|
@ -330,16 +330,22 @@ async function fetchpdftext_setup(tcs) {
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Fetch XML Text
|
// Fetch XML Filtered
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
||||||
let gRSSTagDropsDefault = [ "guid", "link", "description", "image", "enclosure" ]
|
let gRSSTagDropsDefault = [
|
||||||
|
"^rss:channel:item:guid:.*",
|
||||||
|
"^rss:channel:item:link:.*",
|
||||||
|
"^rss:channel:item:description:.*",
|
||||||
|
".*:image:.*",
|
||||||
|
".*:enclosure:.*"
|
||||||
|
];
|
||||||
|
|
||||||
let fetchxmltext_meta = {
|
let fetchxmlfiltered_meta = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "fetch_xml_as_text",
|
"name": "fetch_xml_filtered",
|
||||||
"description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
|
"description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
|
|
@ -348,9 +354,12 @@ let fetchxmltext_meta = {
|
||||||
"type":"string",
|
"type":"string",
|
||||||
"description":"url of the xml file that will be fetched"
|
"description":"url of the xml file that will be fetched"
|
||||||
},
|
},
|
||||||
"tagDrops":{
|
"tagDropREs":{
|
||||||
"type":"string",
|
"type":"string",
|
||||||
"description":`Optionally specify a json stringified list of xml tags to drop. For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
|
"description":`Optionally specify a json stringified list of xml tag heirarchies to drop.
|
||||||
|
For each tag that needs to be dropped, one needs to specify regular expression of the heirarchy of tags involved,
|
||||||
|
where the tag names are always mentioned in lower case along with a : as suffix.
|
||||||
|
For example for rss feeds one could use ${JSON.stringify(gRSSTagDropsDefault)} and so...`
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"]
|
||||||
|
|
@ -360,7 +369,7 @@ let fetchxmltext_meta = {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of the fetch xml as text logic.
|
* Implementation of the fetch xml filtered logic.
|
||||||
* Expects simpleproxy to be running at specified url and providing xmltext service
|
* Expects simpleproxy to be running at specified url and providing xmltext service
|
||||||
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
||||||
* @param {string} chatid
|
* @param {string} chatid
|
||||||
|
|
@ -368,25 +377,25 @@ let fetchxmltext_meta = {
|
||||||
* @param {string} toolname
|
* @param {string} toolname
|
||||||
* @param {any} obj
|
* @param {any} obj
|
||||||
*/
|
*/
|
||||||
function fetchxmltext_run(chatid, toolcallid, toolname, obj) {
|
function fetchxmlfiltered_run(chatid, toolcallid, toolname, obj) {
|
||||||
let tagDrops = obj.tagDrops
|
let tagDropREs = obj.tagDropREs
|
||||||
if (tagDrops == undefined) {
|
if (tagDropREs == undefined) {
|
||||||
tagDrops = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
|
tagDropREs = JSON.stringify([]) // JSON.stringify(gRSSTagDropsDefault)
|
||||||
}
|
}
|
||||||
let headers = { 'xmltext-tag-drops': tagDrops }
|
let headers = { 'xmlfiltered-tagdrop-res': tagDropREs }
|
||||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmltext', headers);
|
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'xmlfiltered', headers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Setup fetch_xml_as_text for tool calling
|
* Setup fetch_xml_filtered for tool calling
|
||||||
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
||||||
* @param {Object<string, Object<string, any>>} tcs
|
* @param {Object<string, Object<string, any>>} tcs
|
||||||
*/
|
*/
|
||||||
async function fetchxmltext_setup(tcs) {
|
async function fetchxmlfiltered_setup(tcs) {
|
||||||
return proxyserver_tc_setup('FetchXmlAsText', 'xmltext', 'fetch_xml_as_text', {
|
return proxyserver_tc_setup('FetchXmlFiltered', 'xmlfiltered', 'fetch_xml_filtered', {
|
||||||
"handler": fetchxmltext_run,
|
"handler": fetchxmlfiltered_run,
|
||||||
"meta": fetchxmltext_meta,
|
"meta": fetchxmlfiltered_meta,
|
||||||
"result": ""
|
"result": ""
|
||||||
}, tcs);
|
}, tcs);
|
||||||
}
|
}
|
||||||
|
|
@ -412,6 +421,6 @@ export async function init(me) {
|
||||||
await fetchweburltext_setup(tc_switch)
|
await fetchweburltext_setup(tc_switch)
|
||||||
await searchwebtext_setup(tc_switch)
|
await searchwebtext_setup(tc_switch)
|
||||||
await fetchpdftext_setup(tc_switch)
|
await fetchpdftext_setup(tc_switch)
|
||||||
await fetchxmltext_setup(tc_switch)
|
await fetchxmlfiltered_setup(tc_switch)
|
||||||
return tc_switch
|
return tc_switch
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue