SimpleSallap:SimpleMCP:TCWeb:XMLFiltered initial go wrt new flow
Also remember to picks the tagDropREs from passed args object and not from got http header. Even TCHtmlText updated to get the tags to drop from passed args object and not got http header. And inturn allow ai to pass this optional arg, as it sees fit in co-ordination with user.
This commit is contained in:
parent
5bf608dedd
commit
66038f99cf
|
|
@ -184,18 +184,29 @@ class TextHtmlParser(html.parser.HTMLParser):
|
|||
return self.textStripped
|
||||
|
||||
|
||||
gTagDropsHTMLTextSample = [ { 'tag': 'div', 'id': "header" } ]
|
||||
|
||||
class TCHtmlText(mTC.ToolCall):
|
||||
|
||||
def tcf_meta(self) -> mTC.TCFunction:
|
||||
return mTC.TCFunction(
|
||||
self.name,
|
||||
"Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
|
||||
"Fetch html content from given url through a proxy server and return its text content after stripping away html tags as well as uneeded blocks like head, script, style, header, footer, nav in few seconds",
|
||||
mTC.TCInParameters(
|
||||
"object",
|
||||
{
|
||||
"url": mTC.TCInProperty(
|
||||
"string",
|
||||
"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
|
||||
"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to an extent"
|
||||
),
|
||||
"tagDrops": mTC.TCInProperty(
|
||||
"string",
|
||||
(
|
||||
"Optionally specify a json stringified list of tag-and-id dicts of tag blocks to drop from html."
|
||||
"For each tag block that needs to be dropped, one needs to specify the tag type and its associated id attribute."
|
||||
"where the tag types (ie div, span, p, a ...) are always mentioned in lower case."
|
||||
f"For example when fetching a search web site, one could use {json.dumps(gTagDropsHTMLTextSample)} and so..."
|
||||
)
|
||||
)
|
||||
},
|
||||
[ "url" ]
|
||||
|
|
@ -209,7 +220,7 @@ class TCHtmlText(mTC.ToolCall):
|
|||
if not got.callOk:
|
||||
return got
|
||||
# Extract Text
|
||||
tagDrops = inHeaders.get('htmltext-tag-drops')
|
||||
tagDrops = args.get('tagDrops')
|
||||
if not tagDrops:
|
||||
tagDrops = []
|
||||
else:
|
||||
|
|
@ -284,28 +295,56 @@ class XMLFilterParser(html.parser.HTMLParser):
|
|||
self.text += f"{data}"
|
||||
|
||||
|
||||
def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||
gRSSTagDropREsDefault = [
|
||||
"^rss:channel:item:guid:.*",
|
||||
"^rss:channel:item:link:.*",
|
||||
"^rss:channel:item:description:.*",
|
||||
".*:image:.*",
|
||||
".*:enclosure:.*"
|
||||
]
|
||||
|
||||
class TCXmlFiltered(mTC.ToolCall):
|
||||
|
||||
def tcf_meta(self) -> mTC.TCFunction:
|
||||
return mTC.TCFunction(
|
||||
self.name,
|
||||
"Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
|
||||
mTC.TCInParameters(
|
||||
"object",
|
||||
{
|
||||
"url": mTC.TCInProperty(
|
||||
"string",
|
||||
"url of the xml file that will be fetched"
|
||||
),
|
||||
"tagDropREs": mTC.TCInProperty(
|
||||
"string",
|
||||
(
|
||||
"Optionally specify a json stringified list of xml tag heirarchies to drop."
|
||||
"For each tag that needs to be dropped, one needs to specify regular expression of the heirarchy of tags involved,"
|
||||
"where the tag names are always mentioned in lower case along with a : as suffix."
|
||||
f"For example for rss feeds one could use {json.dumps(gRSSTagDropREsDefault)} and so..."
|
||||
)
|
||||
)
|
||||
},
|
||||
[ "url" ]
|
||||
)
|
||||
)
|
||||
|
||||
def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
|
||||
try:
|
||||
# Get requested url
|
||||
got = handle_urlreq(ph, pr, "HandleXMLFiltered")
|
||||
got = handle_urlreq(args['url'], inHeaders, "HandleTCXMLFiltered")
|
||||
if not got.callOk:
|
||||
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
||||
return
|
||||
return got
|
||||
# Extract Text
|
||||
tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res')
|
||||
tagDropREs = args.get('tagDropREs')
|
||||
if not tagDropREs:
|
||||
tagDropREs = []
|
||||
else:
|
||||
tagDropREs = cast(list[str], json.loads(tagDropREs))
|
||||
xmlFiltered = XMLFilterParser(tagDropREs)
|
||||
xmlFiltered.feed(got.contentData)
|
||||
# Send back to client
|
||||
ph.send_response(got.httpStatus)
|
||||
ph.send_header('Content-Type', got.contentType)
|
||||
# Add CORS for browser fetch, just in case
|
||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
||||
ph.end_headers()
|
||||
ph.wfile.write(xmlFiltered.text.encode('utf-8'))
|
||||
debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text })
|
||||
xmlFiltered.feed(got.contentData.decode('utf-8'))
|
||||
debug.dump({ 'op': 'MCPWeb.XMLFiltered' }, { 'RawText': xmlFiltered.text })
|
||||
return mTC.TCOutResponse(True, got.statusCode, got.statusMsg, got.contentType, xmlFiltered.text.encode('utf-8'))
|
||||
except Exception as exc:
|
||||
ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}")
|
||||
return mTC.TCOutResponse(False, 502, f"WARN:XMLFiltered:Failed:{exc}")
|
||||
|
|
|
|||
Loading…
Reference in New Issue