diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index d36a2443f0..fb55482cb3 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -50,7 +50,7 @@ gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' gAllowedCalls = { "xmlfiltered": [], - "urltext": [], + "htmltext": [], "urlraw": [], "pdftext": [ "pypdf" ] } @@ -138,8 +138,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler): match pr.path: case '/urlraw': self.auth_and_run(pr, mWeb.handle_urlraw) - case '/urltext': - self.auth_and_run(pr, mWeb.handle_urltext) + case '/htmltext': + self.auth_and_run(pr, mWeb.handle_htmltext) case '/xmlfiltered': self.auth_and_run(pr, mWeb.handle_xmlfiltered) case '/pdftext': diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index aaf7d5f332..f8d65c90c4 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -192,15 +192,15 @@ class TextHtmlParser(html.parser.HTMLParser): return self.textStripped -def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): +def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): try: # Get requested url - got = handle_urlreq(ph, pr, "HandleUrlText") + got = handle_urlreq(ph, pr, "HandleHtmlText") if not got.callOk: ph.send_error(got.httpStatus, got.httpStatusMsg) return # Extract Text - tagDrops = ph.headers.get('urltext-tag-drops') + tagDrops = ph.headers.get('htmltext-tag-drops') if not tagDrops: tagDrops = [] else: @@ -216,7 +216,7 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): ph.wfile.write(textHtml.get_stripped_text().encode('utf-8')) debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': 
textHtml.get_stripped_text() }) except Exception as exc: - ph.send_error(502, f"WARN:UrlTextFailed:{exc}") + ph.send_error(502, f"WARN:HtmlText:Failed:{exc}") class XMLFilterParser(html.parser.HTMLParser): diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 7c42b41f16..d87e0cd44f 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -453,7 +453,7 @@ Either way always remember to cross check the tool requests and generated respon * fetch_web_url_raw - fetch contents of the requested url through a proxy server -* fetch_web_url_text - fetch text parts of the content from the requested url through a proxy server. +* fetch_html_text - fetch text parts of the html content from the requested url through a proxy server. Related logic tries to strip html response of html tags and also head, script, style, header,footer, nav, ... blocks. @@ -669,6 +669,10 @@ sliding window based drop off or even before they kick in, this can help in many arguments generated by the ai. This ensures that the chat ui itself doesnt get stuck in it. Instead now the tool call response can inform the ai model that its function call had issues. +* renamed fetch_web_url_text to fetch_html_text, so that the gen ai model wont try to use this to fetch xml or + rss files, which would return empty content, given there wont be any html content to strip the tags + and unwanted blocks from before returning.
+ #### ToDo diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index c871cdbd0e..5209de2cf2 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -6,6 +6,16 @@ // by Humans for All // +// +// The simpleproxy.py server is expected to provide the below services +// urlraw - fetch the requested url content as is +// htmltext - fetch the requested html content and provide plain text version +// after stripping it of tag blocks like head, script, style, header, footer, nav, ... +// pdftext - fetch the requested pdf and provide the plain text version +// xmlfiltered - fetch the requested xml content and provide an optionally filtered version of same +// + + import * as mChatMagic from './simplechat.js' @@ -141,21 +151,21 @@ async function fetchweburlraw_setup(tcs) { // -// Fetch Web Url Text +// Fetch html Text // -let fetchweburltext_meta = { +let fetchhtmltext_meta = { "type": "function", "function": { - "name": "fetch_web_url_text", - "description": "Fetch the requested web url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds", + "name": "fetch_html_text", + "description": "Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds", "parameters": { "type": "object", "properties": { "url":{ "type":"string", - "description":"url of the page that will be fetched from the internet and inturn unwanted stuff stripped from its contents to some extent" + "description":"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent" } }, "required": ["url"] @@ -165,35 +175,30 @@ let fetchweburltext_meta = { /** - * Implementation of the fetch web url text logic.
- * Expects a simple minded proxy server to be running locally - * * listening on a configured port - * * expecting http requests - * * with a query token named url wrt urltext path, - * which gives the actual url to fetch - * * strips out head as well as any script, style, header, footer, nav and so blocks in body - * before returning remaining body contents. + * Implementation of the fetch html text logic. + * Expects the simple minded simpleproxy server to be running locally, + * providing service for htmltext path. * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * @param {string} chatid * @param {string} toolcallid * @param {string} toolname * @param {any} obj */ -function fetchweburltext_run(chatid, toolcallid, toolname, obj) { +function fetchhtmltext_run(chatid, toolcallid, toolname, obj) { // maybe filter out any key other than 'url' in obj - return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext'); + return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext'); } /** - * Setup fetch_web_url_text for tool calling + * Setup fetch_html_text for tool calling * NOTE: Currently the logic is setup for the bundled simpleproxy.py * @param {Object>} tcs */ -async function fetchweburltext_setup(tcs) { - return proxyserver_tc_setup('FetchWebUrlText', 'urltext', 'fetch_web_url_text', { - "handler": fetchweburltext_run, - "meta": fetchweburltext_meta, +async function fetchhtmltext_setup(tcs) { + return proxyserver_tc_setup('FetchHtmlText', 'htmltext', 'fetch_html_text', { + "handler": fetchhtmltext_run, + "meta": fetchhtmltext_meta, "result": "" }, tcs); } @@ -225,14 +230,7 @@ let searchwebtext_meta = { /** * Implementation of the search web text logic. Initial go. - * Builds on urltext path of the bundled simpleproxy.py. 
- * Expects simpleproxy.py server to be running locally - * * listening on a configured port - * * expecting http requests - * * with a query token named url wrt urltext path, - * which gives the actual url to fetch - * * strips out head as well as any script, style, header, footer, nav and so blocks in body - * before returning remaining body contents. + * Builds on htmltext path service of the bundled simpleproxy.py. * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful * @param {string} chatid * @param {string} toolcallid @@ -245,8 +243,8 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) { searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words)); delete(obj.words) obj['url'] = searchUrl - let headers = { 'urltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) } - return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers); + let headers = { 'htmltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) } + return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext', headers); } @@ -256,7 +254,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) { * @param {Object>} tcs */ async function searchwebtext_setup(tcs) { - return proxyserver_tc_setup('SearchWebText', 'urltext', 'search_web_text', { + return proxyserver_tc_setup('SearchWebText', 'htmltext', 'search_web_text', { "handler": searchwebtext_run, "meta": searchwebtext_meta, "result": "" @@ -418,7 +416,7 @@ export async function init(me) { let tc_switch = {} gMe = me await fetchweburlraw_setup(tc_switch) - await fetchweburltext_setup(tc_switch) + await fetchhtmltext_setup(tc_switch) await searchwebtext_setup(tc_switch) await fetchpdftext_setup(tc_switch) await fetchxmlfiltered_setup(tc_switch)