SimpleChatTC:Rename fetch_web_url_text to fetch_html_text

To make it easier for the AI model to understand that this works mainly for html pages and not, say, xml or pdf. For those, one needs to use the other explicit tool calls provided, like fetchpdftext or fetchxmltext. The server service path is renamed from urltext to htmltext. SearchWebText is also updated to use htmltext now.
2025-11-07 19:13:05 +05:30 · 2025-11-07 19:13:05 +05:30 · 143f9c0b1a
parent c0f7c8654e
commit 143f9c0b1a
4 changed files with 42 additions and 40 deletions
--- a/tools/server/public_simplechat/local.tools/simpleproxy.py
+++ b/tools/server/public_simplechat/local.tools/simpleproxy.py
@ -50,7 +50,7 @@ gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure'
 gAllowedCalls = {
    "xmlfiltered": [],
-    "urltext": [],
+    "htmltext": [],
    "urlraw": [],
    "pdftext": [ "pypdf" ]
    }
@ -138,8 +138,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
        match pr.path:
            case '/urlraw':
                self.auth_and_run(pr, mWeb.handle_urlraw)
-            case '/urltext':
+            case '/htmltext':
-                self.auth_and_run(pr, mWeb.handle_urltext)
+                self.auth_and_run(pr, mWeb.handle_htmltext)
            case '/xmlfiltered':
                self.auth_and_run(pr, mWeb.handle_xmlfiltered)
            case '/pdftext':
--- a/tools/server/public_simplechat/local.tools/webmagic.py
+++ b/tools/server/public_simplechat/local.tools/webmagic.py
@ -192,15 +192,15 @@ class TextHtmlParser(html.parser.HTMLParser):
        return self.textStripped
-def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
+def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    try:
        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlText")
+        got = handle_urlreq(ph, pr, "HandleHtmlText")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Extract Text
-        tagDrops = ph.headers.get('urltext-tag-drops')
+        tagDrops = ph.headers.get('htmltext-tag-drops')
        if not tagDrops:
            tagDrops = []
        else:
@ -216,7 +216,7 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
        debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
+        ph.send_error(502, f"WARN:HtmlText:Failed:{exc}")
 class XMLFilterParser(html.parser.HTMLParser):
--- a/tools/server/public_simplechat/readme.md
+++ b/tools/server/public_simplechat/readme.md
@ -453,7 +453,7 @@ Either way always remember to cross check the tool requests and generated respon
 * fetch_web_url_raw - fetch contents of the requested url through a proxy server
-* fetch_web_url_text - fetch text parts of the content from the requested url through a proxy server.
+* fetch_html_text - fetch text parts of the html content from the requested url through a proxy server.
  Related logic tries to strip html response of html tags and also head, script, style, header,footer,
  nav, ... blocks.
@ -669,6 +669,10 @@ sliding window based drop off or even before they kick in, this can help in many
  arguments generated by the ai. This ensures that the chat ui itself doesnt get stuck in it. Instead now
  the tool call response can inform the ai model that its function call had issues.
 * renamed fetch_web_url_text to fetch_html_text, so that gen ai model wont try to use this to fetch xml or
  rss files, because it will return empty content, because there wont be any html content to strip the tags
  and unwanted blocks before returning.
 #### ToDo
--- a/tools/server/public_simplechat/toolweb.mjs
+++ b/tools/server/public_simplechat/toolweb.mjs
@ -6,6 +6,16 @@
 // by Humans for All
 //
 //
 // The simpleproxy.py server is expected to provide the below services
 // urlraw - fetch the request url content as is
 // htmltext - fetch the requested html content and provide plain text version
 //     after stripping it of tag blocks like head, script, style, header, footer, nav, ...
 // pdftext - fetch the requested pdf and provide the plain text version
 // xmlfiltered - fetch the requested xml content and provide a optionally filtered version of same
 //
 import * as mChatMagic from './simplechat.js'
@ -141,21 +151,21 @@ async function fetchweburlraw_setup(tcs) {
 //
-// Fetch Web Url Text
+// Fetch html Text
 //
-let fetchweburltext_meta = {
+let fetchhtmltext_meta = {
        "type": "function",
        "function": {
-            "name": "fetch_web_url_text",
+            "name": "fetch_html_text",
-            "description": "Fetch the requested web url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
+            "description": "Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
            "parameters": {
                "type": "object",
                "properties": {
                    "url":{
                        "type":"string",
-                        "description":"url of the page that will be fetched from the internet and inturn unwanted stuff stripped from its contents to some extent"
+                        "description":"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
                    }
                },
                "required": ["url"]
@ -165,35 +175,30 @@ let fetchweburltext_meta = {
 /**
- * Implementation of the fetch web url text logic.
+ * Implementation of the fetch html text logic.
- * Expects a simple minded proxy server to be running locally
+ * Expects the simple minded simpleproxy server to be running locally,
- * * listening on a configured port
+ * providing service for htmltext path.
 * * expecting http requests
 *   * with a query token named url wrt urltext path,
 *     which gives the actual url to fetch
 * * strips out head as well as any script, style, header, footer, nav and so blocks in body
 *   before returning remaining body contents.
 * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
 * @param {string} chatid
 * @param {string} toolcallid
 * @param {string} toolname
 * @param {any} obj
 */
-function fetchweburltext_run(chatid, toolcallid, toolname, obj) {
+function fetchhtmltext_run(chatid, toolcallid, toolname, obj) {
    // maybe filter out any key other than 'url' in obj
-    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext');
+    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext');
 }
 /**
- * Setup fetch_web_url_text for tool calling
+ * Setup fetch_html_text for tool calling
 * NOTE: Currently the logic is setup for the bundled simpleproxy.py
 * @param {Object<string, Object<string, any>>} tcs
 */
-async function fetchweburltext_setup(tcs) {
+async function fetchhtmltext_setup(tcs) {
-    return proxyserver_tc_setup('FetchWebUrlText', 'urltext', 'fetch_web_url_text', {
+    return proxyserver_tc_setup('FetchHtmlText', 'htmltext', 'fetch_html_text', {
-        "handler": fetchweburltext_run,
+        "handler": fetchhtmltext_run,
-        "meta": fetchweburltext_meta,
+        "meta": fetchhtmltext_meta,
        "result": ""
    }, tcs);
 }
@ -225,14 +230,7 @@ let searchwebtext_meta = {
 /**
 * Implementation of the search web text logic. Initial go.
- * Builds on urltext path of the bundled simpleproxy.py.
+ * Builds on htmltext path service of the bundled simpleproxy.py.
 * Expects simpleproxy.py server to be running locally
 * * listening on a configured port
 * * expecting http requests
 *   * with a query token named url wrt urltext path,
 *     which gives the actual url to fetch
 * * strips out head as well as any script, style, header, footer, nav and so blocks in body
 *   before returning remaining body contents.
 * ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
 * @param {string} chatid
 * @param {string} toolcallid
@ -245,8 +243,8 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
    searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
    delete(obj.words)
    obj['url'] = searchUrl
-    let headers = { 'urltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) }
+    let headers = { 'htmltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) }
-    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
+    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext', headers);
 }
@ -256,7 +254,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
 * @param {Object<string, Object<string, any>>} tcs
 */
 async function searchwebtext_setup(tcs) {
-    return proxyserver_tc_setup('SearchWebText', 'urltext', 'search_web_text', {
+    return proxyserver_tc_setup('SearchWebText', 'htmltext', 'search_web_text', {
        "handler": searchwebtext_run,
        "meta": searchwebtext_meta,
        "result": ""
@ -418,7 +416,7 @@ export async function init(me) {
    let tc_switch = {}
    gMe = me
    await fetchweburlraw_setup(tc_switch)
-    await fetchweburltext_setup(tc_switch)
+    await fetchhtmltext_setup(tc_switch)
    await searchwebtext_setup(tc_switch)
    await fetchpdftext_setup(tc_switch)
    await fetchxmlfiltered_setup(tc_switch)