From 5bf608dedd39aeb7524a793295309351f211c4fe Mon Sep 17 00:00:00 2001
From: hanishkvc
Date: Sat, 6 Dec 2025 02:17:26 +0530
Subject: [PATCH] SimpleSallap:SimpleMCP:TCWeb:HtmlText updated for new flow

Rather initial go at the new flow, things require to be tweaked later
wrt final valid runnable flow
---
Review notes (not part of the commit message):

* TCHtmlText.tc_handle decodes the fetched body with a strict
  .decode('utf-8'); a body that is not valid UTF-8 would raise and be
  reported through the generic "WARN:HtmlText:Failed" 502 response.
  Presumably got.contentData is bytes - confirm against handle_urlreq.
* get_stripped_text() is called twice (once for debug.dump, once for the
  response); worth checking whether it caches its result.
* The python indentation in this patch was reconstructed with 4-space
  steps; verify against the target file before applying.

 .../public_simplechat/local.tools/tcweb.py | 63 +++++++++++--------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/tools/server/public_simplechat/local.tools/tcweb.py b/tools/server/public_simplechat/local.tools/tcweb.py
index ed1dad3aee..b5b20395ba 100644
--- a/tools/server/public_simplechat/local.tools/tcweb.py
+++ b/tools/server/public_simplechat/local.tools/tcweb.py
@@ -72,7 +72,7 @@ class TCUrlRaw(mTC.ToolCall):
             got = handle_urlreq(args['url'], inHeaders, "HandleTCUrlRaw")
             return got
         except Exception as exc:
-            return mTC.TCOutResponse(False, 502, f"WARN:UrlRawFailed:{exc}")
+            return mTC.TCOutResponse(False, 502, f"WARN:UrlRaw:Failed:{exc}")
 
 
 class TextHtmlParser(html.parser.HTMLParser):
@@ -184,31 +184,42 @@ class TextHtmlParser(html.parser.HTMLParser):
         return self.textStripped
 
 
-def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleHtmlText")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Extract Text
-        tagDrops = ph.headers.get('htmltext-tag-drops')
-        if not tagDrops:
-            tagDrops = []
-        else:
-            tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
-        textHtml = TextHtmlParser(tagDrops)
-        textHtml.feed(got.contentData)
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
-        debug.dump({ 'op': 'WebMagic.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
-    except Exception as exc:
-        ph.send_error(502, f"WARN:HtmlText:Failed:{exc}")
+class TCHtmlText(mTC.ToolCall):
+
+    def tcf_meta(self) -> mTC.TCFunction:
+        return mTC.TCFunction(
+            self.name,
+            "Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
+            mTC.TCInParameters(
+                "object",
+                {
+                    "url": mTC.TCInProperty(
+                        "string",
+                        "url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
+                    )
+                },
+                [ "url" ]
+            )
+        )
+
+    def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
+        try:
+            # Get requested url
+            got = handle_urlreq(args['url'], inHeaders, "HandleTCHtmlText")
+            if not got.callOk:
+                return got
+            # Extract Text
+            tagDrops = inHeaders.get('htmltext-tag-drops')
+            if not tagDrops:
+                tagDrops = []
+            else:
+                tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
+            textHtml = TextHtmlParser(tagDrops)
+            textHtml.feed(got.contentData.decode('utf-8'))
+            debug.dump({ 'op': 'MCPWeb.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
+            return mTC.TCOutResponse(True, got.statusCode, got.statusMsg, got.contentType, textHtml.get_stripped_text().encode('utf-8'))
+        except Exception as exc:
+            return mTC.TCOutResponse(False, 502, f"WARN:HtmlText:Failed:{exc}")
 
 
 class XMLFilterParser(html.parser.HTMLParser):