SimpleSallap:SimpleMCP:TCWeb:HtmlText updated for new flow

This is only an initial pass at the new flow; things will need to be tweaked
later to arrive at the final, valid, runnable flow.
This commit is contained in:
hanishkvc 2025-12-06 02:17:26 +05:30
parent b17cd18bc5
commit 5bf608dedd
1 changed file with 37 additions and 26 deletions

View File

@ -72,7 +72,7 @@ class TCUrlRaw(mTC.ToolCall):
got = handle_urlreq(args['url'], inHeaders, "HandleTCUrlRaw") got = handle_urlreq(args['url'], inHeaders, "HandleTCUrlRaw")
return got return got
except Exception as exc: except Exception as exc:
return mTC.TCOutResponse(False, 502, f"WARN:UrlRawFailed:{exc}") return mTC.TCOutResponse(False, 502, f"WARN:UrlRaw:Failed:{exc}")
class TextHtmlParser(html.parser.HTMLParser): class TextHtmlParser(html.parser.HTMLParser):
@ -184,31 +184,42 @@ class TextHtmlParser(html.parser.HTMLParser):
return self.textStripped return self.textStripped
def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): class TCHtmlText(mTC.ToolCall):
try:
# Get requested url def tcf_meta(self) -> mTC.TCFunction:
got = handle_urlreq(ph, pr, "HandleHtmlText") return mTC.TCFunction(
if not got.callOk: self.name,
ph.send_error(got.httpStatus, got.httpStatusMsg) "Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
return mTC.TCInParameters(
# Extract Text "object",
tagDrops = ph.headers.get('htmltext-tag-drops') {
if not tagDrops: "url": mTC.TCInProperty(
tagDrops = [] "string",
else: "url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops)) )
textHtml = TextHtmlParser(tagDrops) },
textHtml.feed(got.contentData) [ "url" ]
# Send back to client )
ph.send_response(got.httpStatus) )
ph.send_header('Content-Type', got.contentType)
# Add CORS for browser fetch, just in case def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
ph.send_header('Access-Control-Allow-Origin', '*') try:
ph.end_headers() # Get requested url
ph.wfile.write(textHtml.get_stripped_text().encode('utf-8')) got = handle_urlreq(args['url'], inHeaders, "HandleTCHtmlText")
debug.dump({ 'op': 'WebMagic.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() }) if not got.callOk:
except Exception as exc: return got
ph.send_error(502, f"WARN:HtmlText:Failed:{exc}") # Extract Text
tagDrops = inHeaders.get('htmltext-tag-drops')
if not tagDrops:
tagDrops = []
else:
tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
textHtml = TextHtmlParser(tagDrops)
textHtml.feed(got.contentData.decode('utf-8'))
debug.dump({ 'op': 'MCPWeb.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
return mTC.TCOutResponse(True, got.statusCode, got.statusMsg, got.contentType, textHtml.get_stripped_text().encode('utf-8'))
except Exception as exc:
return mTC.TCOutResponse(False, 502, f"WARN:HtmlText:Failed:{exc}")
class XMLFilterParser(html.parser.HTMLParser): class XMLFilterParser(html.parser.HTMLParser):