SimpleSallap:SimpleMCP:TCWeb:HtmlText updated for new flow
Rather initial go at the new flow, things require to be tweaked later wrt final valid runnable flow
This commit is contained in:
parent
b17cd18bc5
commit
5bf608dedd
|
|
@ -72,7 +72,7 @@ class TCUrlRaw(mTC.ToolCall):
|
||||||
got = handle_urlreq(args['url'], inHeaders, "HandleTCUrlRaw")
|
got = handle_urlreq(args['url'], inHeaders, "HandleTCUrlRaw")
|
||||||
return got
|
return got
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
return mTC.TCOutResponse(False, 502, f"WARN:UrlRawFailed:{exc}")
|
return mTC.TCOutResponse(False, 502, f"WARN:UrlRaw:Failed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
class TextHtmlParser(html.parser.HTMLParser):
|
class TextHtmlParser(html.parser.HTMLParser):
|
||||||
|
|
@ -184,31 +184,42 @@ class TextHtmlParser(html.parser.HTMLParser):
|
||||||
return self.textStripped
|
return self.textStripped
|
||||||
|
|
||||||
|
|
||||||
def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
class TCHtmlText(mTC.ToolCall):
|
||||||
|
|
||||||
|
def tcf_meta(self) -> mTC.TCFunction:
|
||||||
|
return mTC.TCFunction(
|
||||||
|
self.name,
|
||||||
|
"Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
|
||||||
|
mTC.TCInParameters(
|
||||||
|
"object",
|
||||||
|
{
|
||||||
|
"url": mTC.TCInProperty(
|
||||||
|
"string",
|
||||||
|
"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
|
||||||
|
)
|
||||||
|
},
|
||||||
|
[ "url" ]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
|
||||||
try:
|
try:
|
||||||
# Get requested url
|
# Get requested url
|
||||||
got = handle_urlreq(ph, pr, "HandleHtmlText")
|
got = handle_urlreq(args['url'], inHeaders, "HandleTCHtmlText")
|
||||||
if not got.callOk:
|
if not got.callOk:
|
||||||
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
return got
|
||||||
return
|
|
||||||
# Extract Text
|
# Extract Text
|
||||||
tagDrops = ph.headers.get('htmltext-tag-drops')
|
tagDrops = inHeaders.get('htmltext-tag-drops')
|
||||||
if not tagDrops:
|
if not tagDrops:
|
||||||
tagDrops = []
|
tagDrops = []
|
||||||
else:
|
else:
|
||||||
tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
|
tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
|
||||||
textHtml = TextHtmlParser(tagDrops)
|
textHtml = TextHtmlParser(tagDrops)
|
||||||
textHtml.feed(got.contentData)
|
textHtml.feed(got.contentData.decode('utf-8'))
|
||||||
# Send back to client
|
debug.dump({ 'op': 'MCPWeb.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
|
||||||
ph.send_response(got.httpStatus)
|
return mTC.TCOutResponse(True, got.statusCode, got.statusMsg, got.contentType, textHtml.get_stripped_text().encode('utf-8'))
|
||||||
ph.send_header('Content-Type', got.contentType)
|
|
||||||
# Add CORS for browser fetch, just in case
|
|
||||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
|
||||||
ph.end_headers()
|
|
||||||
ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
|
|
||||||
debug.dump({ 'op': 'WebMagic.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
ph.send_error(502, f"WARN:HtmlText:Failed:{exc}")
|
return mTC.TCOutResponse(False, 502, f"WARN:HtmlText:Failed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
class XMLFilterParser(html.parser.HTMLParser):
|
class XMLFilterParser(html.parser.HTMLParser):
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue