# Helper to manage web related requests
# by Humans for All

import urllib.parse
import urlvalidator as uv
from dataclasses import dataclass
import html.parser
import debug
import filemagic as mFile
import json
import re
from typing import TYPE_CHECKING, Any, cast

if TYPE_CHECKING:
    from simpleproxy import ProxyHandler



@dataclass(frozen=True)
class UrlReqResp:
    """
    Used to return result wrt urlreq helper below.
    """
    # Whether the fetch helper itself ran successfully.
    callOk: bool
    # Http status code to propagate back to the proxied client.
    httpStatus: int
    httpStatusMsg: str = ""
    contentType: str = ""
    # Fetched content decoded as utf-8 text, empty on failure.
    contentData: str = ""


def handle_urlreq(ph: 'ProxyHandler', pr: urllib.parse.ParseResult, tag: str) -> UrlReqResp:
    """
    Common part of the url request handling used by both urlraw and urltext.

    Verify the url being requested is allowed.

    Include User-Agent, Accept-Language and Accept in the generated request using
    equivalent values got in the request being proxied, so as to try mimic the
    real client, whose request we are proxying. In case a header is missing in the
    got request, fallback to using some possibly ok enough defaults.

    Fetch the requested url.
    """
    tag = f"UrlReq:{tag}"
    queryParams = urllib.parse.parse_qs(pr.query)
    # FIX: a missing 'url' query param used to raise KeyError out of this helper;
    # report it as a clean 400 to the caller instead.
    urlList = queryParams.get('url')
    if not urlList:
        return UrlReqResp(False, 400, f"WARN:{tag}:Missing url query param")
    print(f"DBUG:{tag}:Url:{urlList}")
    url = urlList[0]
    gotVU = uv.validate_url(url, tag)
    if not gotVU.callOk:
        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
    try:
        # FIX: the docstring promises fallback defaults for headers missing in the
        # proxied request, but None values were being passed through previously;
        # supply possibly ok enough defaults instead.
        headers = {
            'User-Agent': ph.headers.get('User-Agent', 'Mozilla/5.0'),
            'Accept': ph.headers.get('Accept', '*/*'),
            'Accept-Language': ph.headers.get('Accept-Language', 'en-US,en;q=0.9'),
        }
        # Get requested url
        gotFile = mFile.get_file(url, tag, "text/html", headers)
        return UrlReqResp(gotFile.callOk, gotFile.statusCode, gotFile.statusMsg, gotFile.contentType, gotFile.contentData.decode('utf-8'))
    except Exception as exc:
        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")


def handle_urlraw(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Fetch the url specified in the proxied request and send its content
    back to the client as is.
    """
    try:
        # Get requested url
        got = handle_urlreq(ph, pr, "HandleUrlRaw")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Send back to client
        ph.send_response(got.httpStatus)
        ph.send_header('Content-Type', got.contentType)
        # Add CORS for browser fetch, just in case
        ph.send_header('Access-Control-Allow-Origin', '*')
        ph.end_headers()
        ph.wfile.write(got.contentData.encode('utf-8'))
    except Exception as exc:
        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")


class TextHtmlParser(html.parser.HTMLParser):
    """
    A simple minded logic used to strip html content of
    * all the html tags as well as
    * all the contents belonging to below predefined tags like script, style, header, ...

    NOTE: if the html content/page uses any javascript for client side manipulation/generation of
    html content, that logic wont be triggered, so also such client side dynamic content wont be
    got.

    Supports one to specify a list of tags and their corresponding id attributes, so that contents
    within such specified blocks will be dropped.

    * this works properly only if the html being processed has proper opening and ending tags
      around the area of interest.
    * remember to specify non overlapping tag blocks, if more than one specified for dropping.
    * this path not tested, but should logically work

    This helps return a relatively clean textual representation of the html file/content being parsed.
    """

    def __init__(self, tagDrops: list[dict[str, Any]]):
        """
        tagDrops - list of { 'tag': ..., 'id': ... } dicts identifying blocks
        whose contents should be dropped fully.
        """
        super().__init__()
        self.tagDrops = tagDrops
        print(f"DBUG:TextHtmlParser:{self.tagDrops}")
        # Tracks whether parsing is currently inside each monitored tag type.
        self.inside = {
            'body': False,
            'script': False,
            'style': False,
            'header': False,
            'footer': False,
            'nav': False,
        }
        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
        self.bCapture = False
        # Raw captured text and its stripped/cleaned version.
        self.text = ""
        self.textStripped = ""
        # Nesting tracking wrt the currently-being-dropped tag block, if any.
        self.droptagType = None
        self.droptagCount = 0

    def do_capture(self) -> bool:
        """
        Helps decide whether to capture contents or discard them.

        Capture only inside body, and only when not inside any of the
        non-content tags nor inside a caller-specified drop block.
        """
        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav'] or (self.droptagCount > 0)):
            return True
        return False

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
        if tag in self.monitored:
            self.inside[tag] = True
        for tagMeta in self.tagDrops:
            if tag != tagMeta['tag']:
                continue
            # Already inside a drop block of this same tag type: count the
            # nesting so the matching end tags unwind correctly.
            if (self.droptagCount > 0) and (self.droptagType == tag):
                self.droptagCount += 1
                continue
            for attr in attrs:
                if attr[0] != 'id':
                    continue
                if attr[1] == tagMeta['id']:
                    self.droptagCount += 1
                    self.droptagType = tag
                    print(f"DBUG:THP:Start:Tag found [{tag}:{attr[1]}]...")

    def handle_endtag(self, tag: str):
        if tag in self.monitored:
            self.inside[tag] = False
        if self.droptagType and (tag == self.droptagType):
            self.droptagCount -= 1
            if self.droptagCount == 0:
                self.droptagType = None
                print("DBUG:THP:End:Tag found...")
            # Clamp, in case of unbalanced end tags in the html.
            if self.droptagCount < 0:
                self.droptagCount = 0

    def handle_data(self, data: str):
        if self.do_capture():
            self.text += f"{data}\n"

    def syncup(self):
        # (Re)start the stripping pipeline from the raw captured text.
        self.textStripped = self.text

    def strip_adjacent_newlines(self):
        """
        Repeatedly collapse runs of newlines until the text stops shrinking.
        """
        oldLen = -99
        newLen = len(self.textStripped)
        aStripped = self.textStripped
        while oldLen != newLen:
            oldLen = newLen
            aStripped = aStripped.replace("\n\n\n","\n")
            newLen = len(aStripped)
        self.textStripped = aStripped

    def strip_whitespace_lines(self):
        """
        Replace lines containing only whitespace with empty lines.
        """
        aLines = self.textStripped.splitlines()
        self.textStripped = ""
        for line in aLines:
            if (len(line.strip())==0):
                self.textStripped += "\n"
                continue
            self.textStripped += f"{line}\n"

    def get_stripped_text(self) -> str:
        """
        Return the cleaned up textual representation of the parsed html.
        """
        self.syncup()
        self.strip_whitespace_lines()
        self.strip_adjacent_newlines()
        return self.textStripped


def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Fetch the url specified in the proxied request, strip the html down to
    its textual content and send that back to the client.

    Optional 'htmltext-tag-drops' request header carries a json list of
    { tag, id } blocks to drop from the extracted text.
    """
    try:
        # Get requested url
        got = handle_urlreq(ph, pr, "HandleHtmlText")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Extract Text
        tagDrops = ph.headers.get('htmltext-tag-drops')
        if not tagDrops:
            tagDrops = []
        else:
            tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops))
        textHtml = TextHtmlParser(tagDrops)
        textHtml.feed(got.contentData)
        # FIX: compute the stripped text once, instead of re-running the whole
        # strip pipeline a second time for the debug dump below.
        strippedText = textHtml.get_stripped_text()
        # Send back to client
        ph.send_response(got.httpStatus)
        ph.send_header('Content-Type', got.contentType)
        # Add CORS for browser fetch, just in case
        ph.send_header('Access-Control-Allow-Origin', '*')
        ph.end_headers()
        ph.wfile.write(strippedText.encode('utf-8'))
        debug.dump({ 'op': 'WebMagic.HtmlText', 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': strippedText })
    except Exception as exc:
        ph.send_error(502, f"WARN:HtmlText:Failed:{exc}")


class XMLFilterParser(html.parser.HTMLParser):
    """
    A simple minded logic used to strip xml content of
    * unwanted tags and their contents, using re
    * this works properly only if the xml being processed has
      proper opening and ending tags around the area of interest.

    This can help return a cleaned up xml file.
    """

    def __init__(self, tagDropREs: list[str]):
        """
        tagDropREs - allows one to specify a list of tags related REs,
        to help drop the corresponding tags and their contents fully.

        To drop a tag, specify regular expression
        * that matches the corresponding heirarchy of tags involved
        * where the tag names should be in lower case and suffixed with :
        * if interested in dropping a tag independent of where it appears use
          ".*:tagname:.*" re template
        """
        super().__init__()
        self.tagDropREs = list(map(str.lower, tagDropREs))
        print(f"DBUG:XMLFilterParser:{self.tagDropREs}")
        self.text = ""
        # Stack of open tags, used to build the current tag hierarchy.
        self.prefixTags = []
        # Tab based indent prefix, grown/shrunk only for captured tags.
        self.prefix = ""
        self.lastTrackedCB = ""

    def do_capture(self) -> bool:
        """
        Helps decide whether to capture contents or discard them, by matching
        the current tag hierarchy against the caller given drop REs.
        """
        curTagH = f'{":".join(self.prefixTags)}:'
        for dropRE in self.tagDropREs:
            if re.match(dropRE, curTagH):
                return False
        return True

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
        self.prefixTags.append(tag)
        if not self.do_capture():
            return
        self.lastTrackedCB = "starttag"
        self.prefix += "\t"
        self.text += f"\n{self.prefix}<{tag}>"

    def handle_endtag(self, tag: str):
        if self.do_capture():
            # NOTE(review): the closing tag text below was reconstructed; the
            # mangled source showed empty f-strings here, while handle_starttag
            # clearly emits <tag> — confirm against the original webmagic source.
            if (self.lastTrackedCB == "endtag"):
                self.text += f"\n{self.prefix}</{tag}>"
            else:
                self.text += f"</{tag}>"
            # FIX: the indent prefix is only grown for captured tags in
            # handle_starttag, so only shrink it for captured tags here;
            # an unconditional trim corrupts indentation after a dropped block.
            self.prefix = self.prefix[:-1]
        self.lastTrackedCB = "endtag"
        self.prefixTags.pop()

    def handle_data(self, data: str):
        if self.do_capture():
            self.text += f"{data}"


def handle_xmlfiltered(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Fetch the url specified in the proxied request, drop unwanted xml tag
    blocks and send the filtered xml back to the client.

    Optional 'xmlfiltered-tagdrop-res' request header carries a json list of
    tag-hierarchy REs identifying blocks to drop.
    """
    try:
        # Get requested url
        got = handle_urlreq(ph, pr, "HandleXMLFiltered")
        if not got.callOk:
            ph.send_error(got.httpStatus, got.httpStatusMsg)
            return
        # Extract Text
        tagDropREs = ph.headers.get('xmlfiltered-tagdrop-res')
        if not tagDropREs:
            tagDropREs = []
        else:
            tagDropREs = cast(list[str], json.loads(tagDropREs))
        xmlFiltered = XMLFilterParser(tagDropREs)
        xmlFiltered.feed(got.contentData)
        # Send back to client
        ph.send_response(got.httpStatus)
        ph.send_header('Content-Type', got.contentType)
        # Add CORS for browser fetch, just in case
        ph.send_header('Access-Control-Allow-Origin', '*')
        ph.end_headers()
        ph.wfile.write(xmlFiltered.text.encode('utf-8'))
        debug.dump({ 'XMLFiltered': 'yes' }, { 'RawText': xmlFiltered.text })
    except Exception as exc:
        ph.send_error(502, f"WARN:XMLFiltered:Failed:{exc}")