From 45b05df21bb8e9dfe37f43cac07284a73b0c8c72 Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Fri, 17 Oct 2025 01:30:46 +0530 Subject: [PATCH] SimpleChatTC:SimpleProxy: Switch to html.parser As html can be malformed, xml ElementTree XMLParser cant handle the same properly, so switch to the HtmlParser helper class that is provided by python and try extend it. Currently a minimal skeleton to just start it out, which captures only the body contents. --- .../local.tools/simpleproxy.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index 0014b0219b..4ac26b6b22 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -12,7 +12,7 @@ import http.server import urllib.parse import urllib.request from dataclasses import dataclass -import xml.etree.ElementTree as xmlET +import html.parser gMe = { @@ -83,6 +83,26 @@ def handle_urlraw(ph: ProxyHandler, pr: urllib.parse.ParseResult): ph.send_error(502, f"WARN:UrlFetchFailed:{exc}") +class TextHtmlParser(html.parser.HTMLParser): + + def __init__(self): + super().__init__() + self.bBody = False + self.text = "" + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): + if tag == 'body': + self.bBody = True + + def handle_endtag(self, tag: str): + if tag == 'body': + self.bBody = False + + def handle_data(self, data: str): + if self.bBody: + self.text += f"{data}\n" + + def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult): try: # Get requested url @@ -91,16 +111,15 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult): ph.send_error(got.httpStatus, got.httpStatusMsg) return # Extract Text - html = xmlET.fromstring(got.contentData) - for el in html.iter(): - print(el) + textHtml = TextHtmlParser() + textHtml.feed(got.contentData) # Send back to client ph.send_response(got.httpStatus) ph.send_header('Content-Type', got.contentType) # Add CORS for browser fetch, just in case ph.send_header('Access-Control-Allow-Origin', '*') ph.end_headers() - ph.wfile.write(got.contentData.encode('utf-8')) + ph.wfile.write(textHtml.text.encode('utf-8')) except Exception as exc: ph.send_error(502, f"WARN:UrlFetchFailed:{exc}")