From bf63b8f45ad6e475906dc5a14b4626ec7ecd3d3b Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Fri, 17 Oct 2025 03:02:53 +0530 Subject: [PATCH] SimpleChatTC:SimpleProxy:UrlText: Slightly better trimming First, identify lines that contain only whitespace and replace them with empty lines (just a newline character). Next, collapse runs of adjacent newline-only lines. --- .../local.tools/simpleproxy.py | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index ad21cb3dc4..ad85b1b809 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -90,6 +90,7 @@ class TextHtmlParser(html.parser.HTMLParser): self.bBody = False self.bCapture = False self.text = "" + self.textStripped = "" def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): if tag == 'body': @@ -111,15 +112,33 @@ class TextHtmlParser(html.parser.HTMLParser): if self.bCapture: self.text += f"{data}\n" - def get_stripped_text(self): + def syncup(self): + self.textStripped = self.text + + def strip_adjacent_newlines(self): oldLen = -99 - newLen = len(self.text) - aStripped = self.text; + newLen = len(self.textStripped) + aStripped = self.textStripped; while oldLen != newLen: oldLen = newLen aStripped = aStripped.replace("\n\n\n","\n") newLen = len(aStripped) - return aStripped + self.textStripped = aStripped + + def strip_whitespace_lines(self): + aLines = self.textStripped.splitlines() + self.textStripped = "" + for line in aLines: + if (len(line.strip())==0): + self.textStripped += "\n" + continue + self.textStripped += f"{line}\n" + + def get_stripped_text(self): + self.syncup() + self.strip_whitespace_lines() + self.strip_adjacent_newlines() + return self.textStripped def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):