SimpleChatTC:SimpleProxy:UrlText: Slightly better trimming

First, identify lines that contain only whitespace and replace them
with lines containing just a newline character.

Next, strip out adjacent lines that contain only newlines.
This commit is contained in:
hanishkvc 2025-10-17 03:02:53 +05:30
parent 266e825c68
commit bf63b8f45a
1 changed files with 23 additions and 4 deletions

View File

@ -90,6 +90,7 @@ class TextHtmlParser(html.parser.HTMLParser):
self.bBody = False
self.bCapture = False
self.text = ""
self.textStripped = ""
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
if tag == 'body':
@ -111,15 +112,33 @@ class TextHtmlParser(html.parser.HTMLParser):
if self.bCapture:
self.text += f"{data}\n"
def get_stripped_text(self):
def syncup(self):
    """Reset the working copy to the raw captured text.

    Copies ``self.text`` into ``self.textStripped`` so that the strip
    helpers, which read and write ``self.textStripped``, start from the
    unmodified capture.
    """
    captured = self.text
    self.textStripped = captured
def strip_adjacent_newlines(self):
    """Collapse runs of newlines in ``self.textStripped`` in place.

    Repeatedly replaces every occurrence of three consecutive newlines
    with a single newline until none remain, then stores the result back
    into ``self.textStripped``. ``self.text`` is not touched.

    Each replacement removes exactly two characters, so a run keeps its
    parity: an odd-length run ends up as one newline, an even-length run
    as two.
    """
    collapsed = self.textStripped
    # str.replace works on non-overlapping matches left to right, so one
    # pass can leave new triples behind; loop until none are left.
    while "\n\n\n" in collapsed:
        collapsed = collapsed.replace("\n\n\n", "\n")
    self.textStripped = collapsed
def strip_whitespace_lines(self):
    """Blank out whitespace-only lines of ``self.textStripped`` in place.

    The text is split with ``str.splitlines``; every resulting line is
    written back terminated by a single newline, and any line that
    consists solely of whitespace is reduced to just that newline.
    Non-blank lines keep their leading and trailing whitespace. Empty
    input stays empty.
    """
    # Build the result in one join instead of growing a string with +=
    # inside the loop.
    self.textStripped = "".join(
        f"{line}\n" if line.strip() else "\n"
        for line in self.textStripped.splitlines()
    )
def get_stripped_text(self):
    """Run the trimming pipeline and return the trimmed text.

    Calls, in order, ``syncup``, ``strip_whitespace_lines`` and
    ``strip_adjacent_newlines`` on this instance, then returns
    ``self.textStripped``.
    """
    pipeline = (
        self.syncup,
        self.strip_whitespace_lines,
        self.strip_adjacent_newlines,
    )
    for step in pipeline:
        step()
    return self.textStripped
def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):