SimpleChatTC:SimpleProxy: Switch to html.parser

As html can be malformed, the xml ElementTree XMLParser can't handle
it properly, so switch to the HTMLParser helper class that is
provided by python and try to extend it.

Currently this is a minimal skeleton just to get started, which
captures only the body contents.
This commit is contained in:
hanishkvc 2025-10-17 01:30:46 +05:30
parent d5f4183f7c
commit 45b05df21b
1 changed files with 24 additions and 5 deletions

View File

@ -12,7 +12,7 @@ import http.server
import urllib.parse import urllib.parse
import urllib.request import urllib.request
from dataclasses import dataclass from dataclasses import dataclass
import xml.etree.ElementTree as xmlET import html.parser
gMe = { gMe = {
@ -83,6 +83,26 @@ def handle_urlraw(ph: ProxyHandler, pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlFetchFailed:{exc}") ph.send_error(502, f"WARN:UrlFetchFailed:{exc}")
class TextHtmlParser(html.parser.HTMLParser):
def __init__(self):
super().__init__()
self.bBody = False
self.text = ""
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
if tag == 'body':
self.bBody = True
def handle_endtag(self, tag: str):
if tag == 'body':
self.bBody = False
def handle_data(self, data: str):
if self.bBody:
self.text += f"{data}\n"
def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult): def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
try: try:
# Get requested url # Get requested url
@ -91,16 +111,15 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
ph.send_error(got.httpStatus, got.httpStatusMsg) ph.send_error(got.httpStatus, got.httpStatusMsg)
return return
# Extract Text # Extract Text
html = xmlET.fromstring(got.contentData) textHtml = TextHtmlParser()
for el in html.iter(): textHtml.feed(got.contentData)
print(el)
# Send back to client # Send back to client
ph.send_response(got.httpStatus) ph.send_response(got.httpStatus)
ph.send_header('Content-Type', got.contentType) ph.send_header('Content-Type', got.contentType)
# Add CORS for browser fetch, just in case # Add CORS for browser fetch, just in case
ph.send_header('Access-Control-Allow-Origin', '*') ph.send_header('Access-Control-Allow-Origin', '*')
ph.end_headers() ph.end_headers()
ph.wfile.write(got.contentData.encode('utf-8')) ph.wfile.write(textHtml.text.encode('utf-8'))
except Exception as exc: except Exception as exc:
ph.send_error(502, f"WARN:UrlFetchFailed:{exc}") ph.send_error(502, f"WARN:UrlFetchFailed:{exc}")