SimpleChatTC:SimpleProxy: Move web requests to its own module

parent a7de002fd0
commit 350d7d77e0
simpleproxy.py
@@ -21,13 +21,11 @@
 import sys
 import http.server
 import urllib.parse
-import urllib.request
-from dataclasses import dataclass
-import html.parser
 import time
 import urlvalidator as uv
 from typing import Callable
 import pdfmagic as mPdf
+import webmagic as mWeb


 gMe = {
@@ -133,9 +131,9 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
         print(f"DBUG:ProxyHandler:GET:{pr}")
         match pr.path:
             case '/urlraw':
-                self.auth_and_run(pr, handle_urlraw)
+                self.auth_and_run(pr, mWeb.handle_urlraw)
             case '/urltext':
-                self.auth_and_run(pr, handle_urltext)
+                self.auth_and_run(pr, mWeb.handle_urltext)
             case '/pdf2text':
                 self.auth_and_run(pr, mPdf.handle_pdf2text)
             case '/aum':
@@ -175,18 +173,6 @@ def handle_aum(ph: ProxyHandler, pr: urllib.parse.ParseResult):
     ph.end_headers()


-@dataclass(frozen=True)
-class UrlReqResp:
-    """
-    Used to return the result wrt the urlreq helper below.
-    """
-    callOk: bool
-    httpStatus: int
-    httpStatusMsg: str = ""
-    contentType: str = ""
-    contentData: str = ""
-
-
 def debug_dump(meta: dict, data: dict):
     if not gMe['--debug']:
         return
@@ -199,167 +185,6 @@ def debug_dump(meta: dict, data: dict):
         f.write(f"\n\n\n\n{k}:{data[k]}\n\n\n\n")


-def handle_urlreq(ph: ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
-    """
-    Common part of the url request handling used by both urlraw and urltext.
-
-    Verify the url being requested is allowed.
-
-    Include User-Agent, Accept-Language and Accept in the generated request using
-    equivalent values from the request being proxied, so as to try to mimic the
-    real client whose request we are proxying. If a header is missing in the
-    received request, fall back to some hopefully ok-enough defaults.
-
-    Fetch the requested url.
-    """
-    tag = f"UrlReq:{tag}"
-    queryParams = urllib.parse.parse_qs(pr.query)
-    url = queryParams['url']
-    print(f"DBUG:{tag}:Url:{url}")
-    url = url[0]
-    gotVU = uv.validate_url(url, tag)
-    if not gotVU.callOk:
-        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
-    try:
-        hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
-        hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
-        hA = ph.headers.get('Accept', "text/html,*/*")
-        headers = {
-            'User-Agent': hUA,
-            'Accept': hA,
-            'Accept-Language': hAL
-        }
-        req = urllib.request.Request(url, headers=headers)
-        # Get requested url
-        print(f"DBUG:{tag}:Req:{req.full_url}:{req.headers}")
-        with urllib.request.urlopen(req, timeout=10) as response:
-            contentData = response.read().decode('utf-8')
-            statusCode = response.status or 200
-            contentType = response.getheader('Content-Type') or 'text/html'
-        debug_dump({ 'url': req.full_url, 'headers': req.headers, 'ctype': contentType }, { 'cdata': contentData })
-        return UrlReqResp(True, statusCode, "", contentType, contentData)
-    except Exception as exc:
-        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")
-
-
-def handle_urlraw(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlRaw")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(got.contentData.encode('utf-8'))
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")
-
-
-class TextHtmlParser(html.parser.HTMLParser):
-    """
-    A simple-minded logic used to strip html content of
-    * all the html tags, as well as
-    * all the contents belonging to the predefined tags below, like script, style, header, ...
-
-    NOTE: if the html content/page uses any javascript for client-side manipulation/generation
-    of html content, that logic won't be triggered, so such client-side dynamic content won't
-    be captured.
-
-    This helps return a relatively clean textual representation of the html file/content being parsed.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.inside = {
-            'body': False,
-            'script': False,
-            'style': False,
-            'header': False,
-            'footer': False,
-            'nav': False
-        }
-        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
-        self.bCapture = False
-        self.text = ""
-        self.textStripped = ""
-
-    def do_capture(self):
-        """
-        Helps decide whether to capture contents or discard them.
-        """
-        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
-            return True
-        return False
-
-    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
-        if tag in self.monitored:
-            self.inside[tag] = True
-
-    def handle_endtag(self, tag: str):
-        if tag in self.monitored:
-            self.inside[tag] = False
-
-    def handle_data(self, data: str):
-        if self.do_capture():
-            self.text += f"{data}\n"
-
-    def syncup(self):
-        self.textStripped = self.text
-
-    def strip_adjacent_newlines(self):
-        oldLen = -99
-        newLen = len(self.textStripped)
-        aStripped = self.textStripped
-        while oldLen != newLen:
-            oldLen = newLen
-            aStripped = aStripped.replace("\n\n\n", "\n")
-            newLen = len(aStripped)
-        self.textStripped = aStripped
-
-    def strip_whitespace_lines(self):
-        aLines = self.textStripped.splitlines()
-        self.textStripped = ""
-        for line in aLines:
-            if len(line.strip()) == 0:
-                self.textStripped += "\n"
-                continue
-            self.textStripped += f"{line}\n"
-
-    def get_stripped_text(self):
-        self.syncup()
-        self.strip_whitespace_lines()
-        self.strip_adjacent_newlines()
-        return self.textStripped
-
-
-def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    try:
-        # Get requested url
-        got = handle_urlreq(ph, pr, "HandleUrlText")
-        if not got.callOk:
-            ph.send_error(got.httpStatus, got.httpStatusMsg)
-            return
-        # Extract Text
-        textHtml = TextHtmlParser()
-        textHtml.feed(got.contentData)
-        # Send back to client
-        ph.send_response(got.httpStatus)
-        ph.send_header('Content-Type', got.contentType)
-        # Add CORS for browser fetch, just in case
-        ph.send_header('Access-Control-Allow-Origin', '*')
-        ph.end_headers()
-        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
-        debug_dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
-    except Exception as exc:
-        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
-
-
 def load_config():
     """
     Allow loading of a json based config file
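The strip_adjacent_newlines helper removed above (and re-added in the new module below) loops until the text length stops changing, because a single pass of replace("\n\n\n", "\n") can itself leave behind fresh runs of three newlines. A minimal standalone demo of that fixed-point behaviour, not part of the commit:

    # Seven consecutive newlines need two collapsing passes before stabilizing.
    s = "\n" * 7
    while True:
        t = s.replace("\n\n\n", "\n")  # one non-overlapping pass: 7 -> 3 -> 1
        if len(t) == len(s):           # length unchanged: fixed point reached
            break
        s = t
    print(repr(s))  # prints '\n'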
webmagic.py (new file)
@@ -0,0 +1,181 @@
+# Helper to manage web related requests
+# by Humans for All
+
+import urllib.parse
+import urllib.request
+import simpleproxy as root
+import urlvalidator as uv
+from dataclasses import dataclass
+import html.parser
+
+
+@dataclass(frozen=True)
+class UrlReqResp:
+    """
+    Used to return the result wrt the urlreq helper below.
+    """
+    callOk: bool
+    httpStatus: int
+    httpStatusMsg: str = ""
+    contentType: str = ""
+    contentData: str = ""
+
+
+def handle_urlreq(ph: root.ProxyHandler, pr: urllib.parse.ParseResult, tag: str):
+    """
+    Common part of the url request handling used by both urlraw and urltext.
+
+    Verify the url being requested is allowed.
+
+    Include User-Agent, Accept-Language and Accept in the generated request using
+    equivalent values from the request being proxied, so as to try to mimic the
+    real client whose request we are proxying. If a header is missing in the
+    received request, fall back to some hopefully ok-enough defaults.
+
+    Fetch the requested url.
+    """
+    tag = f"UrlReq:{tag}"
+    queryParams = urllib.parse.parse_qs(pr.query)
+    url = queryParams['url']
+    print(f"DBUG:{tag}:Url:{url}")
+    url = url[0]
+    gotVU = uv.validate_url(url, tag)
+    if not gotVU.callOk:
+        return UrlReqResp(gotVU.callOk, gotVU.statusCode, gotVU.statusMsg)
+    try:
+        hUA = ph.headers.get('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0')
+        hAL = ph.headers.get('Accept-Language', "en-US,en;q=0.9")
+        hA = ph.headers.get('Accept', "text/html,*/*")
+        headers = {
+            'User-Agent': hUA,
+            'Accept': hA,
+            'Accept-Language': hAL
+        }
+        req = urllib.request.Request(url, headers=headers)
+        # Get requested url
+        print(f"DBUG:{tag}:Req:{req.full_url}:{req.headers}")
+        with urllib.request.urlopen(req, timeout=10) as response:
+            contentData = response.read().decode('utf-8')
+            statusCode = response.status or 200
+            contentType = response.getheader('Content-Type') or 'text/html'
+        root.debug_dump({ 'url': req.full_url, 'headers': req.headers, 'ctype': contentType }, { 'cdata': contentData })
+        return UrlReqResp(True, statusCode, "", contentType, contentData)
+    except Exception as exc:
+        return UrlReqResp(False, 502, f"WARN:{tag}:Failed:{exc}")
+
+
+def handle_urlraw(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
+    try:
+        # Get requested url
+        got = handle_urlreq(ph, pr, "HandleUrlRaw")
+        if not got.callOk:
+            ph.send_error(got.httpStatus, got.httpStatusMsg)
+            return
+        # Send back to client
+        ph.send_response(got.httpStatus)
+        ph.send_header('Content-Type', got.contentType)
+        # Add CORS for browser fetch, just in case
+        ph.send_header('Access-Control-Allow-Origin', '*')
+        ph.end_headers()
+        ph.wfile.write(got.contentData.encode('utf-8'))
+    except Exception as exc:
+        ph.send_error(502, f"WARN:UrlRawFailed:{exc}")
+
+
+class TextHtmlParser(html.parser.HTMLParser):
+    """
+    A simple-minded logic used to strip html content of
+    * all the html tags, as well as
+    * all the contents belonging to the predefined tags below, like script, style, header, ...
+
+    NOTE: if the html content/page uses any javascript for client-side manipulation/generation
+    of html content, that logic won't be triggered, so such client-side dynamic content won't
+    be captured.
+
+    This helps return a relatively clean textual representation of the html file/content being parsed.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.inside = {
+            'body': False,
+            'script': False,
+            'style': False,
+            'header': False,
+            'footer': False,
+            'nav': False
+        }
+        self.monitored = [ 'body', 'script', 'style', 'header', 'footer', 'nav' ]
+        self.bCapture = False
+        self.text = ""
+        self.textStripped = ""
+
+    def do_capture(self):
+        """
+        Helps decide whether to capture contents or discard them.
+        """
+        if self.inside['body'] and not (self.inside['script'] or self.inside['style'] or self.inside['header'] or self.inside['footer'] or self.inside['nav']):
+            return True
+        return False
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
+        if tag in self.monitored:
+            self.inside[tag] = True
+
+    def handle_endtag(self, tag: str):
+        if tag in self.monitored:
+            self.inside[tag] = False
+
+    def handle_data(self, data: str):
+        if self.do_capture():
+            self.text += f"{data}\n"
+
+    def syncup(self):
+        self.textStripped = self.text
+
+    def strip_adjacent_newlines(self):
+        oldLen = -99
+        newLen = len(self.textStripped)
+        aStripped = self.textStripped
+        while oldLen != newLen:
+            oldLen = newLen
+            aStripped = aStripped.replace("\n\n\n", "\n")
+            newLen = len(aStripped)
+        self.textStripped = aStripped
+
+    def strip_whitespace_lines(self):
+        aLines = self.textStripped.splitlines()
+        self.textStripped = ""
+        for line in aLines:
+            if len(line.strip()) == 0:
+                self.textStripped += "\n"
+                continue
+            self.textStripped += f"{line}\n"
+
+    def get_stripped_text(self):
+        self.syncup()
+        self.strip_whitespace_lines()
+        self.strip_adjacent_newlines()
+        return self.textStripped
+
+
+def handle_urltext(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
+    try:
+        # Get requested url
+        got = handle_urlreq(ph, pr, "HandleUrlText")
+        if not got.callOk:
+            ph.send_error(got.httpStatus, got.httpStatusMsg)
+            return
+        # Extract Text
+        textHtml = TextHtmlParser()
+        textHtml.feed(got.contentData)
+        # Send back to client
+        ph.send_response(got.httpStatus)
+        ph.send_header('Content-Type', got.contentType)
+        # Add CORS for browser fetch, just in case
+        ph.send_header('Access-Control-Allow-Origin', '*')
+        ph.end_headers()
+        ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
+        root.debug_dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
+    except Exception as exc:
+        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
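A minimal sketch, not part of the commit, of exercising the relocated TextHtmlParser directly; it assumes webmagic.py and the modules it imports (including simpleproxy) can be imported without starting the proxy server:

    import webmagic as mWeb

    parser = mWeb.TextHtmlParser()
    parser.feed("<html><body><nav>menu</nav>"
                "<p>Hello world</p>"
                "<script>ignored()</script></body></html>")
    # nav and script contents are discarded; only plain body text is kept
    print(parser.get_stripped_text())  # prints: Hello world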
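And a sketch of driving the moved endpoints through a running instance of the proxy; the listen address here is hypothetical, and whatever check auth_and_run performs must also be satisfied:

    import urllib.parse
    import urllib.request

    base = "http://127.0.0.1:3128"  # assumed host:port, adjust to your setup
    target = urllib.parse.quote("https://example.com/", safe="")
    # /urltext fetches the url server side and returns the tag-stripped text
    with urllib.request.urlopen(f"{base}/urltext?url={target}", timeout=10) as resp:
        print(resp.read().decode("utf-8"))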