def do_pdf2text(fUrl: str):
    """
    Placeholder for the pdf to text extraction logic.

    As of now this only imports pypdf, from within the function, so the
    module is loaded only when this function is actually called. fUrl is
    not yet used and nothing is returned.
    """
    import pypdf


# Url types (ie the part before '://') which handle_pdf2text will accept.
gAllowedPdfUrlTypes = [ "file", "http", "https" ]


def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    ph: the request handler used to send the response back to the client.
    pr: the parsed request url; its query is expected to carry a 'url'
        parameter pointing at the pdf.

    Responds with
    * 400, if the 'url' query parameter is missing or empty
    * 403, if the url's type is not one among gAllowedPdfUrlTypes
    * 200 status line and end of headers otherwise; no response body
      is written by this function yet.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs drops blank values by default, so a missing 'url' as well as
    # an empty 'url=' both end up as an absent key. Use get, so that this
    # leads to a 400 response, instead of a unhandled KeyError.
    urls = queryParams.get('url')
    print(f"DBUG:HandlePdf2Text:Url:{urls}")
    url = urls[0] if urls else ''
    if not url:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl!")
        return
    # A url without '://' yields the full url here, which in turn gets rejected below.
    urlType = url.split('://', 1)[0]
    if urlType not in gAllowedPdfUrlTypes:
        ph.send_error(403, f"WARN:HandlePdf2Text:ForbiddedUrlType:{urlType}:AllowedUrlTypes:{gAllowedPdfUrlTypes}")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}")
    ph.send_response_only(200, "Pdf2Text Response follows")
    ph.end_headers()