diff --git a/tools/server/public_simplechat/local.tools/pdfmagic.py b/tools/server/public_simplechat/local.tools/pdfmagic.py
new file mode 100644
index 0000000000..407674b0f6
--- /dev/null
+++ b/tools/server/public_simplechat/local.tools/pdfmagic.py
@@ -0,0 +1,71 @@
+# Helper to manage pdf related requests
+# by Humans for All
+
+import urllib.parse
+import urlvalidator as uv
+import simpleproxy as root
+
+
+def process_pdf2text(url: str, startPN: int, endPN: int):
+    """
+    Extract plain text from the pages startPN..endPN (0 based, inclusive)
+    of the pdf file found at the path component of url.
+
+    A negative startPN selects the first page; a negative or too large
+    endPN selects the last page. An empty page range yields empty text.
+
+    Returns a dict with 'status' and 'msg', and on success (status 200)
+    also 'data' holding the extracted text. If url validation fails, the
+    validator's status code and message are returned instead.
+    """
+    import pypdf
+    import io
+    gotVU = uv.validate_url(url, "HandlePdf2Text")
+    if not gotVU.callOk:
+        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
+    urlParts = urllib.parse.urlparse(url)
+    # Read everything up front so that the file gets closed immediately
+    with open(urlParts.path, 'rb') as fPdf:
+        dPdf = fPdf.read()
+    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
+    if (startPN < 0):
+        startPN = 0
+    if (endPN < 0) or (endPN >= len(oPdf.pages)):
+        endPN = len(oPdf.pages)-1
+    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
+    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
+
+
+def handle_pdf2text(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
+    """
+    Handle requests to pdf2text path, which is used to extract plain text
+    from the specified pdf file.
+
+    Query parameters
+    * url: the pdf to process; a 400 error is sent if it is missing.
+    * startPageNumber, endPageNumber: optional page range; when missing
+      or blank, -1 is passed on, ie first and last page respectively.
+    """
+    queryParams = urllib.parse.parse_qs(pr.query)
+    # parse_qs drops blank values by default, so a missing or blank param
+    # has no entry at all; use explicit fallbacks instead of a KeyError.
+    url = queryParams.get('url', [''])[0]
+    if not url:
+        ph.send_error(400, "WARN:Pdf2TextFailed:Missing url")
+        return
+    startP = queryParams.get('startPageNumber', [''])[0]
+    startP = int(startP) if startP else -1
+    endP = queryParams.get('endPageNumber', [''])[0]
+    endP = int(endP) if endP else -1
+    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
+    gotP2T = process_pdf2text(url, startP, endP)
+    if (gotP2T['status'] != 200):
+        ph.send_error(gotP2T['status'], gotP2T['msg'] )
+        return
+    ph.send_response(gotP2T['status'], gotP2T['msg'])
+    ph.send_header('Content-Type', 'text/text')
+    # Add CORS for browser fetch, just in case
+    ph.send_header('Access-Control-Allow-Origin', '*')
+    ph.end_headers()
+    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
+    ph.wfile.write(gotP2T['data'].encode('utf-8'))
diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py
index b3baf76459..2c289a45ae 100644
--- a/tools/server/public_simplechat/local.tools/simpleproxy.py
+++ b/tools/server/public_simplechat/local.tools/simpleproxy.py
@@ -27,6 +27,7 @@ import html.parser
 import time
 import urlvalidator as uv
 from typing import Callable
+import pdfmagic as mPdf
 
 
 gMe = {
@@ -136,7 +137,7 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
             case '/urltext':
                 self.auth_and_run(pr, handle_urltext)
             case '/pdf2text':
-                self.auth_and_run(pr, handle_pdf2text)
+                self.auth_and_run(pr, mPdf.handle_pdf2text)
             case '/aum':
                 handle_aum(self, pr)
             case _:
@@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
         ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
 
 
-def process_pdf2text(url: str, startPN: int, endPN: int):
-    import pypdf
-    import io
-    gotVU = uv.validate_url(url, "HandlePdf2Text")
-    if not gotVU.callOk:
-        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
-    urlParts = urllib.parse.urlparse(url)
-    fPdf = open(urlParts.path, 'rb')
-    dPdf = fPdf.read()
-    tPdf = ""
-    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
-    if (startPN < 0):
-        startPN = 0
-    if (endPN < 0) or (endPN >= len(oPdf.pages)):
-        endPN = len(oPdf.pages)-1
-    for i in range(startPN, endPN+1):
-        pd = oPdf.pages[i]
-        tPdf = tPdf + pd.extract_text()
-    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
-
-
-def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
-    """
-    Handle requests to pdf2text path, which is used to extract plain text
-    from the specified pdf file.
-    """
-    queryParams = urllib.parse.parse_qs(pr.query)
-    url = queryParams['url'][0]
-    startP = queryParams['startPageNumber'][0]
-    if startP:
-        startP = int(startP)
-    else:
-        startP = -1
-    endP = queryParams['endPageNumber'][0]
-    if endP:
-        endP = int(endP)
-    else:
-        endP = -1
-    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
-    gotP2T = process_pdf2text(url, startP, endP)
-    if (gotP2T['status'] != 200):
-        ph.send_error(gotP2T['status'], gotP2T['msg'] )
-        return
-    ph.send_response(gotP2T['status'], gotP2T['msg'])
-    ph.send_header('Content-Type', 'text/text')
-    # Add CORS for browser fetch, just in case
-    ph.send_header('Access-Control-Allow-Origin', '*')
-    ph.end_headers()
-    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
-    ph.wfile.write(gotP2T['data'].encode('utf-8'))
-
-
 def load_config():
     """