SimpleChatTC:SimpleProxy:Move pdf logic into its own module

This commit is contained in:
hanishkvc 2025-11-02 16:36:41 +05:30
parent b18aed4449
commit a7de002fd0
2 changed files with 60 additions and 53 deletions

View File

@ -0,0 +1,58 @@
# Helper to manage pdf related requests
# by Humans for All
import urllib.parse
import urlvalidator as uv
import simpleproxy as root
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract the text of a page range from the pdf file that url points to.

    url is first checked with uv.validate_url; if that check fails, its
    status code and message are returned without touching the file.
    Otherwise the path component of url is opened as a local file.

    startPN and endPN are 0-based, inclusive page indexes. A negative
    startPN means the first page; a negative endPN, or one at or beyond
    the page count, means the last page. If startPN ends up greater than
    endPN, the extracted text is empty.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted
    text) when status is 200.

    Errors raised while opening/reading the file or by pypdf are not
    caught here and propagate to the caller.
    """
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Context manager so the file handle is released even if the read fails
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # Join once instead of growing the string page by page
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
def handle_pdf2text(ph: "root.ProxyHandler", pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    Query parameters read from pr.query:
    * url - required; passed on to process_pdf2text.
    * startPageNumber, endPageNumber - optional integers; when absent or
      blank, -1 is passed on to process_pdf2text.

    Responds with 400 if url is missing or a page number is not an
    integer, with the status reported by process_pdf2text if extraction
    fails, and otherwise with 200 and the extracted text as the body.

    The ph annotation is quoted so that it is not evaluated while this
    module is being loaded; this module imports simpleproxy, which
    presumably imports this module in turn, so ProxyHandler may not be
    defined yet at that point.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs leaves out absent as well as blank-valued parameters,
    # so look them up with a default instead of indexing directly.
    url = queryParams.get('url', [''])[0]
    if not url:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    startP = queryParams.get('startPageNumber', [''])[0]
    endP = queryParams.get('endPageNumber', [''])[0]
    try:
        startP = int(startP) if startP else -1
        endP = int(endP) if endP else -1
    except ValueError:
        ph.send_error(400, "WARN:HandlePdf2Text:InvalidPageNumber")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type; 'text/plain'
    # was presumably intended - confirm what clients expect before changing.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))

View File

@ -27,6 +27,7 @@ import html.parser
import time
import urlvalidator as uv
from typing import Callable
import pdfmagic as mPdf
gMe = {
@ -136,7 +137,7 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
case '/urltext':
self.auth_and_run(pr, handle_urltext)
case '/pdf2text':
self.auth_and_run(pr, handle_pdf2text)
self.auth_and_run(pr, mPdf.handle_pdf2text)
case '/aum':
handle_aum(self, pr)
case _:
@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract the text of a page range from the pdf file that url points to.

    url is first checked with uv.validate_url; if that check fails, its
    status code and message are returned without touching the file.
    Otherwise the path component of url is opened as a local file.

    startPN and endPN are 0-based, inclusive page indexes. A negative
    startPN means the first page; a negative endPN, or one at or beyond
    the page count, means the last page. If startPN ends up greater than
    endPN, the extracted text is empty.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted
    text) when status is 200.

    Errors raised while opening/reading the file or by pypdf are not
    caught here and propagate to the caller.
    """
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Context manager so the file handle is released even if the read fails
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # Join once instead of growing the string page by page
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    Query parameters read from pr.query:
    * url - required; passed on to process_pdf2text.
    * startPageNumber, endPageNumber - optional integers; when absent or
      blank, -1 is passed on to process_pdf2text.

    Responds with 400 if url is missing or a page number is not an
    integer, with the status reported by process_pdf2text if extraction
    fails, and otherwise with 200 and the extracted text as the body.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs leaves out absent as well as blank-valued parameters,
    # so look them up with a default instead of indexing directly.
    url = queryParams.get('url', [''])[0]
    if not url:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    startP = queryParams.get('startPageNumber', [''])[0]
    endP = queryParams.get('endPageNumber', [''])[0]
    try:
        startP = int(startP) if startP else -1
        endP = int(endP) if endP else -1
    except ValueError:
        ph.send_error(400, "WARN:HandlePdf2Text:InvalidPageNumber")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type; 'text/plain'
    # was presumably intended - confirm what clients expect before changing.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
def load_config():
"""