SimpleChatTC:SimpleProxy:Move pdf logic into its own module
This commit is contained in:
parent
b18aed4449
commit
a7de002fd0
|
|
@ -0,0 +1,58 @@
|
|||
# Helper to manage pdf related requests
|
||||
# by Humans for All
|
||||
|
||||
import urllib.parse
|
||||
import urlvalidator as uv
|
||||
import simpleproxy as root
|
||||
|
||||
|
||||
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract the text of a range of pages from the pdf file referenced by url.

    The url is first checked using uv.validate_url; if that check fails, its
    status code and message are handed back without touching the file.

    Only the path component of the url is used, and it is opened as a file on
    the local filesystem.
    NOTE(review): the scheme/host of the url are ignored here; presumably
    uv.validate_url restricts what is allowed - confirm.

    startPN and endPN are 0-based indexes into the pdf's pages, both inclusive.
    A negative startPN means the first page; a negative endPN, or one beyond
    the last page, means the last page. If startPN ends up greater than endPN
    the extracted text is empty.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted text)
    when status is 200.

    Errors raised while opening/reading the file or by pypdf are not caught
    here, they propagate to the caller.
    """
    # Imported lazily, so that pypdf is needed only when this path is used
    import pypdf
    import io

    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Context manager ensures the file handle is released, even if read fails
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    lastPN = len(oPdf.pages) - 1
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN > lastPN):
        endPN = lastPN
    # Join once at the end, instead of growing a string page by page
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
||||
|
||||
|
||||
def handle_pdf2text(ph: "root.ProxyHandler", pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    The query part of pr is expected to carry
    * url: the pdf to process (required)
    * startPageNumber, endPageNumber: the page range to extract.
      If either is missing or empty, -1 is handed to process_pdf2text for it.

    Responses sent through ph
    * 400 if url is missing/empty or a page number is not an integer
    * 502 if the extraction itself raises an exception
    * the status and msg returned by process_pdf2text, if it is not 200
    * else 200, with the extracted text utf-8 encoded as the body
    """
    # keep_blank_values, so that an explicitly empty value ("startPageNumber=")
    # reaches the fallbacks below, instead of being dropped by parse_qs
    queryParams = urllib.parse.parse_qs(pr.query, keep_blank_values=True)
    url = queryParams.get('url', [''])[0]
    if not url:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    pageNums = []
    for key in ('startPageNumber', 'endPageNumber'):
        val = queryParams.get(key, [''])[0]
        try:
            pageNums.append(int(val) if val else -1)
        except ValueError:
            # The offending value is not echoed, as the message ends up in the
            # http status line
            ph.send_error(400, f"WARN:HandlePdf2Text:Invalid:{key}")
            return
    startP, endP = pageNums
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    try:
        gotP2T = process_pdf2text(url, startP, endP)
    except Exception as exc:
        # Report file access / pdf parsing failures to the client, rather than
        # letting the exception escape the request handler. The details go into
        # the error body (explain), keeping the status line plain.
        print(f"WARN:HandlePdf2Text:Failed:{url}:{exc}")
        ph.send_error(502, "WARN:Pdf2TextFailed", str(exc))
        return
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type, 'text/plain'
    # looks like what is intended; left as is, in case the client depends on it
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
||||
|
|
@ -27,6 +27,7 @@ import html.parser
|
|||
import time
|
||||
import urlvalidator as uv
|
||||
from typing import Callable
|
||||
import pdfmagic as mPdf
|
||||
|
||||
|
||||
gMe = {
|
||||
|
|
@ -136,7 +137,7 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
|||
case '/urltext':
|
||||
self.auth_and_run(pr, handle_urltext)
|
||||
case '/pdf2text':
|
||||
self.auth_and_run(pr, handle_pdf2text)
|
||||
self.auth_and_run(pr, mPdf.handle_pdf2text)
|
||||
case '/aum':
|
||||
handle_aum(self, pr)
|
||||
case _:
|
||||
|
|
@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
|||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
||||
|
||||
|
||||
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract the text of a range of pages from the pdf file referenced by url.

    The url is first checked using uv.validate_url; if that check fails, its
    status code and message are handed back without touching the file.

    Only the path component of the url is used, and it is opened as a file on
    the local filesystem.
    NOTE(review): the scheme/host of the url are ignored here; presumably
    uv.validate_url restricts what is allowed - confirm.

    startPN and endPN are 0-based indexes into the pdf's pages, both inclusive.
    A negative startPN means the first page; a negative endPN, or one beyond
    the last page, means the last page. If startPN ends up greater than endPN
    the extracted text is empty.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted text)
    when status is 200.

    Errors raised while opening/reading the file or by pypdf are not caught
    here, they propagate to the caller.
    """
    # Imported lazily, so that pypdf is needed only when this path is used
    import pypdf
    import io

    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Context manager ensures the file handle is released, even if read fails
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    lastPN = len(oPdf.pages) - 1
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN > lastPN):
        endPN = lastPN
    # Join once at the end, instead of growing a string page by page
    tPdf = "".join(oPdf.pages[i].extract_text() for i in range(startPN, endPN+1))
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
||||
|
||||
|
||||
def handle_pdf2text(ph: "ProxyHandler", pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    The query part of pr is expected to carry
    * url: the pdf to process (required)
    * startPageNumber, endPageNumber: the page range to extract.
      If either is missing or empty, -1 is handed to process_pdf2text for it.

    Responses sent through ph
    * 400 if url is missing/empty or a page number is not an integer
    * 502 if the extraction itself raises an exception
    * the status and msg returned by process_pdf2text, if it is not 200
    * else 200, with the extracted text utf-8 encoded as the body
    """
    # keep_blank_values, so that an explicitly empty value ("startPageNumber=")
    # reaches the fallbacks below, instead of being dropped by parse_qs
    queryParams = urllib.parse.parse_qs(pr.query, keep_blank_values=True)
    url = queryParams.get('url', [''])[0]
    if not url:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    pageNums = []
    for key in ('startPageNumber', 'endPageNumber'):
        val = queryParams.get(key, [''])[0]
        try:
            pageNums.append(int(val) if val else -1)
        except ValueError:
            # The offending value is not echoed, as the message ends up in the
            # http status line
            ph.send_error(400, f"WARN:HandlePdf2Text:Invalid:{key}")
            return
    startP, endP = pageNums
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    try:
        gotP2T = process_pdf2text(url, startP, endP)
    except Exception as exc:
        # Report file access / pdf parsing failures to the client, rather than
        # letting the exception escape the request handler. The details go into
        # the error body (explain), keeping the status line plain.
        print(f"WARN:HandlePdf2Text:Failed:{url}:{exc}")
        ph.send_error(502, "WARN:Pdf2TextFailed", str(exc))
        return
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type, 'text/plain'
    # looks like what is intended; left as is, in case the client depends on it
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
||||
|
||||
|
||||
|
||||
def load_config():
|
||||
"""
|
||||
|
|
|
|||
Loading…
Reference in New Issue