SimpleChatTC:SimpleProxy:Move pdf logic into its own module
This commit is contained in:
parent
b18aed4449
commit
a7de002fd0
|
|
@ -0,0 +1,58 @@
|
||||||
|
# Helper to manage pdf related requests
|
||||||
|
# by Humans for All
|
||||||
|
|
||||||
|
import urllib.parse
|
||||||
|
import urlvalidator as uv
|
||||||
|
import simpleproxy as root
|
||||||
|
|
||||||
|
|
||||||
|
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract plain text from a range of pages of the pdf file named by url.

    The url is first checked through uv.validate_url; if that check fails,
    its status code and message are returned as is. Otherwise the path
    component of the url is opened as a local file and parsed with pypdf.

    startPN and endPN are used directly as inclusive indices into the
    reader's page list. A negative startPN is clamped to 0; a negative or
    too large endPN is clamped to the last page index.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted text)
    on success. Errors raised while opening or parsing the file are not
    caught here and propagate to the caller.
    """
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Read the whole file inside a with block, so that the file handle is
    # always released, including when parsing fails further below.
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # An empty range (start beyond end, or a pdf with no pages) yields "".
    pageTexts = [ oPdf.pages[i].extract_text() for i in range(startPN, endPN+1) ]
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': "".join(pageTexts) }
|
||||||
|
|
||||||
|
|
||||||
|
def handle_pdf2text(ph: root.ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    Query parameters read from pr.query:
    * url: the pdf to process (required; a 400 is sent if it is missing)
    * startPageNumber, endPageNumber: optional integers, passed on to
      process_pdf2text; -1 is passed when one is not given.

    On success the extracted text is written to the client utf-8 encoded.
    Any non 200 status reported by process_pdf2text is relayed through
    ph.send_error.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs drops parameters with blank values by default, so a missing
    # and an empty parameter both show up as an absent key; use get() with
    # a fallback instead of indexing, which would raise KeyError.
    urls = queryParams.get('url')
    if not urls:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    url = urls[0]
    try:
        startP = int(queryParams.get('startPageNumber', ['-1'])[0])
        endP = int(queryParams.get('endPageNumber', ['-1'])[0])
    except ValueError as exc:
        ph.send_error(400, f"WARN:HandlePdf2Text:InvalidPageNumber:{exc}")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # text/plain is the registered media type for plain text; the charset
    # matches the encoding used for the body below.
    ph.send_header('Content-Type', 'text/plain; charset=utf-8')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
||||||
|
|
@ -27,6 +27,7 @@ import html.parser
|
||||||
import time
|
import time
|
||||||
import urlvalidator as uv
|
import urlvalidator as uv
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
import pdfmagic as mPdf
|
||||||
|
|
||||||
|
|
||||||
gMe = {
|
gMe = {
|
||||||
|
|
@ -136,7 +137,7 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
||||||
case '/urltext':
|
case '/urltext':
|
||||||
self.auth_and_run(pr, handle_urltext)
|
self.auth_and_run(pr, handle_urltext)
|
||||||
case '/pdf2text':
|
case '/pdf2text':
|
||||||
self.auth_and_run(pr, handle_pdf2text)
|
self.auth_and_run(pr, mPdf.handle_pdf2text)
|
||||||
case '/aum':
|
case '/aum':
|
||||||
handle_aum(self, pr)
|
handle_aum(self, pr)
|
||||||
case _:
|
case _:
|
||||||
|
|
@ -358,58 +359,6 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
||||||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
def process_pdf2text(url: str, startPN: int, endPN: int):
    """
    Extract plain text from a range of pages of the pdf file named by url.

    The url is first checked through uv.validate_url; if that check fails,
    its status code and message are returned as is. Otherwise the path
    component of the url is opened as a local file and parsed with pypdf.

    startPN and endPN are used directly as inclusive indices into the
    reader's page list. A negative startPN is clamped to 0; a negative or
    too large endPN is clamped to the last page index.

    Returns a dict with 'status' and 'msg', plus 'data' (the extracted text)
    on success. Errors raised while opening or parsing the file are not
    caught here and propagate to the caller.
    """
    import pypdf
    import io
    gotVU = uv.validate_url(url, "HandlePdf2Text")
    if not gotVU.callOk:
        return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
    urlParts = urllib.parse.urlparse(url)
    # Read the whole file inside a with block, so that the file handle is
    # always released, including when parsing fails further below.
    with open(urlParts.path, 'rb') as fPdf:
        dPdf = fPdf.read()
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
    numPages = len(oPdf.pages)
    if (startPN < 0):
        startPN = 0
    if (endPN < 0) or (endPN >= numPages):
        endPN = numPages-1
    # An empty range (start beyond end, or a pdf with no pages) yields "".
    pageTexts = [ oPdf.pages[i].extract_text() for i in range(startPN, endPN+1) ]
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': "".join(pageTexts) }
|
|
||||||
|
|
||||||
|
|
||||||
def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    """
    Handle requests to pdf2text path, which is used to extract plain text
    from the specified pdf file.

    Query parameters read from pr.query:
    * url: the pdf to process (required; a 400 is sent if it is missing)
    * startPageNumber, endPageNumber: optional integers, passed on to
      process_pdf2text; -1 is passed when one is not given.

    On success the extracted text is written to the client utf-8 encoded.
    Any non 200 status reported by process_pdf2text is relayed through
    ph.send_error.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs drops parameters with blank values by default, so a missing
    # and an empty parameter both show up as an absent key; use get() with
    # a fallback instead of indexing, which would raise KeyError.
    urls = queryParams.get('url')
    if not urls:
        ph.send_error(400, "WARN:HandlePdf2Text:MissingUrl")
        return
    url = urls[0]
    try:
        startP = int(queryParams.get('startPageNumber', ['-1'])[0])
        endP = int(queryParams.get('endPageNumber', ['-1'])[0])
    except ValueError as exc:
        ph.send_error(400, f"WARN:HandlePdf2Text:InvalidPageNumber:{exc}")
        return
    print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # text/plain is the registered media type for plain text; the charset
    # matches the encoding used for the body below.
    ph.send_header('Content-Type', 'text/plain; charset=utf-8')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_config():
|
def load_config():
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue