# Helper to manage pdf related requests # by Humans for All import urllib.parse import urlvalidator as uv import filemagic as mFile from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from simpleproxy import ProxyHandler def extract_pdfoutline(ol: Any, prefix: str): """ Extract the pdf outline recursively. 1st tuple entry returned indicates whether to increase outline entry numbering 2nd tuple entry returns the outline string that provides the extracted outline. """ if type(ol).__name__ != type([]).__name__: return (1, f"{prefix}:{ol['/Title']}\n") olText = "" olNum = 1 for (i,iol) in enumerate(ol): got = extract_pdfoutline(iol, f"{prefix}.{olNum}") olNum += got[0] olText += got[1] return (0, olText) def process_pdftext(url: str, startPN: int, endPN: int): """ Extract textual content from given pdf. * Validate the got url. * Get the pdf file. * Extract textual contents of the pdf from given start page number to end page number (inclusive). * if -1 | 0 is specified wrt startPN, the actual starting page number (rather 1) will be used. * if -1 | 0 is specified wrt endPN, the actual ending page number will be used. NOTE: Page numbers start from 1, while the underlying list data structure index starts from 0 """ import pypdf import io gotVU = uv.validate_url(url, "HandlePdfText") if not gotVU.callOk: return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg } gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {}) if not gotFile.callOk: return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData} tPdf = "" oPdf = pypdf.PdfReader(io.BytesIO(gotFile.contentData)) if (startPN <= 0): startPN = 1 if (endPN <= 0) or (endPN > len(oPdf.pages)): endPN = len(oPdf.pages) outlineGot = extract_pdfoutline(oPdf.outline, "") tPdf += outlineGot[1] for i in range(startPN, endPN+1): pd = oPdf.pages[i-1] tPdf = tPdf + pd.extract_text() return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf } def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): """ Handle requests to pdftext path, which is used to extract plain text from the specified pdf file. """ queryParams = urllib.parse.parse_qs(pr.query) url = queryParams['url'][0] startP = queryParams.get('startPageNumber', -1) if isinstance(startP, list): startP = int(startP[0]) endP = queryParams.get('endPageNumber', -1) if isinstance(endP, list): endP = int(endP[0]) print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...") gotP2T = process_pdftext(url, startP, endP) if (gotP2T['status'] != 200): ph.send_error(gotP2T['status'], gotP2T['msg'] ) return ph.send_response(gotP2T['status'], gotP2T['msg']) ph.send_header('Content-Type', 'text/text') # Add CORS for browser fetch, just in case ph.send_header('Access-Control-Allow-Origin', '*') ph.end_headers() print(f"INFO:HandlePdfText:ExtractedText:{url}...") ph.wfile.write(gotP2T['data'].encode('utf-8'))