# llama.cpp/tools/server/public_simplechat/local.tools/pdfmagic.py

# Helper to manage pdf related requests
# by Humans for All

import urllib.parse
import urlvalidator as uv
import filemagic as mFile
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from simpleproxy import ProxyHandler


def extract_pdfoutline(ol: Any, prefix: str):
    """
    Extract the pdf outline recursively.

    ol is either a single outline entry (anything that can be indexed with '/Title')
    or a list of such entries and nested lists. pypdf places a nested list right
    after the entry whose children it holds, so a nested list is numbered under the
    entry that precedes it (0 is used if no entry precedes it at that level).
    prefix is the numbering string built so far for the enclosing level.

    1st tuple entry returned indicates whether to increase outline entry numbering
    (1 for a single entry, 0 for a list of entries).
    2nd tuple entry returns the outline string that provides the extracted outline,
    one "<prefix>:<title>" line per entry.
    """
    if not isinstance(ol, list):
        return (1, f"{prefix}:{ol['/Title']}\n")
    olParts = []
    olNum = 0
    for iol in ol:
        # Only real entries consume a new number; a child list reuses the
        # number of the entry just before it, so the children of entry 2
        # come out as 2.1, 2.2, ... rather than under the next sibling.
        if not isinstance(iol, list):
            olNum += 1
        olParts.append(extract_pdfoutline(iol, f"{prefix}.{olNum}")[1])
    return (0, "".join(olParts))


def process_pdftext(url: str, startPN: int, endPN: int):
    """
    Fetch the pdf at the given url and return its outline followed by its page text.

    * The url is first passed through the url validator; on failure its status
      code and message are returned as is.
    * The file is then fetched as application/pdf; on failure the fetcher's status
      code, message and content data are returned as is.
    * Text is extracted from page startPN to page endPN (both inclusive).
        * a startPN of 0 or less means start from the first page.
        * an endPN of 0 or less, or beyond the last page, means stop at the last page.

    Returns a dict with 'status' and 'msg', plus 'data' holding the extracted
    text when status is 200.

    NOTE: Page numbers are 1 based, while the pages list is indexed from 0.
    """
    import pypdf
    import io
    urlCheck = uv.validate_url(url, "HandlePdfText")
    if not urlCheck.callOk:
        return { 'status': urlCheck.statusCode, 'msg': urlCheck.statusMsg }
    fetched = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
    if not fetched.callOk:
        return { 'status': fetched.statusCode, 'msg': fetched.statusMsg, 'data': fetched.contentData}
    reader = pypdf.PdfReader(io.BytesIO(fetched.contentData))
    pageCount = len(reader.pages)
    # Clamp the requested 1 based page range to what the document has
    firstPN = startPN if startPN > 0 else 1
    lastPN = endPN if 0 < endPN <= pageCount else pageCount
    # The outline text goes first, followed by the text of each selected page
    textParts = [extract_pdfoutline(reader.outline, "")[1]]
    for pageIndex in range(firstPN - 1, lastPN):
        textParts.append(reader.pages[pageIndex].extract_text())
    return { 'status': 200, 'msg': "PdfText Response follows", 'data': "".join(textParts) }


def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
    """
    Handle requests to pdftext path, which is used to extract plain text
    from the specified pdf file.

    Query parameters read from pr.query
    * url: the pdf to fetch (required).
    * startPageNumber, endPageNumber: optional integers, defaulting to -1,
      which process_pdftext treats as the first / last page respectively.

    A missing url or a non integer page number is answered with a 400 error.
    Any non 200 status from process_pdftext is passed on through send_error.
    On success the extracted text is written out utf-8 encoded.
    """
    queryParams = urllib.parse.parse_qs(pr.query)
    # parse_qs maps each name to a list of values and drops blank ones,
    # so a missing or empty url shows up as an absent key.
    gotUrls = queryParams.get('url')
    if not gotUrls:
        ph.send_error(400, "HandlePdfText: missing url query parameter")
        return
    url = gotUrls[0]
    try:
        startP = int(queryParams.get('startPageNumber', ['-1'])[0])
        endP = int(queryParams.get('endPageNumber', ['-1'])[0])
    except ValueError:
        ph.send_error(400, "HandlePdfText: startPageNumber and endPageNumber need to be integers")
        return
    print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
    gotP2T = process_pdftext(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
    ph.send_response(gotP2T['status'], gotP2T['msg'])
    # NOTE(review): 'text/text' is not a registered media type ('text/plain' is);
    # left as is in case the client side depends on it - confirm before changing.
    ph.send_header('Content-Type', 'text/text')
    # Add CORS for browser fetch, just in case
    ph.send_header('Access-Control-Allow-Origin', '*')
    ph.end_headers()
    print(f"INFO:HandlePdfText:ExtractedText:{url}...")
    ph.wfile.write(gotP2T['data'].encode('utf-8'))