SimpleChatTC:Pdf2Text: Make it work with a subset of pages

Initial implementation; the code flow still needs to be reviewed and tested.
2025-11-02 03:38:43 +05:30 · 2025-11-02 03:38:43 +05:30 · dd0a7ec500
parent 8bc7de4416
commit dd0a7ec500
2 changed files with 28 additions and 5 deletions
--- a/tools/server/public_simplechat/local.tools/simpleproxy.py
+++ b/tools/server/public_simplechat/local.tools/simpleproxy.py
@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
        ph.send_error(502, f"WARN:UrlTextFailed:{exc}")


-def process_pdf2text(url: str):
+def process_pdf2text(url: str, startPN: int, endPN: int):
    import pypdf
    import io
    urlParts = url.split('://',1)
@ -388,7 +388,12 @@ def process_pdf2text(url: str):
    dPdf = fPdf.read()
    tPdf = ""
    oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
-    for (pn, pd) in enumerate(oPdf.pages):
+    if (startPN < 0):
+        startPN = 0
+    if (endPN < 0) or (endPN >= len(oPdf.pages)):
+        endPN = len(oPdf.pages)-1
+    for i in range(startPN, endPN+1):
+        pd = oPdf.pages[i]
        tPdf = tPdf + pd.extract_text()
    return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }

@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
    if (not url) or (len(url) == 0):
        ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
        return
+    startP = queryParams['startPageNumber'][0]
+    if startP:
+        startP = int(startP)
+    else:
+        startP = -1
+    endP = queryParams['endPageNumber'][0]
+    if endP:
+        endP = int(endP)
+    else:
+        endP = -1
    print(f"INFO:HandlePdf2Text:Processing:{url}...")
-    gotP2T = process_pdf2text(url)
+    gotP2T = process_pdf2text(url, startP, endP)
    if (gotP2T['status'] != 200):
        ph.send_error(gotP2T['status'], gotP2T['msg'] )
        return
--- a/tools/server/public_simplechat/toolweb.mjs
+++ b/tools/server/public_simplechat/toolweb.mjs
@ -284,14 +284,22 @@ let pdf2text_meta = {
        "type": "function",
        "function": {
            "name": "pdf2text",
-            "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds",
+            "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
            "parameters": {
                "type": "object",
                "properties": {
                    "url":{
                        "type":"string",
                        "description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
-                    }
+                    },
+                    "startPageNumber":{
+                        "type":"integer",
+                        "description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
+                    },
+                    "endPageNumber":{
+                        "type":"integer",
+                        "description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
+                    },
                },
                "required": ["url"]
            }