From 9484bea71a6583f18aa8deceb74aaf860df6d5ce Mon Sep 17 00:00:00 2001
From: hanishkvc <hanishkvc@gmail.com>
Date: Sat, 8 Nov 2025 04:32:30 +0530
Subject: [PATCH] SimpleChatTC:PdfText:Basic Outline and its Numbering done

Pass a list to keep track of the numbering at different depths,
as well as to delay incrementing the numbering until the last moment.

Don't let recursion go beyond a predefined limit.
---
 .../public_simplechat/local.tools/pdfmagic.py | 30 +++++++++++--------
 tools/server/public_simplechat/readme.md      |  4 +++
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/tools/server/public_simplechat/local.tools/pdfmagic.py b/tools/server/public_simplechat/local.tools/pdfmagic.py
index 26a1f3c11f..963d3d6ecf 100644
--- a/tools/server/public_simplechat/local.tools/pdfmagic.py
+++ b/tools/server/public_simplechat/local.tools/pdfmagic.py
@@ -10,21 +10,24 @@ if TYPE_CHECKING:
     from simpleproxy import ProxyHandler
 
 
-def extract_pdfoutline(ol: Any, prefix: str):
+PDFOUTLINE_MAXDEPTH=4
+
+
+def extract_pdfoutline(ol: Any, prefix: list[int]):
     """
-    Extract the pdf outline recursively.
-    1st tuple entry returned indicates whether to increase outline entry numbering
-    2nd tuple entry returns the outline string that provides the extracted outline.
+    Helps extract the pdf outline recursively, along with its numbering.
     """
+    if (len(prefix) > PDFOUTLINE_MAXDEPTH):
+        return ""
     if type(ol).__name__ != type([]).__name__:
-        return (1, f"{prefix}:{ol['/Title']}\n")
+        prefix[-1] += 1
+        return f"{".".join(map(str,prefix))}:{ol['/Title']}\n"
     olText = ""
-    olNum = 1
+    prefix.append(0)
     for (i,iol) in enumerate(ol):
-        got = extract_pdfoutline(iol, f"{prefix}.{olNum}")
-        olNum += got[0]
-        olText += got[1]
-    return (0, olText)
+        olText += extract_pdfoutline(iol, prefix)
+    prefix.pop()
+    return olText
 
 
 def process_pdftext(url: str, startPN: int, endPN: int):
@@ -53,8 +56,11 @@ def process_pdftext(url: str, startPN: int, endPN: int):
         startPN = 1
     if (endPN <= 0) or (endPN > len(oPdf.pages)):
         endPN = len(oPdf.pages)
-    outlineGot = extract_pdfoutline(oPdf.outline, "")
-    tPdf += outlineGot[1]
+    # Add the pdf outline, if available
+    outlineGot = extract_pdfoutline(oPdf.outline, [])
+    if outlineGot:
+        tPdf += f"\n\nOutline Start\n\n{outlineGot}\n\nOutline End\n\n"
+    # Add the pdf page contents
     for i in range(startPN, endPN+1):
         pd = oPdf.pages[i-1]
         tPdf = tPdf + pd.extract_text()
diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md
index 197b5329eb..0433170213 100644
--- a/tools/server/public_simplechat/readme.md
+++ b/tools/server/public_simplechat/readme.md
@@ -462,6 +462,7 @@ plain textual content from the search result page.
 
 * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
   * this depends on the pypdf python based open source library
+  * create an outline of titles along with numbering if the pdf contains an outline/toc
 
 * fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
   * allows one to specify a list of tags related REs,
@@ -676,6 +677,9 @@ sliding window based drop off or even before they kick in, this can help in many
 
 * capture the body of ai server not ok responses, to help debug as well as to show same to user.
 
+* extract and include the outline of titles (along with calculated numbering) in the text output of pdftext
+  * ensure that one doesn't recurse beyond a predefined limit.
+
 
 #### ToDo