SimpleChatTC:PdfText:Add initial skeleton for outline

2025-11-08 03:30:52 +05:30 · 2025-11-08 03:30:52 +05:30 · 51707b5169
parent 272e2689f7
commit 51707b5169
1 changed files with 14 additions and 1 deletions
--- a/tools/server/public_simplechat/local.tools/pdfmagic.py
+++ b/tools/server/public_simplechat/local.tools/pdfmagic.py
@ -4,12 +4,24 @@
 import urllib.parse
 import urlvalidator as uv
 import filemagic as mFile
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

 if TYPE_CHECKING:
    from simpleproxy import ProxyHandler


+def extract_pdfoutline(ol: Any, prefix: str) -> str:
+    """
+    Flatten a pdf outline into text, one "<prefix><title>" line per entry.
+    Every list level (the outermost included) adds one tab to prefix.
+    """
+    # isinstance (not a type-name compare) so list subclasses still recurse
+    if not isinstance(ol, list):
+        return f"{prefix}{ol['/Title']}\n"
+    childPrefix = prefix + "\t"
+    return "".join(extract_pdfoutline(iol, childPrefix) for iol in ol)
+
+
 def process_pdftext(url: str, startPN: int, endPN: int):
    """
    Extract textual content from given pdf.
@ -36,6 +48,7 @@ def process_pdftext(url: str, startPN: int, endPN: int):
        startPN = 1
    if (endPN <= 0) or (endPN > len(oPdf.pages)):
        endPN = len(oPdf.pages)
+    tPdf += extract_pdfoutline(oPdf.outline, "")
    for i in range(startPN, endPN+1):
        pd = oPdf.pages[i-1]
        tPdf = tPdf + pd.extract_text()