From 9484bea71a6583f18aa8deceb74aaf860df6d5ce Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Sat, 8 Nov 2025 04:32:30 +0530 Subject: [PATCH] SimpleChatTC:PdfText:Basic Outline and its Numbering done Pass a list to keep track of the numbering at different depths as well as to delay incrementing the numbering to the last minute. Don't let recursion go beyond a predefined limit --- .../public_simplechat/local.tools/pdfmagic.py | 30 +++++++++++-------- tools/server/public_simplechat/readme.md | 4 +++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/pdfmagic.py b/tools/server/public_simplechat/local.tools/pdfmagic.py index 26a1f3c11f..963d3d6ecf 100644 --- a/tools/server/public_simplechat/local.tools/pdfmagic.py +++ b/tools/server/public_simplechat/local.tools/pdfmagic.py @@ -10,21 +10,24 @@ if TYPE_CHECKING: from simpleproxy import ProxyHandler -def extract_pdfoutline(ol: Any, prefix: str): +PDFOUTLINE_MAXDEPTH=4 + + +def extract_pdfoutline(ol: Any, prefix: list[int]): """ - Extract the pdf outline recursively. - 1st tuple entry returned indicates whether to increase outline entry numbering - 2nd tuple entry returns the outline string that provides the extracted outline. + Helps extract the pdf outline recursively, along with its numbering. 
""" + if (len(prefix) > PDFOUTLINE_MAXDEPTH): + return "" if type(ol).__name__ != type([]).__name__: - return (1, f"{prefix}:{ol['/Title']}\n") + prefix[-1] += 1 + return f"{".".join(map(str,prefix))}:{ol['/Title']}\n" olText = "" - olNum = 1 + prefix.append(0) for (i,iol) in enumerate(ol): - got = extract_pdfoutline(iol, f"{prefix}.{olNum}") - olNum += got[0] - olText += got[1] - return (0, olText) + olText += extract_pdfoutline(iol, prefix) + prefix.pop() + return olText def process_pdftext(url: str, startPN: int, endPN: int): @@ -53,8 +56,11 @@ def process_pdftext(url: str, startPN: int, endPN: int): startPN = 1 if (endPN <= 0) or (endPN > len(oPdf.pages)): endPN = len(oPdf.pages) - outlineGot = extract_pdfoutline(oPdf.outline, "") - tPdf += outlineGot[1] + # Add the pdf outline, if available + outlineGot = extract_pdfoutline(oPdf.outline, []) + if outlineGot: + tPdf += f"\n\nOutline Start\n\n{outlineGot}\n\nOutline End\n\n" + # Add the pdf page contents for i in range(startPN, endPN+1): pd = oPdf.pages[i-1] tPdf = tPdf + pd.extract_text() diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 197b5329eb..0433170213 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -462,6 +462,7 @@ plain textual content from the search result page. * fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content * this depends on the pypdf python based open source library + * create a outline of titles along with numbering if the pdf contains a outline/toc * fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags * allows one to specify a list of tags related REs, @@ -676,6 +677,9 @@ sliding window based drop off or even before they kick in, this can help in many * capture the body of ai server not ok responses, to help debug as well as to show same to user. 
+* extract and include the outline of titles (along with calculated numbering) in the text output of pdftext * ensure that one doesn't recurse beyond a predefined limit. + #### ToDo