SimpleChatTC:PdfText:Numbering T2 - Need different scheme

This increments the numbering before the entry itself, but we need to increment after it
This commit is contained in:
hanishkvc 2025-11-08 04:04:39 +05:30
parent bd60437cc6
commit 15e99843db
1 changed file with 11 additions and 5 deletions

View File

@ -12,14 +12,19 @@ if TYPE_CHECKING:
def extract_pdfoutline(ol: Any, prefix: str): def extract_pdfoutline(ol: Any, prefix: str):
""" """
Extract the pdf outline Extract the pdf outline recursively.
1st tuple entry returned indicates whether to increase outline entry numbering
2nd tuple entry returns the outline string that provides the extracted outline.
""" """
if type(ol).__name__ != type([]).__name__: if type(ol).__name__ != type([]).__name__:
return f"{prefix}:{ol['/Title']}\n" return (1, f"{prefix}:{ol['/Title']}\n")
olText = "" olText = ""
olNum = 1
for (i,iol) in enumerate(ol): for (i,iol) in enumerate(ol):
olText += extract_pdfoutline(iol, f"{prefix}.{i+1}") got = extract_pdfoutline(iol, f"{prefix}.{olNum}")
return olText olNum += got[0]
olText += got[1]
return (0, olText)
def process_pdftext(url: str, startPN: int, endPN: int): def process_pdftext(url: str, startPN: int, endPN: int):
@ -48,7 +53,8 @@ def process_pdftext(url: str, startPN: int, endPN: int):
startPN = 1 startPN = 1
if (endPN <= 0) or (endPN > len(oPdf.pages)): if (endPN <= 0) or (endPN > len(oPdf.pages)):
endPN = len(oPdf.pages) endPN = len(oPdf.pages)
tPdf += extract_pdfoutline(oPdf.outline, "") outlineGot = extract_pdfoutline(oPdf.outline, "")
tPdf += outlineGot[1]
for i in range(startPN, endPN+1): for i in range(startPN, endPN+1):
pd = oPdf.pages[i-1] pd = oPdf.pages[i-1]
tPdf = tPdf + pd.extract_text() tPdf = tPdf + pd.extract_text()