SimpleChatTC:PdfText:Add initial skeleton for outline

This commit is contained in:
hanishkvc 2025-11-08 03:30:52 +05:30
parent 272e2689f7
commit 51707b5169
1 changed files with 14 additions and 1 deletions

View File

@ -4,12 +4,24 @@
import urllib.parse
import urlvalidator as uv
import filemagic as mFile
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from simpleproxy import ProxyHandler
def extract_pdfoutline(ol: Any, prefix: str) -> str:
    """
    Extract the pdf outline as indented plain text.

    ol: either a single outline entry (anything subscriptable with the
        '/Title' key) or a list of such entries and/or nested lists.
    prefix: the text placed in front of the title of a single entry.

    Returns one line per entry, "<prefix><title>\\n". Each level of list
    nesting appends one tab to the prefix, so entries of the list handed
    in by the initial caller are already indented by one tab.

    Raises whatever ol['/Title'] raises if a non-list entry has no such key.
    """
    # isinstance (rather than comparing type names) so that list subclasses
    # are walked as nested levels instead of being mistaken for an entry.
    if not isinstance(ol, list):
        return f"{prefix}{ol['/Title']}\n"
    childPrefix = prefix + "\t"
    return "".join(extract_pdfoutline(iol, childPrefix) for iol in ol)
def process_pdftext(url: str, startPN: int, endPN: int):
"""
Extract textual content from given pdf.
@ -36,6 +48,7 @@ def process_pdftext(url: str, startPN: int, endPN: int):
startPN = 1
if (endPN <= 0) or (endPN > len(oPdf.pages)):
endPN = len(oPdf.pages)
tPdf += extract_pdfoutline(oPdf.outline, "")
for i in range(startPN, endPN+1):
pd = oPdf.pages[i-1]
tPdf = tPdf + pd.extract_text()