SimpleChatTC:PdfText:Basic Outline and its Numbering done
Pass a list to keep track of the numbering at different depths, as well as to delay incrementing the numbering until the last minute. Don't let recursion go beyond a predefined limit.
This commit is contained in:
parent
15e99843db
commit
9484bea71a
|
|
@ -10,21 +10,24 @@ if TYPE_CHECKING:
|
|||
from simpleproxy import ProxyHandler
|
||||
|
||||
|
||||
def extract_pdfoutline(ol: Any, prefix: str):
|
||||
PDFOUTLINE_MAXDEPTH=4
|
||||
|
||||
|
||||
def extract_pdfoutline(ol: Any, prefix: list[int]):
|
||||
"""
|
||||
Extract the pdf outline recursively.
|
||||
1st tuple entry returned indicates whether to increase outline entry numbering
|
||||
2nd tuple entry returns the outline string that provides the extracted outline.
|
||||
Helps extract the pdf outline recursively, along with its numbering.
|
||||
"""
|
||||
if (len(prefix) > PDFOUTLINE_MAXDEPTH):
|
||||
return ""
|
||||
if type(ol).__name__ != type([]).__name__:
|
||||
return (1, f"{prefix}:{ol['/Title']}\n")
|
||||
prefix[-1] += 1
|
||||
return f"{".".join(map(str,prefix))}:{ol['/Title']}\n"
|
||||
olText = ""
|
||||
olNum = 1
|
||||
prefix.append(0)
|
||||
for (i,iol) in enumerate(ol):
|
||||
got = extract_pdfoutline(iol, f"{prefix}.{olNum}")
|
||||
olNum += got[0]
|
||||
olText += got[1]
|
||||
return (0, olText)
|
||||
olText += extract_pdfoutline(iol, prefix)
|
||||
prefix.pop()
|
||||
return olText
|
||||
|
||||
|
||||
def process_pdftext(url: str, startPN: int, endPN: int):
|
||||
|
|
@ -53,8 +56,11 @@ def process_pdftext(url: str, startPN: int, endPN: int):
|
|||
startPN = 1
|
||||
if (endPN <= 0) or (endPN > len(oPdf.pages)):
|
||||
endPN = len(oPdf.pages)
|
||||
outlineGot = extract_pdfoutline(oPdf.outline, "")
|
||||
tPdf += outlineGot[1]
|
||||
# Add the pdf outline, if available
|
||||
outlineGot = extract_pdfoutline(oPdf.outline, [])
|
||||
if outlineGot:
|
||||
tPdf += f"\n\nOutline Start\n\n{outlineGot}\n\nOutline End\n\n"
|
||||
# Add the pdf page contents
|
||||
for i in range(startPN, endPN+1):
|
||||
pd = oPdf.pages[i-1]
|
||||
tPdf = tPdf + pd.extract_text()
|
||||
|
|
|
|||
|
|
@ -462,6 +462,7 @@ plain textual content from the search result page.
|
|||
|
||||
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
||||
* this depends on the pypdf python based open source library
|
||||
* create an outline of titles along with numbering if the pdf contains an outline/toc
|
||||
|
||||
* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
|
||||
* allows one to specify a list of tags related REs,
|
||||
|
|
@ -676,6 +677,9 @@ sliding window based drop off or even before they kick in, this can help in many
|
|||
|
||||
* capture the body of not-ok responses from the ai server, to help debug as well as to show the same to the user.
|
||||
|
||||
* extract and include the outline of titles (along with calculated numbering) in the text output of pdftext
|
||||
* ensure that one doesn't recurse beyond a predefined limit.
|
||||
|
||||
|
||||
#### ToDo
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue