SimpleChatTC:PdfText:Basic Outline and its Numbering done
Pass a list to keep track of the numbering at different depths, as well as to delay incrementing the numbering until the last minute. Don't let recursion go beyond a predefined limit.
This commit is contained in:
parent
15e99843db
commit
9484bea71a
|
|
@ -10,21 +10,24 @@ if TYPE_CHECKING:
|
||||||
from simpleproxy import ProxyHandler
|
from simpleproxy import ProxyHandler
|
||||||
|
|
||||||
|
|
||||||
def extract_pdfoutline(ol: Any, prefix: str):
|
PDFOUTLINE_MAXDEPTH=4
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdfoutline(ol: Any, prefix: list[int]):
|
||||||
"""
|
"""
|
||||||
Extract the pdf outline recursively.
|
Helps extract the pdf outline recursively, along with its numbering.
|
||||||
1st tuple entry returned indicates whether to increase outline entry numbering
|
|
||||||
2nd tuple entry returns the outline string that provides the extracted outline.
|
|
||||||
"""
|
"""
|
||||||
|
if (len(prefix) > PDFOUTLINE_MAXDEPTH):
|
||||||
|
return ""
|
||||||
if type(ol).__name__ != type([]).__name__:
|
if type(ol).__name__ != type([]).__name__:
|
||||||
return (1, f"{prefix}:{ol['/Title']}\n")
|
prefix[-1] += 1
|
||||||
|
return f"{".".join(map(str,prefix))}:{ol['/Title']}\n"
|
||||||
olText = ""
|
olText = ""
|
||||||
olNum = 1
|
prefix.append(0)
|
||||||
for (i,iol) in enumerate(ol):
|
for (i,iol) in enumerate(ol):
|
||||||
got = extract_pdfoutline(iol, f"{prefix}.{olNum}")
|
olText += extract_pdfoutline(iol, prefix)
|
||||||
olNum += got[0]
|
prefix.pop()
|
||||||
olText += got[1]
|
return olText
|
||||||
return (0, olText)
|
|
||||||
|
|
||||||
|
|
||||||
def process_pdftext(url: str, startPN: int, endPN: int):
|
def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
|
|
@ -53,8 +56,11 @@ def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
startPN = 1
|
startPN = 1
|
||||||
if (endPN <= 0) or (endPN > len(oPdf.pages)):
|
if (endPN <= 0) or (endPN > len(oPdf.pages)):
|
||||||
endPN = len(oPdf.pages)
|
endPN = len(oPdf.pages)
|
||||||
outlineGot = extract_pdfoutline(oPdf.outline, "")
|
# Add the pdf outline, if available
|
||||||
tPdf += outlineGot[1]
|
outlineGot = extract_pdfoutline(oPdf.outline, [])
|
||||||
|
if outlineGot:
|
||||||
|
tPdf += f"\n\nOutline Start\n\n{outlineGot}\n\nOutline End\n\n"
|
||||||
|
# Add the pdf page contents
|
||||||
for i in range(startPN, endPN+1):
|
for i in range(startPN, endPN+1):
|
||||||
pd = oPdf.pages[i-1]
|
pd = oPdf.pages[i-1]
|
||||||
tPdf = tPdf + pd.extract_text()
|
tPdf = tPdf + pd.extract_text()
|
||||||
|
|
|
||||||
|
|
@ -462,6 +462,7 @@ plain textual content from the search result page.
|
||||||
|
|
||||||
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
* fetch_pdf_as_text - fetch/read specified pdf file and extract its textual content
|
||||||
* this depends on the pypdf python based open source library
|
* this depends on the pypdf python based open source library
|
||||||
|
* create an outline of titles along with numbering if the pdf contains an outline/toc
|
||||||
|
|
||||||
* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
|
* fetch_xml_filtered - fetch/read specified xml file and optionally filter out any specified tags
|
||||||
* allows one to specify a list of tags related REs,
|
* allows one to specify a list of tags related REs,
|
||||||
|
|
@ -676,6 +677,9 @@ sliding window based drop off or even before they kick in, this can help in many
|
||||||
|
|
||||||
* capture the body of ai server not ok responses, to help debug as well as to show same to user.
|
* capture the body of ai server not ok responses, to help debug as well as to show same to user.
|
||||||
|
|
||||||
|
* extract and include the outline of titles (along with calculated numbering) in the text output of pdftext
|
||||||
|
* ensure that one doesn't recurse beyond a predefined limit.
|
||||||
|
|
||||||
|
|
||||||
#### ToDo
|
#### ToDo
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue