SimpleChatTC:PdfText:Numbering T2 - Need different scheme
This increments before itself, but we need to increment after
This commit is contained in:
parent
bd60437cc6
commit
15e99843db
|
|
@ -12,14 +12,19 @@ if TYPE_CHECKING:
|
||||||
|
|
||||||
def extract_pdfoutline(ol: Any, prefix: str):
|
def extract_pdfoutline(ol: Any, prefix: str):
|
||||||
"""
|
"""
|
||||||
Extract the pdf outline
|
Extract the pdf outline recursively.
|
||||||
|
1st tuple entry returned indicates whether to increase outline entry numbering
|
||||||
|
2nd tuple entry returns the outline string that provides the extracted outline.
|
||||||
"""
|
"""
|
||||||
if type(ol).__name__ != type([]).__name__:
|
if type(ol).__name__ != type([]).__name__:
|
||||||
return f"{prefix}:{ol['/Title']}\n"
|
return (1, f"{prefix}:{ol['/Title']}\n")
|
||||||
olText = ""
|
olText = ""
|
||||||
|
olNum = 1
|
||||||
for (i,iol) in enumerate(ol):
|
for (i,iol) in enumerate(ol):
|
||||||
olText += extract_pdfoutline(iol, f"{prefix}.{i+1}")
|
got = extract_pdfoutline(iol, f"{prefix}.{olNum}")
|
||||||
return olText
|
olNum += got[0]
|
||||||
|
olText += got[1]
|
||||||
|
return (0, olText)
|
||||||
|
|
||||||
|
|
||||||
def process_pdftext(url: str, startPN: int, endPN: int):
|
def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
|
|
@ -48,7 +53,8 @@ def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
startPN = 1
|
startPN = 1
|
||||||
if (endPN <= 0) or (endPN > len(oPdf.pages)):
|
if (endPN <= 0) or (endPN > len(oPdf.pages)):
|
||||||
endPN = len(oPdf.pages)
|
endPN = len(oPdf.pages)
|
||||||
tPdf += extract_pdfoutline(oPdf.outline, "")
|
outlineGot = extract_pdfoutline(oPdf.outline, "")
|
||||||
|
tPdf += outlineGot[1]
|
||||||
for i in range(startPN, endPN+1):
|
for i in range(startPN, endPN+1):
|
||||||
pd = oPdf.pages[i-1]
|
pd = oPdf.pages[i-1]
|
||||||
tPdf = tPdf + pd.extract_text()
|
tPdf = tPdf + pd.extract_text()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue