SimpleChatTC:Pdf2Text: Make it work with a subset of pages

Initial attempt; the code flow still needs to be reviewed and tested.
This commit is contained in:
hanishkvc 2025-11-02 03:38:43 +05:30
parent 8bc7de4416
commit dd0a7ec500
2 changed files with 28 additions and 5 deletions

View File

@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlTextFailed:{exc}") ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
def process_pdf2text(url: str): def process_pdf2text(url: str, startPN: int, endPN: int):
import pypdf import pypdf
import io import io
urlParts = url.split('://',1) urlParts = url.split('://',1)
@ -388,7 +388,12 @@ def process_pdf2text(url: str):
dPdf = fPdf.read() dPdf = fPdf.read()
tPdf = "" tPdf = ""
oPdf = pypdf.PdfReader(io.BytesIO(dPdf)) oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
for (pn, pd) in enumerate(oPdf.pages): if (startPN < 0):
startPN = 0
if (endPN < 0) or (endPN >= len(oPdf.pages)):
endPN = len(oPdf.pages)-1
for i in range(startPN, endPN+1):
pd = oPdf.pages[i]
tPdf = tPdf + pd.extract_text() tPdf = tPdf + pd.extract_text()
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf } return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
if (not url) or (len(url) == 0): if (not url) or (len(url) == 0):
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!") ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
return return
startP = queryParams['startPageNumber'][0]
if startP:
startP = int(startP)
else:
startP = -1
endP = queryParams['endPageNumber'][0]
if endP:
endP = int(endP)
else:
endP = -1
print(f"INFO:HandlePdf2Text:Processing:{url}...") print(f"INFO:HandlePdf2Text:Processing:{url}...")
gotP2T = process_pdf2text(url) gotP2T = process_pdf2text(url, startP, endP)
if (gotP2T['status'] != 200): if (gotP2T['status'] != 200):
ph.send_error(gotP2T['status'], gotP2T['msg'] ) ph.send_error(gotP2T['status'], gotP2T['msg'] )
return return

View File

@ -284,14 +284,22 @@ let pdf2text_meta = {
"type": "function", "type": "function",
"function": { "function": {
"name": "pdf2text", "name": "pdf2text",
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds", "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {
"url":{ "url":{
"type":"string", "type":"string",
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent" "description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
} },
"startPageNumber":{
"type":"integer",
"description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
},
"endPageNumber":{
"type":"integer",
"description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
},
}, },
"required": ["url"] "required": ["url"]
} }