SimpleChatTC:Pdf2Text: Make it work with a subset of pages

Initial attempt; the code flow still needs to be reviewed and tested.
This commit is contained in:
hanishkvc 2025-11-02 03:38:43 +05:30
parent 8bc7de4416
commit dd0a7ec500
2 changed files with 28 additions and 5 deletions

View File

@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
def process_pdf2text(url: str):
def process_pdf2text(url: str, startPN: int, endPN: int):
import pypdf
import io
urlParts = url.split('://',1)
@ -388,7 +388,12 @@ def process_pdf2text(url: str):
dPdf = fPdf.read()
tPdf = ""
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
for (pn, pd) in enumerate(oPdf.pages):
if (startPN < 0):
startPN = 0
if (endPN < 0) or (endPN >= len(oPdf.pages)):
endPN = len(oPdf.pages)-1
for i in range(startPN, endPN+1):
pd = oPdf.pages[i]
tPdf = tPdf + pd.extract_text()
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
if (not url) or (len(url) == 0):
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
return
startP = queryParams['startPageNumber'][0]
if startP:
startP = int(startP)
else:
startP = -1
endP = queryParams['endPageNumber'][0]
if endP:
endP = int(endP)
else:
endP = -1
print(f"INFO:HandlePdf2Text:Processing:{url}...")
gotP2T = process_pdf2text(url)
gotP2T = process_pdf2text(url, startP, endP)
if (gotP2T['status'] != 200):
ph.send_error(gotP2T['status'], gotP2T['msg'] )
return

View File

@ -284,14 +284,22 @@ let pdf2text_meta = {
"type": "function",
"function": {
"name": "pdf2text",
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds",
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
"parameters": {
"type": "object",
"properties": {
"url":{
"type":"string",
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
}
},
"startPageNumber":{
"type":"integer",
"description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
},
"endPageNumber":{
"type":"integer",
"description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
},
},
"required": ["url"]
}