SimpleChatTC:Pdf2Text: Make it work with a subset of pages
Initial attempt; the code flow still needs to be reviewed and tested.
This commit is contained in:
parent
8bc7de4416
commit
dd0a7ec500
|
|
@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
|||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
||||
|
||||
|
||||
def process_pdf2text(url: str):
|
||||
def process_pdf2text(url: str, startPN: int, endPN: int):
|
||||
import pypdf
|
||||
import io
|
||||
urlParts = url.split('://',1)
|
||||
|
|
@ -388,7 +388,12 @@ def process_pdf2text(url: str):
|
|||
dPdf = fPdf.read()
|
||||
tPdf = ""
|
||||
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
|
||||
for (pn, pd) in enumerate(oPdf.pages):
|
||||
if (startPN < 0):
|
||||
startPN = 0
|
||||
if (endPN < 0) or (endPN >= len(oPdf.pages)):
|
||||
endPN = len(oPdf.pages)-1
|
||||
for i in range(startPN, endPN+1):
|
||||
pd = oPdf.pages[i]
|
||||
tPdf = tPdf + pd.extract_text()
|
||||
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
||||
|
||||
|
|
@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
|||
if (not url) or (len(url) == 0):
|
||||
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
|
||||
return
|
||||
startP = queryParams['startPageNumber'][0]
|
||||
if startP:
|
||||
startP = int(startP)
|
||||
else:
|
||||
startP = -1
|
||||
endP = queryParams['endPageNumber'][0]
|
||||
if endP:
|
||||
endP = int(endP)
|
||||
else:
|
||||
endP = -1
|
||||
print(f"INFO:HandlePdf2Text:Processing:{url}...")
|
||||
gotP2T = process_pdf2text(url)
|
||||
gotP2T = process_pdf2text(url, startP, endP)
|
||||
if (gotP2T['status'] != 200):
|
||||
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
||||
return
|
||||
|
|
|
|||
|
|
@ -284,14 +284,22 @@ let pdf2text_meta = {
|
|||
"type": "function",
|
||||
"function": {
|
||||
"name": "pdf2text",
|
||||
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds",
|
||||
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url":{
|
||||
"type":"string",
|
||||
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
|
||||
}
|
||||
},
|
||||
"startPageNumber":{
|
||||
"type":"integer",
|
||||
"description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
|
||||
},
|
||||
"endPageNumber":{
|
||||
"type":"integer",
|
||||
"description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
|
||||
},
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue