SimpleChatTC:Pdf2Text: Make it work with a subset of pages
Initial attempt; the code flow still needs to be reviewed and tested.
This commit is contained in:
parent
8bc7de4416
commit
dd0a7ec500
|
|
@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
||||||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
def process_pdf2text(url: str):
|
def process_pdf2text(url: str, startPN: int, endPN: int):
|
||||||
import pypdf
|
import pypdf
|
||||||
import io
|
import io
|
||||||
urlParts = url.split('://',1)
|
urlParts = url.split('://',1)
|
||||||
|
|
@ -388,7 +388,12 @@ def process_pdf2text(url: str):
|
||||||
dPdf = fPdf.read()
|
dPdf = fPdf.read()
|
||||||
tPdf = ""
|
tPdf = ""
|
||||||
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
|
oPdf = pypdf.PdfReader(io.BytesIO(dPdf))
|
||||||
for (pn, pd) in enumerate(oPdf.pages):
|
if (startPN < 0):
|
||||||
|
startPN = 0
|
||||||
|
if (endPN < 0) or (endPN >= len(oPdf.pages)):
|
||||||
|
endPN = len(oPdf.pages)-1
|
||||||
|
for i in range(startPN, endPN+1):
|
||||||
|
pd = oPdf.pages[i]
|
||||||
tPdf = tPdf + pd.extract_text()
|
tPdf = tPdf + pd.extract_text()
|
||||||
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
||||||
|
|
||||||
|
|
@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult):
|
||||||
if (not url) or (len(url) == 0):
|
if (not url) or (len(url) == 0):
|
||||||
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
|
ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!")
|
||||||
return
|
return
|
||||||
|
startP = queryParams['startPageNumber'][0]
|
||||||
|
if startP:
|
||||||
|
startP = int(startP)
|
||||||
|
else:
|
||||||
|
startP = -1
|
||||||
|
endP = queryParams['endPageNumber'][0]
|
||||||
|
if endP:
|
||||||
|
endP = int(endP)
|
||||||
|
else:
|
||||||
|
endP = -1
|
||||||
print(f"INFO:HandlePdf2Text:Processing:{url}...")
|
print(f"INFO:HandlePdf2Text:Processing:{url}...")
|
||||||
gotP2T = process_pdf2text(url)
|
gotP2T = process_pdf2text(url, startP, endP)
|
||||||
if (gotP2T['status'] != 200):
|
if (gotP2T['status'] != 200):
|
||||||
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -284,14 +284,22 @@ let pdf2text_meta = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "pdf2text",
|
"name": "pdf2text",
|
||||||
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds",
|
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url":{
|
"url":{
|
||||||
"type":"string",
|
"type":"string",
|
||||||
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
|
"description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent"
|
||||||
}
|
},
|
||||||
|
"startPageNumber":{
|
||||||
|
"type":"integer",
|
||||||
|
"description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
|
||||||
|
},
|
||||||
|
"endPageNumber":{
|
||||||
|
"type":"integer",
|
||||||
|
"description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
|
||||||
|
},
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue