diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index 99a7004cd2..03b9a330eb 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -378,7 +378,7 @@ def handle_urltext(ph: ProxyHandler, pr: urllib.parse.ParseResult): ph.send_error(502, f"WARN:UrlTextFailed:{exc}") -def process_pdf2text(url: str): +def process_pdf2text(url: str, startPN: int, endPN: int): import pypdf import io urlParts = url.split('://',1) @@ -388,7 +388,12 @@ def process_pdf2text(url: str): dPdf = fPdf.read() tPdf = "" oPdf = pypdf.PdfReader(io.BytesIO(dPdf)) - for (pn, pd) in enumerate(oPdf.pages): + if (startPN < 0): + startPN = 0 + if (endPN < 0) or (endPN >= len(oPdf.pages)): + endPN = len(oPdf.pages)-1 + for i in range(startPN, endPN+1): + pd = oPdf.pages[i] tPdf = tPdf + pd.extract_text() return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf } @@ -407,8 +412,18 @@ def handle_pdf2text(ph: ProxyHandler, pr: urllib.parse.ParseResult): if (not url) or (len(url) == 0): ph.send_error(400, f"WARN:HandlePdf2Text:MissingUrl!") return + startP = queryParams['startPageNumber'][0] + if startP: + startP = int(startP) + else: + startP = -1 + endP = queryParams['endPageNumber'][0] + if endP: + endP = int(endP) + else: + endP = -1 print(f"INFO:HandlePdf2Text:Processing:{url}...") - gotP2T = process_pdf2text(url) + gotP2T = process_pdf2text(url, startP, endP) if (gotP2T['status'] != 200): ph.send_error(gotP2T['status'], gotP2T['msg'] ) return diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index a83ce2c38e..56ddd8ae67 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -284,14 +284,22 @@ let pdf2text_meta = { "type": "function", "function": { "name": "pdf2text", - "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds", + "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers", "parameters": { "type": "object", "properties": { "url":{ "type":"string", "description":"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text to an extent" - } + }, + "startPageNumber":{ + "type":"integer", + "description":"Specify the starting page number within the pdf, this is optional. If not specified set to first page." + }, + "endPageNumber":{ + "type":"integer", + "description":"Specify the ending page number within the pdf, this is optional. If not specified set to the last page." + }, }, "required": ["url"] }