From 1d1894ad144bfe46701f52ae3ce474f2f4fb30bb Mon Sep 17 00:00:00 2001 From: hanishkvc Date: Mon, 3 Nov 2025 12:12:41 +0530 Subject: [PATCH] SimpleChatTC:PdfText:Cleanup rename to follow a common convention Rename path and tags/identifiers from Pdf2Text to PdfText. Rename the function call to pdf_to_text; this should also help indicate its semantics more unambiguously, just in case, especially for smaller models. --- .../public_simplechat/local.tools/pdfmagic.py | 19 ++++++++------- .../local.tools/simpleproxy.py | 6 ++--- tools/server/public_simplechat/readme.md | 6 ++--- tools/server/public_simplechat/toolweb.mjs | 24 +++++++++---------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/tools/server/public_simplechat/local.tools/pdfmagic.py b/tools/server/public_simplechat/local.tools/pdfmagic.py index 336a61250a..971ba2c796 100644 --- a/tools/server/public_simplechat/local.tools/pdfmagic.py +++ b/tools/server/public_simplechat/local.tools/pdfmagic.py @@ -10,11 +10,12 @@ if TYPE_CHECKING: from simpleproxy import ProxyHandler -def process_pdf2text(url: str, startPN: int, endPN: int): +def process_pdftext(url: str, startPN: int, endPN: int): """ Extract textual content from given pdf. * Validate the got url. + * Get the pdf file. * Extract textual contents of the pdf from given start page number to end page number (inclusive). * if -1 | 0 is specified wrt startPN, the actual starting page number (rather 1) will be used. * if -1 | 0 is specified wrt endPN, the actual ending page number will be used. 
@@ -23,10 +24,10 @@ def process_pdf2text(url: str, startPN: int, endPN: int): """ import pypdf import io - gotVU = uv.validate_url(url, "HandlePdf2Text") + gotVU = uv.validate_url(url, "HandlePdfText") if not gotVU.callOk: return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg } - gotFile = mFile.get_file(url, "ProcessPdf2Text", "application/pdf", {}) + gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {}) if not gotFile.callOk: return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData} tPdf = "" @@ -38,12 +39,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int): for i in range(startPN, endPN+1): pd = oPdf.pages[i-1] tPdf = tPdf + pd.extract_text() - return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf } + return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf } -def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): +def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): """ - Handle requests to pdf2text path, which is used to extract plain text + Handle requests to pdftext path, which is used to extract plain text from the specified pdf file. 
""" queryParams = urllib.parse.parse_qs(pr.query) @@ -54,8 +55,8 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): endP = queryParams.get('endPageNumber', -1) if isinstance(endP, list): endP = int(endP[0]) - print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...") - gotP2T = process_pdf2text(url, startP, endP) + print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...") + gotP2T = process_pdftext(url, startP, endP) if (gotP2T['status'] != 200): ph.send_error(gotP2T['status'], gotP2T['msg'] ) return @@ -64,5 +65,5 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): # Add CORS for browser fetch, just in case ph.send_header('Access-Control-Allow-Origin', '*') ph.end_headers() - print(f"INFO:HandlePdf2Text:ExtractedText:{url}...") + print(f"INFO:HandlePdfText:ExtractedText:{url}...") ph.wfile.write(gotP2T['data'].encode('utf-8')) diff --git a/tools/server/public_simplechat/local.tools/simpleproxy.py b/tools/server/public_simplechat/local.tools/simpleproxy.py index 4a74c6a254..862951f56a 100644 --- a/tools/server/public_simplechat/local.tools/simpleproxy.py +++ b/tools/server/public_simplechat/local.tools/simpleproxy.py @@ -48,7 +48,7 @@ gConfigType = { gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ] -gAllowedCalls = [ "urltext", "urlraw", "pdf2text" ] +gAllowedCalls = [ "urltext", "urlraw", "pdftext" ] def bearer_transform(): @@ -135,8 +135,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler): self.auth_and_run(pr, mWeb.handle_urlraw) case '/urltext': self.auth_and_run(pr, mWeb.handle_urltext) - case '/pdf2text': - self.auth_and_run(pr, mPdf.handle_pdf2text) + case '/pdftext': + self.auth_and_run(pr, mPdf.handle_pdftext) case '/aum': handle_aum(self, pr) case _: diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 9a8b586e6e..e3a835df0d 100644 --- a/tools/server/public_simplechat/readme.md +++ 
b/tools/server/public_simplechat/readme.md @@ -448,7 +448,7 @@ Either way always remember to cross check the tool requests and generated respon * search_web_text - search for the specified words using the configured search engine and return the plain textual content from the search result page. -* pdf2text - fetch/read specified pdf file and extract its textual content +* pdf_to_text - fetch/read specified pdf file and extract its textual content * this depends on the pypdf python based open source library the above set of web related tool calls work by handshaking with a bundled simple local web proxy @@ -469,7 +469,7 @@ Depending on the path specified wrt the proxy server, it executes the correspond urltext path is used (and not urlraw), the logic in addition to fetching content from given url, it tries to convert html content into equivalent plain text content to some extent in a simple minded manner by dropping head block as well as all scripts/styles/footers/headers/nav blocks and inturn -also dropping the html tags. Similarly for pdf2text. +also dropping the html tags. Similarly for pdftext. The client ui logic does a simple check to see if the bundled simpleproxy is running at specified proxyUrl before enabling these web and related tool calls. @@ -579,7 +579,7 @@ users) own data or data of ai model. Trap http response errors and inform user the specific error returned by ai server. -Initial go at a pdf2text tool call. It allows web / local pdf files to be read and their text content +Initial go at a pdftext tool call. It allows web / local pdf files to be read and their text content extracted and passed to ai model for further processing, as decided by ai and end user. One could either work with the full pdf or a subset of adjacent pages. 
diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index 56ddd8ae67..ba9ad93bfb 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -276,14 +276,14 @@ async function searchwebtext_setup(tcs) { // -// Pdf2Text +// PdfText // -let pdf2text_meta = { +let pdftext_meta = { "type": "function", "function": { - "name": "pdf2text", + "name": "pdf_to_text", "description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers", "parameters": { "type": "object", @@ -312,7 +312,7 @@ let pdf2text_meta = { * Expects a simple minded proxy server to be running locally * * listening on a configured port * * expecting http requests - * * with a query token named url wrt pdf2text path, + * * with a query token named url wrt pdftext path, * which gives the actual url to fetch * * gets the requested pdf and converts to text, before returning same. 
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful @@ -321,20 +321,20 @@ let pdf2text_meta = { * @param {string} toolname * @param {any} obj */ -function pdf2text_run(chatid, toolcallid, toolname, obj) { - return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdf2text'); +function pdftext_run(chatid, toolcallid, toolname, obj) { + return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdftext'); } /** - * Setup pdf2text for tool calling + * Setup pdftext for tool calling * NOTE: Currently the logic is setup for the bundled simpleproxy.py * @param {Object>} tcs */ -async function pdf2text_setup(tcs) { - return proxyserver_tc_setup('Pdf2Text', 'pdf2text', 'pdf2text', { - "handler": pdf2text_run, - "meta": pdf2text_meta, +async function pdftext_setup(tcs) { + return proxyserver_tc_setup('PdfText', 'pdftext', 'pdf_to_text', { + "handler": pdftext_run, + "meta": pdftext_meta, "result": "" }, tcs); } @@ -355,6 +355,6 @@ export async function init(toolsWorker) { await fetchweburlraw_setup(tc_switch) await fetchweburltext_setup(tc_switch) await searchwebtext_setup(tc_switch) - await pdf2text_setup(tc_switch) + await pdftext_setup(tc_switch) return tc_switch }