SimpleChatTC:PdfText:Cleanup rename to follow a common convention

Rename path and tags/identifiers from Pdf2Text to PdfText

Rename the tool call function to pdf_to_text; this should also help
indicate its semantics more unambiguously, just in case, especially
for smaller models.
This commit is contained in:
hanishkvc 2025-11-03 12:12:41 +05:30
parent 8501759f60
commit 1d1894ad14
4 changed files with 28 additions and 27 deletions

View File

@ -10,11 +10,12 @@ if TYPE_CHECKING:
from simpleproxy import ProxyHandler
def process_pdf2text(url: str, startPN: int, endPN: int):
def process_pdftext(url: str, startPN: int, endPN: int):
"""
Extract textual content from given pdf.
* Validate the got url.
* Get the pdf file.
* Extract textual contents of the pdf from given start page number to end page number (inclusive).
* if -1 | 0 is specified wrt startPN, the actual starting page number (rather 1) will be used.
* if -1 | 0 is specified wrt endPN, the actual ending page number will be used.
@ -23,10 +24,10 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
"""
import pypdf
import io
gotVU = uv.validate_url(url, "HandlePdf2Text")
gotVU = uv.validate_url(url, "HandlePdfText")
if not gotVU.callOk:
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
gotFile = mFile.get_file(url, "ProcessPdf2Text", "application/pdf", {})
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
if not gotFile.callOk:
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData}
tPdf = ""
@ -38,12 +39,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
for i in range(startPN, endPN+1):
pd = oPdf.pages[i-1]
tPdf = tPdf + pd.extract_text()
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf }
def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
"""
Handle requests to pdf2text path, which is used to extract plain text
Handle requests to pdftext path, which is used to extract plain text
from the specified pdf file.
"""
queryParams = urllib.parse.parse_qs(pr.query)
@ -54,8 +55,8 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
endP = queryParams.get('endPageNumber', -1)
if isinstance(endP, list):
endP = int(endP[0])
print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
gotP2T = process_pdf2text(url, startP, endP)
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
gotP2T = process_pdftext(url, startP, endP)
if (gotP2T['status'] != 200):
ph.send_error(gotP2T['status'], gotP2T['msg'] )
return
@ -64,5 +65,5 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
# Add CORS for browser fetch, just in case
ph.send_header('Access-Control-Allow-Origin', '*')
ph.end_headers()
print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
print(f"INFO:HandlePdfText:ExtractedText:{url}...")
ph.wfile.write(gotP2T['data'].encode('utf-8'))

View File

@ -48,7 +48,7 @@ gConfigType = {
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
gAllowedCalls = [ "urltext", "urlraw", "pdf2text" ]
gAllowedCalls = [ "urltext", "urlraw", "pdftext" ]
def bearer_transform():
@ -135,8 +135,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
self.auth_and_run(pr, mWeb.handle_urlraw)
case '/urltext':
self.auth_and_run(pr, mWeb.handle_urltext)
case '/pdf2text':
self.auth_and_run(pr, mPdf.handle_pdf2text)
case '/pdftext':
self.auth_and_run(pr, mPdf.handle_pdftext)
case '/aum':
handle_aum(self, pr)
case _:

View File

@ -448,7 +448,7 @@ Either way always remember to cross check the tool requests and generated respon
* search_web_text - search for the specified words using the configured search engine and return the
plain textual content from the search result page.
* pdf2text - fetch/read specified pdf file and extract its textual content
* pdf_to_text - fetch/read specified pdf file and extract its textual content
* this depends on the pypdf python based open source library
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
@ -469,7 +469,7 @@ Depending on the path specified wrt the proxy server, it executes the correspond
urltext path is used (and not urlraw), the logic in addition to fetching content from given url, it
tries to convert html content into equivalent plain text content to some extent in a simple minded
manner by dropping head block as well as all scripts/styles/footers/headers/nav blocks and in turn
also dropping the html tags. Similarly for pdf2text.
also dropping the html tags. Similarly for pdftext.
The client ui logic does a simple check to see if the bundled simpleproxy is running at specified
proxyUrl before enabling these web and related tool calls.
@ -579,7 +579,7 @@ users) own data or data of ai model.
Trap http response errors and inform user the specific error returned by ai server.
Initial go at a pdf2text tool call. It allows web / local pdf files to be read and their text content
Initial go at a pdftext tool call. It allows web / local pdf files to be read and their text content
extracted and passed to ai model for further processing, as decided by ai and end user. One could
either work with the full pdf or a subset of adjacent pages.

View File

@ -276,14 +276,14 @@ async function searchwebtext_setup(tcs) {
//
// Pdf2Text
// PdfText
//
let pdf2text_meta = {
let pdftext_meta = {
"type": "function",
"function": {
"name": "pdf2text",
"name": "pdf_to_text",
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
"parameters": {
"type": "object",
@ -312,7 +312,7 @@ let pdf2text_meta = {
* Expects a simple minded proxy server to be running locally
* * listening on a configured port
* * expecting http requests
* * with a query token named url wrt pdf2text path,
* * with a query token named url wrt pdftext path,
* which gives the actual url to fetch
* * gets the requested pdf and converts to text, before returning same.
* ALERT: Accesses a separate/external web proxy/caching server, be aware and careful
@ -321,20 +321,20 @@ let pdf2text_meta = {
* @param {string} toolname
* @param {any} obj
*/
function pdf2text_run(chatid, toolcallid, toolname, obj) {
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdf2text');
function pdftext_run(chatid, toolcallid, toolname, obj) {
    // Delegates to proxyserver_get_anyargs, passing 'pdftext' as the final
    // argument (the proxy server path named in the doc comment above).
    const proxyPath = 'pdftext';
    return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, proxyPath);
}
/**
* Setup pdf2text for tool calling
* Setup pdftext for tool calling
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
* @param {Object<string, Object<string, any>>} tcs
*/
async function pdf2text_setup(tcs) {
return proxyserver_tc_setup('Pdf2Text', 'pdf2text', 'pdf2text', {
"handler": pdf2text_run,
"meta": pdf2text_meta,
async function pdftext_setup(tcs) {
    // Entry registered for the pdf_to_text tool call: its handler, its
    // metadata object, and a result field that starts out as an empty string.
    const toolEntry = {
        "handler": pdftext_run,
        "meta": pdftext_meta,
        "result": ""
    };
    // Presumably the three strings are the log tag, the proxy path and the
    // tool-call name, in that order -- verify against proxyserver_tc_setup.
    return proxyserver_tc_setup('PdfText', 'pdftext', 'pdf_to_text', toolEntry, tcs);
}
@ -355,6 +355,6 @@ export async function init(toolsWorker) {
await fetchweburlraw_setup(tc_switch)
await fetchweburltext_setup(tc_switch)
await searchwebtext_setup(tc_switch)
await pdf2text_setup(tc_switch)
await pdftext_setup(tc_switch)
return tc_switch
}