SimpleChatTC:PdfText:Cleanup rename to follow a common convention
Rename the path and tags/identifiers from Pdf2Text to PdfText. Rename the function call to pdf_to_text; this should also help convey the semantics more unambiguously, just in case, especially for smaller models.
This commit is contained in:
parent
8501759f60
commit
1d1894ad14
|
|
@ -10,11 +10,12 @@ if TYPE_CHECKING:
|
||||||
from simpleproxy import ProxyHandler
|
from simpleproxy import ProxyHandler
|
||||||
|
|
||||||
|
|
||||||
def process_pdf2text(url: str, startPN: int, endPN: int):
|
def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
"""
|
"""
|
||||||
Extract textual content from given pdf.
|
Extract textual content from given pdf.
|
||||||
|
|
||||||
* Validate the got url.
|
* Validate the got url.
|
||||||
|
* Get the pdf file.
|
||||||
* Extract textual contents of the pdf from given start page number to end page number (inclusive).
|
* Extract textual contents of the pdf from given start page number to end page number (inclusive).
|
||||||
* if -1 | 0 is specified wrt startPN, the actual starting page number (rather 1) will be used.
|
* if -1 | 0 is specified wrt startPN, the actual starting page number (rather 1) will be used.
|
||||||
* if -1 | 0 is specified wrt endPN, the actual ending page number will be used.
|
* if -1 | 0 is specified wrt endPN, the actual ending page number will be used.
|
||||||
|
|
@ -23,10 +24,10 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
|
||||||
"""
|
"""
|
||||||
import pypdf
|
import pypdf
|
||||||
import io
|
import io
|
||||||
gotVU = uv.validate_url(url, "HandlePdf2Text")
|
gotVU = uv.validate_url(url, "HandlePdfText")
|
||||||
if not gotVU.callOk:
|
if not gotVU.callOk:
|
||||||
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
|
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
|
||||||
gotFile = mFile.get_file(url, "ProcessPdf2Text", "application/pdf", {})
|
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
|
||||||
if not gotFile.callOk:
|
if not gotFile.callOk:
|
||||||
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData}
|
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData}
|
||||||
tPdf = ""
|
tPdf = ""
|
||||||
|
|
@ -38,12 +39,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
|
||||||
for i in range(startPN, endPN+1):
|
for i in range(startPN, endPN+1):
|
||||||
pd = oPdf.pages[i-1]
|
pd = oPdf.pages[i-1]
|
||||||
tPdf = tPdf + pd.extract_text()
|
tPdf = tPdf + pd.extract_text()
|
||||||
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf }
|
||||||
|
|
||||||
|
|
||||||
def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
"""
|
"""
|
||||||
Handle requests to pdf2text path, which is used to extract plain text
|
Handle requests to pdftext path, which is used to extract plain text
|
||||||
from the specified pdf file.
|
from the specified pdf file.
|
||||||
"""
|
"""
|
||||||
queryParams = urllib.parse.parse_qs(pr.query)
|
queryParams = urllib.parse.parse_qs(pr.query)
|
||||||
|
|
@ -54,8 +55,8 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
endP = queryParams.get('endPageNumber', -1)
|
endP = queryParams.get('endPageNumber', -1)
|
||||||
if isinstance(endP, list):
|
if isinstance(endP, list):
|
||||||
endP = int(endP[0])
|
endP = int(endP[0])
|
||||||
print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
|
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
|
||||||
gotP2T = process_pdf2text(url, startP, endP)
|
gotP2T = process_pdftext(url, startP, endP)
|
||||||
if (gotP2T['status'] != 200):
|
if (gotP2T['status'] != 200):
|
||||||
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
||||||
return
|
return
|
||||||
|
|
@ -64,5 +65,5 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
# Add CORS for browser fetch, just in case
|
# Add CORS for browser fetch, just in case
|
||||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
ph.send_header('Access-Control-Allow-Origin', '*')
|
||||||
ph.end_headers()
|
ph.end_headers()
|
||||||
print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
|
print(f"INFO:HandlePdfText:ExtractedText:{url}...")
|
||||||
ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ gConfigType = {
|
||||||
|
|
||||||
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
|
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
|
||||||
|
|
||||||
gAllowedCalls = [ "urltext", "urlraw", "pdf2text" ]
|
gAllowedCalls = [ "urltext", "urlraw", "pdftext" ]
|
||||||
|
|
||||||
|
|
||||||
def bearer_transform():
|
def bearer_transform():
|
||||||
|
|
@ -135,8 +135,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
||||||
self.auth_and_run(pr, mWeb.handle_urlraw)
|
self.auth_and_run(pr, mWeb.handle_urlraw)
|
||||||
case '/urltext':
|
case '/urltext':
|
||||||
self.auth_and_run(pr, mWeb.handle_urltext)
|
self.auth_and_run(pr, mWeb.handle_urltext)
|
||||||
case '/pdf2text':
|
case '/pdftext':
|
||||||
self.auth_and_run(pr, mPdf.handle_pdf2text)
|
self.auth_and_run(pr, mPdf.handle_pdftext)
|
||||||
case '/aum':
|
case '/aum':
|
||||||
handle_aum(self, pr)
|
handle_aum(self, pr)
|
||||||
case _:
|
case _:
|
||||||
|
|
|
||||||
|
|
@ -448,7 +448,7 @@ Either way always remember to cross check the tool requests and generated respon
|
||||||
* search_web_text - search for the specified words using the configured search engine and return the
|
* search_web_text - search for the specified words using the configured search engine and return the
|
||||||
plain textual content from the search result page.
|
plain textual content from the search result page.
|
||||||
|
|
||||||
* pdf2text - fetch/read specified pdf file and extract its textual content
|
* pdf_to_text - fetch/read specified pdf file and extract its textual content
|
||||||
* this depends on the pypdf python based open source library
|
* this depends on the pypdf python based open source library
|
||||||
|
|
||||||
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
||||||
|
|
@ -469,7 +469,7 @@ Depending on the path specified wrt the proxy server, it executes the correspond
|
||||||
urltext path is used (and not urlraw), the logic in addition to fetching content from given url, it
|
urltext path is used (and not urlraw), the logic in addition to fetching content from given url, it
|
||||||
tries to convert html content into equivalent plain text content to some extent in a simple minded
|
tries to convert html content into equivalent plain text content to some extent in a simple minded
|
||||||
manner by dropping head block as well as all scripts/styles/footers/headers/nav blocks and inturn
|
manner by dropping head block as well as all scripts/styles/footers/headers/nav blocks and inturn
|
||||||
also dropping the html tags. Similarly for pdf2text.
|
also dropping the html tags. Similarly for pdftext.
|
||||||
|
|
||||||
The client ui logic does a simple check to see if the bundled simpleproxy is running at specified
|
The client ui logic does a simple check to see if the bundled simpleproxy is running at specified
|
||||||
proxyUrl before enabling these web and related tool calls.
|
proxyUrl before enabling these web and related tool calls.
|
||||||
|
|
@ -579,7 +579,7 @@ users) own data or data of ai model.
|
||||||
|
|
||||||
Trap http response errors and inform user the specific error returned by ai server.
|
Trap http response errors and inform user the specific error returned by ai server.
|
||||||
|
|
||||||
Initial go at a pdf2text tool call. It allows web / local pdf files to be read and their text content
|
Initial go at a pdftext tool call. It allows web / local pdf files to be read and their text content
|
||||||
extracted and passed to ai model for further processing, as decided by ai and end user. One could
|
extracted and passed to ai model for further processing, as decided by ai and end user. One could
|
||||||
either work with the full pdf or a subset of adjacent pages.
|
either work with the full pdf or a subset of adjacent pages.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -276,14 +276,14 @@ async function searchwebtext_setup(tcs) {
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Pdf2Text
|
// PdfText
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
||||||
let pdf2text_meta = {
|
let pdftext_meta = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "pdf2text",
|
"name": "pdf_to_text",
|
||||||
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
|
|
@ -312,7 +312,7 @@ let pdf2text_meta = {
|
||||||
* Expects a simple minded proxy server to be running locally
|
* Expects a simple minded proxy server to be running locally
|
||||||
* * listening on a configured port
|
* * listening on a configured port
|
||||||
* * expecting http requests
|
* * expecting http requests
|
||||||
* * with a query token named url wrt pdf2text path,
|
* * with a query token named url wrt pdftext path,
|
||||||
* which gives the actual url to fetch
|
* which gives the actual url to fetch
|
||||||
* * gets the requested pdf and converts to text, before returning same.
|
* * gets the requested pdf and converts to text, before returning same.
|
||||||
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
||||||
|
|
@ -321,20 +321,20 @@ let pdf2text_meta = {
|
||||||
* @param {string} toolname
|
* @param {string} toolname
|
||||||
* @param {any} obj
|
* @param {any} obj
|
||||||
*/
|
*/
|
||||||
function pdf2text_run(chatid, toolcallid, toolname, obj) {
|
function pdftext_run(chatid, toolcallid, toolname, obj) {
|
||||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdf2text');
|
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdftext');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Setup pdf2text for tool calling
|
* Setup pdftext for tool calling
|
||||||
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
||||||
* @param {Object<string, Object<string, any>>} tcs
|
* @param {Object<string, Object<string, any>>} tcs
|
||||||
*/
|
*/
|
||||||
async function pdf2text_setup(tcs) {
|
async function pdftext_setup(tcs) {
|
||||||
return proxyserver_tc_setup('Pdf2Text', 'pdf2text', 'pdf2text', {
|
return proxyserver_tc_setup('PdfText', 'pdftext', 'pdf_to_text', {
|
||||||
"handler": pdf2text_run,
|
"handler": pdftext_run,
|
||||||
"meta": pdf2text_meta,
|
"meta": pdftext_meta,
|
||||||
"result": ""
|
"result": ""
|
||||||
}, tcs);
|
}, tcs);
|
||||||
}
|
}
|
||||||
|
|
@ -355,6 +355,6 @@ export async function init(toolsWorker) {
|
||||||
await fetchweburlraw_setup(tc_switch)
|
await fetchweburlraw_setup(tc_switch)
|
||||||
await fetchweburltext_setup(tc_switch)
|
await fetchweburltext_setup(tc_switch)
|
||||||
await searchwebtext_setup(tc_switch)
|
await searchwebtext_setup(tc_switch)
|
||||||
await pdf2text_setup(tc_switch)
|
await pdftext_setup(tc_switch)
|
||||||
return tc_switch
|
return tc_switch
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue