SimpleChatTC:PdfText:Cleanup rename to follow a common convention
Rename the path and the tags/identifiers from Pdf2Text to PdfText. Rename the tool-call function to pdf_to_text; this should also help indicate the semantics more unambiguously, just in case, especially for smaller models.
This commit is contained in:
parent
8501759f60
commit
1d1894ad14
|
|
@ -10,11 +10,12 @@ if TYPE_CHECKING:
|
|||
from simpleproxy import ProxyHandler
|
||||
|
||||
|
||||
def process_pdf2text(url: str, startPN: int, endPN: int):
|
||||
def process_pdftext(url: str, startPN: int, endPN: int):
|
||||
"""
|
||||
Extract textual content from given pdf.
|
||||
|
||||
* Validate the got url.
|
||||
* Get the pdf file.
|
||||
* Extract textual contents of the pdf from given start page number to end page number (inclusive).
|
||||
* if -1 | 0 is specified wrt startPN, the actual starting page number (i.e. 1) will be used.
|
||||
* if -1 | 0 is specified wrt endPN, the actual ending page number will be used.
|
||||
|
|
@ -23,10 +24,10 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
|
|||
"""
|
||||
import pypdf
|
||||
import io
|
||||
gotVU = uv.validate_url(url, "HandlePdf2Text")
|
||||
gotVU = uv.validate_url(url, "HandlePdfText")
|
||||
if not gotVU.callOk:
|
||||
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
|
||||
gotFile = mFile.get_file(url, "ProcessPdf2Text", "application/pdf", {})
|
||||
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
|
||||
if not gotFile.callOk:
|
||||
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData}
|
||||
tPdf = ""
|
||||
|
|
@ -38,12 +39,12 @@ def process_pdf2text(url: str, startPN: int, endPN: int):
|
|||
for i in range(startPN, endPN+1):
|
||||
pd = oPdf.pages[i-1]
|
||||
tPdf = tPdf + pd.extract_text()
|
||||
return { 'status': 200, 'msg': "Pdf2Text Response follows", 'data': tPdf }
|
||||
return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf }
|
||||
|
||||
|
||||
def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||
def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||
"""
|
||||
Handle requests to pdf2text path, which is used to extract plain text
|
||||
Handle requests to pdftext path, which is used to extract plain text
|
||||
from the specified pdf file.
|
||||
"""
|
||||
queryParams = urllib.parse.parse_qs(pr.query)
|
||||
|
|
@ -54,8 +55,8 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
|||
endP = queryParams.get('endPageNumber', -1)
|
||||
if isinstance(endP, list):
|
||||
endP = int(endP[0])
|
||||
print(f"INFO:HandlePdf2Text:Processing:{url}:{startP}:{endP}...")
|
||||
gotP2T = process_pdf2text(url, startP, endP)
|
||||
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
|
||||
gotP2T = process_pdftext(url, startP, endP)
|
||||
if (gotP2T['status'] != 200):
|
||||
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
||||
return
|
||||
|
|
@ -64,5 +65,5 @@ def handle_pdf2text(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
|||
# Add CORS for browser fetch, just in case
|
||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
||||
ph.end_headers()
|
||||
print(f"INFO:HandlePdf2Text:ExtractedText:{url}...")
|
||||
print(f"INFO:HandlePdfText:ExtractedText:{url}...")
|
||||
ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ gConfigType = {
|
|||
|
||||
gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure' ]
|
||||
|
||||
gAllowedCalls = [ "urltext", "urlraw", "pdf2text" ]
|
||||
gAllowedCalls = [ "urltext", "urlraw", "pdftext" ]
|
||||
|
||||
|
||||
def bearer_transform():
|
||||
|
|
@ -135,8 +135,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
|||
self.auth_and_run(pr, mWeb.handle_urlraw)
|
||||
case '/urltext':
|
||||
self.auth_and_run(pr, mWeb.handle_urltext)
|
||||
case '/pdf2text':
|
||||
self.auth_and_run(pr, mPdf.handle_pdf2text)
|
||||
case '/pdftext':
|
||||
self.auth_and_run(pr, mPdf.handle_pdftext)
|
||||
case '/aum':
|
||||
handle_aum(self, pr)
|
||||
case _:
|
||||
|
|
|
|||
|
|
@ -448,7 +448,7 @@ Either way always remember to cross check the tool requests and generated respon
|
|||
* search_web_text - search for the specified words using the configured search engine and return the
|
||||
plain textual content from the search result page.
|
||||
|
||||
* pdf2text - fetch/read specified pdf file and extract its textual content
|
||||
* pdf_to_text - fetch/read specified pdf file and extract its textual content
|
||||
* this depends on the pypdf python based open source library
|
||||
|
||||
the above set of web related tool calls work by handshaking with a bundled simple local web proxy
|
||||
|
|
@ -469,7 +469,7 @@ Depending on the path specified wrt the proxy server, it executes the correspond
|
|||
urltext path is used (and not urlraw), the logic in addition to fetching content from given url, it
|
||||
tries to convert html content into equivalent plain text content to some extent in a simple minded
|
||||
manner by dropping head block as well as all scripts/styles/footers/headers/nav blocks and in turn
|
||||
also dropping the html tags. Similarly for pdf2text.
|
||||
also dropping the html tags. Similarly for pdftext.
|
||||
|
||||
The client ui logic does a simple check to see if the bundled simpleproxy is running at specified
|
||||
proxyUrl before enabling these web and related tool calls.
|
||||
|
|
@ -579,7 +579,7 @@ users) own data or data of ai model.
|
|||
|
||||
Trap http response errors and inform user the specific error returned by ai server.
|
||||
|
||||
Initial go at a pdf2text tool call. It allows web / local pdf files to be read and their text content
|
||||
Initial go at a pdftext tool call. It allows web / local pdf files to be read and their text content
|
||||
extracted and passed to ai model for further processing, as decided by ai and end user. One could
|
||||
either work with the full pdf or a subset of adjacent pages.
|
||||
|
||||
|
|
|
|||
|
|
@ -276,14 +276,14 @@ async function searchwebtext_setup(tcs) {
|
|||
|
||||
|
||||
//
|
||||
// Pdf2Text
|
||||
// PdfText
|
||||
//
|
||||
|
||||
|
||||
let pdf2text_meta = {
|
||||
let pdftext_meta = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "pdf2text",
|
||||
"name": "pdf_to_text",
|
||||
"description": "Read pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
|
|
@ -312,7 +312,7 @@ let pdf2text_meta = {
|
|||
* Expects a simple minded proxy server to be running locally
|
||||
* * listening on a configured port
|
||||
* * expecting http requests
|
||||
* * with a query token named url wrt pdf2text path,
|
||||
* * with a query token named url wrt pdftext path,
|
||||
* which gives the actual url to fetch
|
||||
* * gets the requested pdf and converts to text, before returning same.
|
||||
* ALERT: Accesses a separate/external web proxy/caching server, be aware and careful
|
||||
|
|
@ -321,20 +321,20 @@ let pdf2text_meta = {
|
|||
* @param {string} toolname
|
||||
* @param {any} obj
|
||||
*/
|
||||
function pdf2text_run(chatid, toolcallid, toolname, obj) {
|
||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdf2text');
|
||||
function pdftext_run(chatid, toolcallid, toolname, obj) {
|
||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'pdftext');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Setup pdf2text for tool calling
|
||||
* Setup pdftext for tool calling
|
||||
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
||||
* @param {Object<string, Object<string, any>>} tcs
|
||||
*/
|
||||
async function pdf2text_setup(tcs) {
|
||||
return proxyserver_tc_setup('Pdf2Text', 'pdf2text', 'pdf2text', {
|
||||
"handler": pdf2text_run,
|
||||
"meta": pdf2text_meta,
|
||||
async function pdftext_setup(tcs) {
|
||||
return proxyserver_tc_setup('PdfText', 'pdftext', 'pdf_to_text', {
|
||||
"handler": pdftext_run,
|
||||
"meta": pdftext_meta,
|
||||
"result": ""
|
||||
}, tcs);
|
||||
}
|
||||
|
|
@ -355,6 +355,6 @@ export async function init(toolsWorker) {
|
|||
await fetchweburlraw_setup(tc_switch)
|
||||
await fetchweburltext_setup(tc_switch)
|
||||
await searchwebtext_setup(tc_switch)
|
||||
await pdf2text_setup(tc_switch)
|
||||
await pdftext_setup(tc_switch)
|
||||
return tc_switch
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue