SimpleSallap:SimpleMCP:TCPdf: update
Reimplement pdftext around the toolcall class flow (TCPdfText with tcf_meta/tc_handle returning TCOutResponse)
This commit is contained in:
parent
01a7800f51
commit
4ce55eb0af
|
|
@ -4,11 +4,10 @@
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urlvalidator as uv
|
import urlvalidator as uv
|
||||||
import filemagic as mFile
|
import filemagic as mFile
|
||||||
|
import toolcall as mTC
|
||||||
|
import http.client
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from simpleproxy import ProxyHandler
|
|
||||||
|
|
||||||
|
|
||||||
PDFOUTLINE_MAXDEPTH=4
|
PDFOUTLINE_MAXDEPTH=4
|
||||||
|
|
||||||
|
|
@ -44,12 +43,12 @@ def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
"""
|
"""
|
||||||
import pypdf
|
import pypdf
|
||||||
import io
|
import io
|
||||||
gotVU = uv.validate_url(url, "HandlePdfText")
|
gotVU = uv.validate_url(url, "ProcessPdfText")
|
||||||
if not gotVU.callOk:
|
if not gotVU.callOk:
|
||||||
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg }
|
return mTC.TCOutResponse(False, gotVU.statusCode, gotVU.statusMsg)
|
||||||
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
|
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
|
||||||
if not gotFile.callOk:
|
if not gotFile.callOk:
|
||||||
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData}
|
return mTC.TCOutResponse(False, gotFile.statusCode, gotFile.statusMsg, gotFile.contentType, gotFile.contentData)
|
||||||
tPdf = ""
|
tPdf = ""
|
||||||
oPdf = pypdf.PdfReader(io.BytesIO(gotFile.contentData))
|
oPdf = pypdf.PdfReader(io.BytesIO(gotFile.contentData))
|
||||||
if (startPN <= 0):
|
if (startPN <= 0):
|
||||||
|
|
@ -64,31 +63,45 @@ def process_pdftext(url: str, startPN: int, endPN: int):
|
||||||
for i in range(startPN, endPN+1):
|
for i in range(startPN, endPN+1):
|
||||||
pd = oPdf.pages[i-1]
|
pd = oPdf.pages[i-1]
|
||||||
tPdf = tPdf + pd.extract_text()
|
tPdf = tPdf + pd.extract_text()
|
||||||
return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf }
|
return mTC.TCOutResponse(True, 200, "PdfText Response follows", "text/text", tPdf.encode('utf-8'))
|
||||||
|
|
||||||
|
|
||||||
def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
class TCPdfText(mTC.ToolCall):
|
||||||
|
|
||||||
|
def tcf_meta(self) -> mTC.TCFunction:
|
||||||
|
return mTC.TCFunction(
|
||||||
|
self.name,
|
||||||
|
"Fetch pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
|
||||||
|
mTC.TCInParameters(
|
||||||
|
"object",
|
||||||
|
{
|
||||||
|
"url": mTC.TCInProperty(
|
||||||
|
"string",
|
||||||
|
"local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text"
|
||||||
|
),
|
||||||
|
"startPageNumber": mTC.TCInProperty(
|
||||||
|
"integer",
|
||||||
|
"Specify the starting page number within the pdf, this is optional. If not specified set to first page."
|
||||||
|
),
|
||||||
|
"endPageNumber": mTC.TCInProperty(
|
||||||
|
"integer",
|
||||||
|
"Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
|
||||||
|
)
|
||||||
|
},
|
||||||
|
[ "url" ]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
|
||||||
"""
|
"""
|
||||||
Handle requests to pdftext path, which is used to extract plain text
|
Handle pdftext request,
|
||||||
from the specified pdf file.
|
which is used to extract plain text from the specified pdf file.
|
||||||
"""
|
"""
|
||||||
queryParams = urllib.parse.parse_qs(pr.query)
|
try:
|
||||||
url = queryParams['url'][0]
|
url = args['url']
|
||||||
startP = queryParams.get('startPageNumber', -1)
|
startP = int(args.get('startPageNumber', -1))
|
||||||
if isinstance(startP, list):
|
endP = int(args.get('endPageNumber', -1))
|
||||||
startP = int(startP[0])
|
|
||||||
endP = queryParams.get('endPageNumber', -1)
|
|
||||||
if isinstance(endP, list):
|
|
||||||
endP = int(endP[0])
|
|
||||||
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
|
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
|
||||||
gotP2T = process_pdftext(url, startP, endP)
|
return process_pdftext(url, startP, endP)
|
||||||
if (gotP2T['status'] != 200):
|
except Exception as exc:
|
||||||
ph.send_error(gotP2T['status'], gotP2T['msg'] )
|
return mTC.TCOutResponse(False, 502, f"WARN:HandlePdfText:Failed:{exc}")
|
||||||
return
|
|
||||||
ph.send_response(gotP2T['status'], gotP2T['msg'])
|
|
||||||
ph.send_header('Content-Type', 'text/text')
|
|
||||||
# Add CORS for browser fetch, just in case
|
|
||||||
ph.send_header('Access-Control-Allow-Origin', '*')
|
|
||||||
ph.end_headers()
|
|
||||||
print(f"INFO:HandlePdfText:ExtractedText:{url}...")
|
|
||||||
ph.wfile.write(gotP2T['data'].encode('utf-8'))
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue