SimpleSallap:SimpleMCP:TCPdf: update

Implement pdftext around toolcall class++ flow
This commit is contained in:
hanishkvc 2025-12-06 15:27:13 +05:30
parent 01a7800f51
commit 4ce55eb0af
1 changed file with 45 additions and 32 deletions

View File

@ -4,11 +4,10 @@
import urllib.parse import urllib.parse
import urlvalidator as uv import urlvalidator as uv
import filemagic as mFile import filemagic as mFile
import toolcall as mTC
import http.client
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from simpleproxy import ProxyHandler
PDFOUTLINE_MAXDEPTH=4 PDFOUTLINE_MAXDEPTH=4
@ -44,12 +43,12 @@ def process_pdftext(url: str, startPN: int, endPN: int):
""" """
import pypdf import pypdf
import io import io
gotVU = uv.validate_url(url, "HandlePdfText") gotVU = uv.validate_url(url, "ProcessPdfText")
if not gotVU.callOk: if not gotVU.callOk:
return { 'status': gotVU.statusCode, 'msg': gotVU.statusMsg } return mTC.TCOutResponse(False, gotVU.statusCode, gotVU.statusMsg)
gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {}) gotFile = mFile.get_file(url, "ProcessPdfText", "application/pdf", {})
if not gotFile.callOk: if not gotFile.callOk:
return { 'status': gotFile.statusCode, 'msg': gotFile.statusMsg, 'data': gotFile.contentData} return mTC.TCOutResponse(False, gotFile.statusCode, gotFile.statusMsg, gotFile.contentType, gotFile.contentData)
tPdf = "" tPdf = ""
oPdf = pypdf.PdfReader(io.BytesIO(gotFile.contentData)) oPdf = pypdf.PdfReader(io.BytesIO(gotFile.contentData))
if (startPN <= 0): if (startPN <= 0):
@ -64,31 +63,45 @@ def process_pdftext(url: str, startPN: int, endPN: int):
for i in range(startPN, endPN+1): for i in range(startPN, endPN+1):
pd = oPdf.pages[i-1] pd = oPdf.pages[i-1]
tPdf = tPdf + pd.extract_text() tPdf = tPdf + pd.extract_text()
return { 'status': 200, 'msg': "PdfText Response follows", 'data': tPdf } return mTC.TCOutResponse(True, 200, "PdfText Response follows", "text/text", tPdf.encode('utf-8'))
def handle_pdftext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): class TCPdfText(mTC.ToolCall):
"""
Handle requests to pdftext path, which is used to extract plain text def tcf_meta(self) -> mTC.TCFunction:
from the specified pdf file. return mTC.TCFunction(
""" self.name,
queryParams = urllib.parse.parse_qs(pr.query) "Fetch pdf from requested local file path / web url through a proxy server and return its text content after converting pdf to text, in few seconds. One is allowed to get a part of the pdf by specifying the starting and ending page numbers",
url = queryParams['url'][0] mTC.TCInParameters(
startP = queryParams.get('startPageNumber', -1) "object",
if isinstance(startP, list): {
startP = int(startP[0]) "url": mTC.TCInProperty(
endP = queryParams.get('endPageNumber', -1) "string",
if isinstance(endP, list): "local file path (file://) / web (http/https) based url of the pdf that will be got and inturn converted to text"
endP = int(endP[0]) ),
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...") "startPageNumber": mTC.TCInProperty(
gotP2T = process_pdftext(url, startP, endP) "integer",
if (gotP2T['status'] != 200): "Specify the starting page number within the pdf, this is optional. If not specified set to first page."
ph.send_error(gotP2T['status'], gotP2T['msg'] ) ),
return "endPageNumber": mTC.TCInProperty(
ph.send_response(gotP2T['status'], gotP2T['msg']) "integer",
ph.send_header('Content-Type', 'text/text') "Specify the ending page number within the pdf, this is optional. If not specified set to the last page."
# Add CORS for browser fetch, just in case )
ph.send_header('Access-Control-Allow-Origin', '*') },
ph.end_headers() [ "url" ]
print(f"INFO:HandlePdfText:ExtractedText:{url}...") )
ph.wfile.write(gotP2T['data'].encode('utf-8')) )
def tc_handle(self, args: mTC.TCInArgs, inHeaders: http.client.HTTPMessage) -> mTC.TCOutResponse:
"""
Handle pdftext request,
which is used to extract plain text from the specified pdf file.
"""
try:
url = args['url']
startP = int(args.get('startPageNumber', -1))
endP = int(args.get('endPageNumber', -1))
print(f"INFO:HandlePdfText:Processing:{url}:{startP}:{endP}...")
return process_pdftext(url, startP, endP)
except Exception as exc:
return mTC.TCOutResponse(False, 502, f"WARN:HandlePdfText:Failed:{exc}")