SimpleChatTC:Rename fetch_web_url_text to fetch_html_text
This makes it easier for the AI model to understand that this tool works mainly for HTML pages and not for, say, XML or PDF content. For those, one needs to use the other explicit tool calls provided, such as fetchpdftext or fetchxmltext. The server service path is renamed from urltext to htmltext. SearchWebText is also updated to use htmltext now.
This commit is contained in:
parent
c0f7c8654e
commit
143f9c0b1a
|
|
@ -50,7 +50,7 @@ gConfigNeeded = [ '--allowed.schemes', '--allowed.domains', '--bearer.insecure'
|
||||||
|
|
||||||
gAllowedCalls = {
|
gAllowedCalls = {
|
||||||
"xmlfiltered": [],
|
"xmlfiltered": [],
|
||||||
"urltext": [],
|
"htmltext": [],
|
||||||
"urlraw": [],
|
"urlraw": [],
|
||||||
"pdftext": [ "pypdf" ]
|
"pdftext": [ "pypdf" ]
|
||||||
}
|
}
|
||||||
|
|
@ -138,8 +138,8 @@ class ProxyHandler(http.server.BaseHTTPRequestHandler):
|
||||||
match pr.path:
|
match pr.path:
|
||||||
case '/urlraw':
|
case '/urlraw':
|
||||||
self.auth_and_run(pr, mWeb.handle_urlraw)
|
self.auth_and_run(pr, mWeb.handle_urlraw)
|
||||||
case '/urltext':
|
case '/htmltext':
|
||||||
self.auth_and_run(pr, mWeb.handle_urltext)
|
self.auth_and_run(pr, mWeb.handle_htmltext)
|
||||||
case '/xmlfiltered':
|
case '/xmlfiltered':
|
||||||
self.auth_and_run(pr, mWeb.handle_xmlfiltered)
|
self.auth_and_run(pr, mWeb.handle_xmlfiltered)
|
||||||
case '/pdftext':
|
case '/pdftext':
|
||||||
|
|
|
||||||
|
|
@ -192,15 +192,15 @@ class TextHtmlParser(html.parser.HTMLParser):
|
||||||
return self.textStripped
|
return self.textStripped
|
||||||
|
|
||||||
|
|
||||||
def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
def handle_htmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
try:
|
try:
|
||||||
# Get requested url
|
# Get requested url
|
||||||
got = handle_urlreq(ph, pr, "HandleUrlText")
|
got = handle_urlreq(ph, pr, "HandleHtmlText")
|
||||||
if not got.callOk:
|
if not got.callOk:
|
||||||
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
ph.send_error(got.httpStatus, got.httpStatusMsg)
|
||||||
return
|
return
|
||||||
# Extract Text
|
# Extract Text
|
||||||
tagDrops = ph.headers.get('urltext-tag-drops')
|
tagDrops = ph.headers.get('htmltext-tag-drops')
|
||||||
if not tagDrops:
|
if not tagDrops:
|
||||||
tagDrops = []
|
tagDrops = []
|
||||||
else:
|
else:
|
||||||
|
|
@ -216,7 +216,7 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
|
ph.wfile.write(textHtml.get_stripped_text().encode('utf-8'))
|
||||||
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
|
debug.dump({ 'RawText': 'yes', 'StrippedText': 'yes' }, { 'RawText': textHtml.text, 'StrippedText': textHtml.get_stripped_text() })
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
ph.send_error(502, f"WARN:UrlTextFailed:{exc}")
|
ph.send_error(502, f"WARN:HtmlText:Failed:{exc}")
|
||||||
|
|
||||||
|
|
||||||
class XMLFilterParser(html.parser.HTMLParser):
|
class XMLFilterParser(html.parser.HTMLParser):
|
||||||
|
|
|
||||||
|
|
@ -453,7 +453,7 @@ Either way always remember to cross check the tool requests and generated respon
|
||||||
|
|
||||||
* fetch_web_url_raw - fetch contents of the requested url through a proxy server
|
* fetch_web_url_raw - fetch contents of the requested url through a proxy server
|
||||||
|
|
||||||
* fetch_web_url_text - fetch text parts of the content from the requested url through a proxy server.
|
* fetch_html_text - fetch text parts of the html content from the requested url through a proxy server.
|
||||||
Related logic tries to strip html response of html tags and also head, script, style, header,footer,
|
Related logic tries to strip html response of html tags and also head, script, style, header,footer,
|
||||||
nav, ... blocks.
|
nav, ... blocks.
|
||||||
|
|
||||||
|
|
@ -669,6 +669,10 @@ sliding window based drop off or even before they kick in, this can help in many
|
||||||
arguments generated by the ai. This ensures that the chat ui itself doesnt get stuck in it. Instead now
|
arguments generated by the ai. This ensures that the chat ui itself doesnt get stuck in it. Instead now
|
||||||
the tool call response can inform the ai model that its function call had issues.
|
the tool call response can inform the ai model that its function call had issues.
|
||||||
|
|
||||||
|
* renamed fetch_web_url_text to fetch_html_text, so that gen ai model wont try to use this to fetch xml or
|
||||||
|
rss files, because it will return empty content, because there wont be any html content to strip the tags
|
||||||
|
and unwanted blocks before returning.
|
||||||
|
|
||||||
|
|
||||||
#### ToDo
|
#### ToDo
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,16 @@
|
||||||
// by Humans for All
|
// by Humans for All
|
||||||
//
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// The simpleproxy.py server is expected to provide the below services
|
||||||
|
// urlraw - fetch the request url content as is
|
||||||
|
// htmltext - fetch the requested html content and provide plain text version
|
||||||
|
// after stripping it of tag blocks like head, script, style, header, footer, nav, ...
|
||||||
|
// pdftext - fetch the requested pdf and provide the plain text version
|
||||||
|
// xmlfiltered - fetch the requested xml content and provide a optionally filtered version of same
|
||||||
|
//
|
||||||
|
|
||||||
|
|
||||||
import * as mChatMagic from './simplechat.js'
|
import * as mChatMagic from './simplechat.js'
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -141,21 +151,21 @@ async function fetchweburlraw_setup(tcs) {
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Fetch Web Url Text
|
// Fetch html Text
|
||||||
//
|
//
|
||||||
|
|
||||||
|
|
||||||
let fetchweburltext_meta = {
|
let fetchhtmltext_meta = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "fetch_web_url_text",
|
"name": "fetch_html_text",
|
||||||
"description": "Fetch the requested web url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
|
"description": "Fetch html content from given url through a proxy server and return its text content after stripping away the html tags as well as head, script, style, header, footer, nav blocks, in few seconds",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
"url":{
|
"url":{
|
||||||
"type":"string",
|
"type":"string",
|
||||||
"description":"url of the page that will be fetched from the internet and inturn unwanted stuff stripped from its contents to some extent"
|
"description":"url of the html page that needs to be fetched and inturn unwanted stuff stripped from its contents to some extent"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": ["url"]
|
"required": ["url"]
|
||||||
|
|
@ -165,35 +175,30 @@ let fetchweburltext_meta = {
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of the fetch web url text logic.
|
* Implementation of the fetch html text logic.
|
||||||
* Expects a simple minded proxy server to be running locally
|
* Expects the simple minded simpleproxy server to be running locally,
|
||||||
* * listening on a configured port
|
* providing service for htmltext path.
|
||||||
* * expecting http requests
|
|
||||||
* * with a query token named url wrt urltext path,
|
|
||||||
* which gives the actual url to fetch
|
|
||||||
* * strips out head as well as any script, style, header, footer, nav and so blocks in body
|
|
||||||
* before returning remaining body contents.
|
|
||||||
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
||||||
* @param {string} chatid
|
* @param {string} chatid
|
||||||
* @param {string} toolcallid
|
* @param {string} toolcallid
|
||||||
* @param {string} toolname
|
* @param {string} toolname
|
||||||
* @param {any} obj
|
* @param {any} obj
|
||||||
*/
|
*/
|
||||||
function fetchweburltext_run(chatid, toolcallid, toolname, obj) {
|
function fetchhtmltext_run(chatid, toolcallid, toolname, obj) {
|
||||||
// maybe filter out any key other than 'url' in obj
|
// maybe filter out any key other than 'url' in obj
|
||||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext');
|
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Setup fetch_web_url_text for tool calling
|
* Setup fetch_html_text for tool calling
|
||||||
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
* NOTE: Currently the logic is setup for the bundled simpleproxy.py
|
||||||
* @param {Object<string, Object<string, any>>} tcs
|
* @param {Object<string, Object<string, any>>} tcs
|
||||||
*/
|
*/
|
||||||
async function fetchweburltext_setup(tcs) {
|
async function fetchhtmltext_setup(tcs) {
|
||||||
return proxyserver_tc_setup('FetchWebUrlText', 'urltext', 'fetch_web_url_text', {
|
return proxyserver_tc_setup('FetchHtmlText', 'htmltext', 'fetch_html_text', {
|
||||||
"handler": fetchweburltext_run,
|
"handler": fetchhtmltext_run,
|
||||||
"meta": fetchweburltext_meta,
|
"meta": fetchhtmltext_meta,
|
||||||
"result": ""
|
"result": ""
|
||||||
}, tcs);
|
}, tcs);
|
||||||
}
|
}
|
||||||
|
|
@ -225,14 +230,7 @@ let searchwebtext_meta = {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of the search web text logic. Initial go.
|
* Implementation of the search web text logic. Initial go.
|
||||||
* Builds on urltext path of the bundled simpleproxy.py.
|
* Builds on htmltext path service of the bundled simpleproxy.py.
|
||||||
* Expects simpleproxy.py server to be running locally
|
|
||||||
* * listening on a configured port
|
|
||||||
* * expecting http requests
|
|
||||||
* * with a query token named url wrt urltext path,
|
|
||||||
* which gives the actual url to fetch
|
|
||||||
* * strips out head as well as any script, style, header, footer, nav and so blocks in body
|
|
||||||
* before returning remaining body contents.
|
|
||||||
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
* ALERT: Accesses a seperate/external web proxy/caching server, be aware and careful
|
||||||
* @param {string} chatid
|
* @param {string} chatid
|
||||||
* @param {string} toolcallid
|
* @param {string} toolcallid
|
||||||
|
|
@ -245,8 +243,8 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
|
||||||
searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
|
searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words));
|
||||||
delete(obj.words)
|
delete(obj.words)
|
||||||
obj['url'] = searchUrl
|
obj['url'] = searchUrl
|
||||||
let headers = { 'urltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) }
|
let headers = { 'htmltext-tag-drops': JSON.stringify(gMe.tools.searchDrops) }
|
||||||
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers);
|
return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'htmltext', headers);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -256,7 +254,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) {
|
||||||
* @param {Object<string, Object<string, any>>} tcs
|
* @param {Object<string, Object<string, any>>} tcs
|
||||||
*/
|
*/
|
||||||
async function searchwebtext_setup(tcs) {
|
async function searchwebtext_setup(tcs) {
|
||||||
return proxyserver_tc_setup('SearchWebText', 'urltext', 'search_web_text', {
|
return proxyserver_tc_setup('SearchWebText', 'htmltext', 'search_web_text', {
|
||||||
"handler": searchwebtext_run,
|
"handler": searchwebtext_run,
|
||||||
"meta": searchwebtext_meta,
|
"meta": searchwebtext_meta,
|
||||||
"result": ""
|
"result": ""
|
||||||
|
|
@ -418,7 +416,7 @@ export async function init(me) {
|
||||||
let tc_switch = {}
|
let tc_switch = {}
|
||||||
gMe = me
|
gMe = me
|
||||||
await fetchweburlraw_setup(tc_switch)
|
await fetchweburlraw_setup(tc_switch)
|
||||||
await fetchweburltext_setup(tc_switch)
|
await fetchhtmltext_setup(tc_switch)
|
||||||
await searchwebtext_setup(tc_switch)
|
await searchwebtext_setup(tc_switch)
|
||||||
await fetchpdftext_setup(tc_switch)
|
await fetchpdftext_setup(tc_switch)
|
||||||
await fetchxmlfiltered_setup(tc_switch)
|
await fetchxmlfiltered_setup(tc_switch)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue