diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index d371fa736e..2e3d5811cc 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -9,7 +9,7 @@ import html.parser import debug import filemagic as mFile import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, cast if TYPE_CHECKING: from simpleproxy import ProxyHandler @@ -93,12 +93,21 @@ class TextHtmlParser(html.parser.HTMLParser): html content, that logic wont be triggered, so also such client side dynamic content wont be got. + Allows one to specify a list of tags and their corresponding id attributes, so that contents + within such specified blocks will be dropped. + + * this works properly only if the html being processed has proper opening and ending tags + around the area of interest. + * remember to specify non-overlapping tag blocks, if more than one is specified for dropping. + * this path is not tested, but should logically work + This helps return a relatively clean textual representation of the html file/content being parsed. 
""" - def __init__(self, tagDrops: dict): + def __init__(self, tagDrops: list[dict[str, Any]]): super().__init__() self.tagDrops = tagDrops + print(f"DBUG:TextHtmlParser:{self.tagDrops}") self.inside = { 'body': False, 'script': False, @@ -126,20 +135,27 @@ class TextHtmlParser(html.parser.HTMLParser): if tag in self.monitored: self.inside[tag] = True for tagMeta in self.tagDrops: - if tag != tagMeta.tag: + if tag != tagMeta['tag']: + continue + if (self.droptagCount > 0) and (self.droptagType == tag): + self.droptagCount += 1 continue for attr in attrs: if attr[0] != 'id': continue - if attr[1] == tagMeta.id: + if attr[1] == tagMeta['id']: self.droptagCount += 1 self.droptagType = tag + print(f"DBUG:THP:Start:Tag found [{tag}:{attr[1]}]...") def handle_endtag(self, tag: str): if tag in self.monitored: self.inside[tag] = False - if tag == self.droptagType: + if self.droptagType and (tag == self.droptagType): self.droptagCount -= 1 + if self.droptagCount == 0: + self.droptagType = None + print("DBUG:THP:End:Tag found...") if self.droptagCount < 0: self.droptagCount = 0 @@ -186,9 +202,9 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): # Extract Text tagDrops = ph.headers.get('urltext-tag-drops') if not tagDrops: - tagDrops = {} + tagDrops = [] else: - tagDrops = json.loads(tagDrops) + tagDrops = cast(list[dict[str,Any]], json.loads(tagDrops)) textHtml = TextHtmlParser(tagDrops) textHtml.feed(got.contentData) # Send back to client diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md index 9244a30645..13c2f57872 100644 --- a/tools/server/public_simplechat/readme.md +++ b/tools/server/public_simplechat/readme.md @@ -590,6 +590,13 @@ SimpleProxy updates * Helpers to fetch file from local file system or the web, transparently * Help check for needed modules before a particular service path is acknowledged as available through /aum service path +* urltext and related - logic to drop contents of specified 
tag with a given id + * allow its use for the web search tool flow + * set up for the default duckduckgo search result urltext plain text cleanup, and found to be working. + * this works properly only if the html being processed has proper opening and ending tags + around the area of interest. + * remember to specify non-overlapping tag blocks, if more than one is specified for dropping. + * this path is not tested, but should logically work Settings/Config default changes diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index f52aca4357..d4c2788340 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -259,7 +259,7 @@ function searchwebtext_run(chatid, toolcallid, toolname, obj) { searchUrl = searchUrl.replace("SEARCHWORDS", encodeURIComponent(obj.words)); delete(obj.words) obj['url'] = searchUrl - let headers = { 'urltext-tag-drops': get_gme().tools.searchDrops } + let headers = { 'urltext-tag-drops': JSON.stringify(get_gme().tools.searchDrops) } return proxyserver_get_anyargs(chatid, toolcallid, toolname, obj, 'urltext', headers); } }