diff --git a/tools/server/public_simplechat/local.tools/webmagic.py b/tools/server/public_simplechat/local.tools/webmagic.py index 3d0c1bc48d..588e562359 100644 --- a/tools/server/public_simplechat/local.tools/webmagic.py +++ b/tools/server/public_simplechat/local.tools/webmagic.py @@ -221,13 +221,11 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): class TextXMLParser(html.parser.HTMLParser): """ A simple minded logic used to strip xml content of - * all the xml tags as well as - * all the contents belonging to below predefined tags like guid, enclosure, ... - + * unwanted tags and their contents. * this works properly only if the xml being processed has proper opening and ending tags around the area of interest. - This helps return a relatively clean textual representation of the xml file/content being parsed. + This can help return a cleaned up xml file. """ def __init__(self, tagDrops: list[str]): @@ -240,7 +238,9 @@ class TextXMLParser(html.parser.HTMLParser): self.insideTagDrops[tag] = False self.bCapture = False self.text = "" - self.prefix = [] + self.prefixTags = [] + self.prefix = "" + self.lastTrackedCB = "" def do_capture(self): """ @@ -252,18 +252,27 @@ class TextXMLParser(html.parser.HTMLParser): return True def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): - self.prefix.append(tag) + self.lastTrackedCB = "starttag" + self.prefixTags.append(tag) + self.prefix += "\t" + self.text += f"\n{self.prefix}<{tag}>" if tag in self.tagDrops: self.insideTagDrops[tag] = True def handle_endtag(self, tag: str): - self.prefix.pop() + if (self.lastTrackedCB == "endtag"): + self.text += f"\n{self.prefix}</{tag}>" + else: + self.text += f"</{tag}>" + self.lastTrackedCB = "endtag" + self.prefixTags.pop() + self.prefix = self.prefix[:-1] if tag in self.tagDrops: self.insideTagDrops[tag] = False def handle_data(self, data: str): if self.do_capture(): - self.text += f"{':'.join(self.prefix)}:{data}\n" + self.text += f"{data}" def 
handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): diff --git a/tools/server/public_simplechat/toolweb.mjs b/tools/server/public_simplechat/toolweb.mjs index 8a8405c5ba..7dbbe51a05 100644 --- a/tools/server/public_simplechat/toolweb.mjs +++ b/tools/server/public_simplechat/toolweb.mjs @@ -340,7 +340,7 @@ let fetchxmltext_meta = { "type": "function", "function": { "name": "fetch_xml_as_text", - "description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds", + "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds", "parameters": { "type": "object", "properties": {