SimpleChatTC:XMLFiltered: Retain xml tags with selective dropping

Instead of prefixing each piece of content with its tag hierarchy, retain
the xml structure, while at the same time allowing unwanted tags and their
contents to be dropped.
This commit is contained in:
hanishkvc 2025-11-07 15:22:10 +05:30
parent b8bb258dd5
commit 9ed1cf9886
2 changed files with 18 additions and 9 deletions

View File

@ -221,13 +221,11 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
class TextXMLParser(html.parser.HTMLParser): class TextXMLParser(html.parser.HTMLParser):
""" """
A simple minded logic used to strip xml content of A simple minded logic used to strip xml content of
* all the xml tags as well as * unwanted tags and their contents.
* all the contents belonging to below predefined tags like guid, enclosure, ...
* this works properly only if the xml being processed has proper opening and ending tags * this works properly only if the xml being processed has proper opening and ending tags
around the area of interest. around the area of interest.
This helps return a relatively clean textual representation of the xml file/content being parsed. This can help return a cleaned up xml file.
""" """
def __init__(self, tagDrops: list[str]): def __init__(self, tagDrops: list[str]):
@ -240,7 +238,9 @@ class TextXMLParser(html.parser.HTMLParser):
self.insideTagDrops[tag] = False self.insideTagDrops[tag] = False
self.bCapture = False self.bCapture = False
self.text = "" self.text = ""
self.prefix = [] self.prefixTags = []
self.prefix = ""
self.lastTrackedCB = ""
def do_capture(self): def do_capture(self):
""" """
@ -252,18 +252,27 @@ class TextXMLParser(html.parser.HTMLParser):
return True return True
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]): def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
self.prefix.append(tag) self.lastTrackedCB = "starttag"
self.prefixTags.append(tag)
self.prefix += "\t"
self.text += f"\n{self.prefix}<{tag}>"
if tag in self.tagDrops: if tag in self.tagDrops:
self.insideTagDrops[tag] = True self.insideTagDrops[tag] = True
def handle_endtag(self, tag: str): def handle_endtag(self, tag: str):
self.prefix.pop() if (self.lastTrackedCB == "endtag"):
self.text += f"\n{self.prefix}</{tag}>"
else:
self.text += f"</{tag}>"
self.lastTrackedCB = "endtag"
self.prefixTags.pop()
self.prefix = self.prefix[:-1]
if tag in self.tagDrops: if tag in self.tagDrops:
self.insideTagDrops[tag] = False self.insideTagDrops[tag] = False
def handle_data(self, data: str): def handle_data(self, data: str):
if self.do_capture(): if self.do_capture():
self.text += f"{':'.join(self.prefix)}:{data}\n" self.text += f"{data}"
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult): def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):

View File

@ -340,7 +340,7 @@ let fetchxmltext_meta = {
"type": "function", "type": "function",
"function": { "function": {
"name": "fetch_xml_as_text", "name": "fetch_xml_as_text",
"description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds", "description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
"parameters": { "parameters": {
"type": "object", "type": "object",
"properties": { "properties": {