SimpleChatTC:XMLFiltered: Retain XML tags with selective dropping
Instead of prefixing each content with its tag hierarchy, retain the XML structure, while in parallel allowing unwanted tags and their contents to be dropped.
This commit is contained in:
parent
b8bb258dd5
commit
9ed1cf9886
|
|
@ -221,13 +221,11 @@ def handle_urltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
class TextXMLParser(html.parser.HTMLParser):
|
class TextXMLParser(html.parser.HTMLParser):
|
||||||
"""
|
"""
|
||||||
A simple minded logic used to strip xml content of
|
A simple minded logic used to strip xml content of
|
||||||
* all the xml tags as well as
|
* unwanted tags and their contents.
|
||||||
* all the contents belonging to below predefined tags like guid, enclosure, ...
|
|
||||||
|
|
||||||
* this works properly only if the xml being processed has proper opening and ending tags
|
* this works properly only if the xml being processed has proper opening and ending tags
|
||||||
around the area of interest.
|
around the area of interest.
|
||||||
|
|
||||||
This helps return a relatively clean textual representation of the xml file/content being parsed.
|
This can help return a cleaned up xml file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tagDrops: list[str]):
|
def __init__(self, tagDrops: list[str]):
|
||||||
|
|
@ -240,7 +238,9 @@ class TextXMLParser(html.parser.HTMLParser):
|
||||||
self.insideTagDrops[tag] = False
|
self.insideTagDrops[tag] = False
|
||||||
self.bCapture = False
|
self.bCapture = False
|
||||||
self.text = ""
|
self.text = ""
|
||||||
self.prefix = []
|
self.prefixTags = []
|
||||||
|
self.prefix = ""
|
||||||
|
self.lastTrackedCB = ""
|
||||||
|
|
||||||
def do_capture(self):
|
def do_capture(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -252,18 +252,27 @@ class TextXMLParser(html.parser.HTMLParser):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
|
||||||
self.prefix.append(tag)
|
self.lastTrackedCB = "starttag"
|
||||||
|
self.prefixTags.append(tag)
|
||||||
|
self.prefix += "\t"
|
||||||
|
self.text += f"\n{self.prefix}<{tag}>"
|
||||||
if tag in self.tagDrops:
|
if tag in self.tagDrops:
|
||||||
self.insideTagDrops[tag] = True
|
self.insideTagDrops[tag] = True
|
||||||
|
|
||||||
def handle_endtag(self, tag: str):
|
def handle_endtag(self, tag: str):
|
||||||
self.prefix.pop()
|
if (self.lastTrackedCB == "endtag"):
|
||||||
|
self.text += f"\n{self.prefix}</{tag}>"
|
||||||
|
else:
|
||||||
|
self.text += f"</{tag}>"
|
||||||
|
self.lastTrackedCB = "endtag"
|
||||||
|
self.prefixTags.pop()
|
||||||
|
self.prefix = self.prefix[:-1]
|
||||||
if tag in self.tagDrops:
|
if tag in self.tagDrops:
|
||||||
self.insideTagDrops[tag] = False
|
self.insideTagDrops[tag] = False
|
||||||
|
|
||||||
def handle_data(self, data: str):
|
def handle_data(self, data: str):
|
||||||
if self.do_capture():
|
if self.do_capture():
|
||||||
self.text += f"{':'.join(self.prefix)}:{data}\n"
|
self.text += f"{data}"
|
||||||
|
|
||||||
|
|
||||||
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
def handle_xmltext(ph: 'ProxyHandler', pr: urllib.parse.ParseResult):
|
||||||
|
|
|
||||||
|
|
@ -340,7 +340,7 @@ let fetchxmltext_meta = {
|
||||||
"type": "function",
|
"type": "function",
|
||||||
"function": {
|
"function": {
|
||||||
"name": "fetch_xml_as_text",
|
"name": "fetch_xml_as_text",
|
||||||
"description": "Fetch requested xml url through a proxy server and return its cleaned up text contents. Each content is prefixed with the xml tag heirarchy that it belongs to. Will take few seconds",
|
"description": "Fetch requested xml url through a proxy server that can optionally filter out unwanted tags and their contents. Will take few seconds",
|
||||||
"parameters": {
|
"parameters": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"properties": {
|
"properties": {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue