From e501e1dec917a9d41253551659b1b490cc396f1c Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Sat, 14 Feb 2026 00:56:22 +0100
Subject: [PATCH] Basic universal PEG parser wrapper with tag-to-dict based extractor

---
Review notes (free text placed after the "---" separator and before the
diffstat; it documents the hunks below and is not patch content):

  What the hunks add, as visible in the code itself:
    * tag_based_peg_mapper::from_ast() visits the AST reachable from a parse
      result and, for every node whose tag is non-empty, stores a copy of the
      node text in `tags` under that tag. Assignment through operator[] means
      a later node with the same tag replaces the earlier entry; the
      "duplicate tags overwrite" test expects exactly that.
    * tagged_peg_parser::parse_and_extract() builds a parse context from the
      input and the is_partial flag, parses it with the stored arena, runs a
      tag_based_peg_mapper over ctx.ast and returns the parse result together
      with the collected tag map.
    * build_tagged_peg_parser() creates a common_peg_parser_builder, sets its
      root to the parser returned by the callback and wraps builder.build().

  NOTE(review): the template arguments of std::map, std::function and
  std::optional, and the indentation inside each line, were reconstructed from
  how the names are used in these hunks - confirm against the original commit.
  If context lines do not match the tree, try `git apply --ignore-whitespace`.
  NOTE(review): the From: header carries no e-mail address in this copy;
  restore it before feeding the patch to `git am`.

 common/chat-peg-parser.cpp     | 25 ++++++++++++++
 common/chat-peg-parser.h       | 21 ++++++++++++
 tests/test-chat-peg-parser.cpp | 59 ++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)

diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp
index 6e58dc6761..039e52177c 100644
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -177,6 +177,31 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
     }
 }
 
+void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+    arena.visit(result, [this](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) {
+            tags[node.tag] = std::string(node.text);
+        }
+    });
+}
+
+tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const {
+    common_peg_parse_context ctx(input, is_partial);
+    auto parse_result = arena.parse(ctx);
+
+    tag_based_peg_mapper mapper;
+    mapper.from_ast(ctx.ast, parse_result);
+
+    return { std::move(parse_result), std::move(mapper.tags) };
+}
+
+tagged_peg_parser build_tagged_peg_parser(
+    const std::function<common_peg_parser(common_peg_parser_builder &)> & fn) {
+    common_peg_parser_builder builder;
+    builder.set_root(fn(builder));
+    return { builder.build() };
+}
+
 common_peg_parser common_chat_peg_builder::tag_with_safe_content(const std::string & tag_name,
                                                                  const std::string & marker,
                                                                  const common_peg_parser & p) {
diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h
index c0392f0c5d..6219c819d6 100644
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -138,6 +138,27 @@ inline common_peg_arena build_chat_peg_unified_parser(
     return builder.build();
 }
 
+class tag_based_peg_mapper {
+  public:
+    std::map<std::string, std::string> tags;
+
+    void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+};
+
+struct tagged_parse_result {
+    common_peg_parse_result result;
+    std::map<std::string, std::string> tags;
+};
+
+struct tagged_peg_parser {
+    common_peg_arena arena;
+
+    tagged_parse_result parse_and_extract(const std::string & input, bool is_partial = false) const;
+};
+
+tagged_peg_parser build_tagged_peg_parser(
+    const std::function<common_peg_parser(common_peg_parser_builder &)> & fn);
+
 class common_chat_peg_unified_mapper : public common_chat_peg_mapper {
     std::optional<common_chat_tool_call> pending_tool_call;  // Tool call waiting for name
     common_chat_tool_call * current_tool = nullptr;
diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp
index d59880e3dc..95a989e6f8 100644
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@@ -20,6 +20,7 @@ static void test_example_qwen3_coder(testing & t);
 static void test_example_qwen3_non_coder(testing & t);
 static void test_command7_parser_compare(testing & t);
 static void test_prefix_tool_names(testing & t);
+static void test_tagged_peg_parser(testing & t);
 
 int main(int argc, char * argv[]) {
     testing t(std::cout);
@@ -37,6 +38,7 @@ int main(int argc, char * argv[]) {
     t.test("qwen3 non-coder", test_example_qwen3_non_coder);
     t.test("comparison", test_command7_parser_compare);
     t.test("prefix tool names", test_prefix_tool_names);
+    t.test("tagged peg parser", test_tagged_peg_parser);
 
     return t.summary();
 }
@@ -878,3 +880,60 @@ static void test_prefix_tool_names(testing & t) {
         }
     });
 }
+
+static void test_tagged_peg_parser(testing & t) {
+    t.test("basic tag extraction", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("greeting", p.until(" ")) + " " + p.tag("name", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Hello World");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("greeting tag", "Hello", result.tags.at("greeting"));
+        t.assert_equal("name tag", "World", result.tags.at("name"));
+    });
+
+    t.test("duplicate tags overwrite", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("item", p.until(",")) + "," + p.tag("item", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("first,second");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("item tag", "second", result.tags.at("item"));
+    });
+
+    t.test("no tags extracted", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.rest() + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Hello");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("empty tags", 0u, result.tags.size());
+    });
+
+    t.test("structured extraction", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            auto header = p.tag("header", p.until("\n"));
+            auto body = p.tag("body", p.rest());
+            return header + "\n" + body + p.end();
+        });
+
+        auto result = parser.parse_and_extract("Title\nBody content here");
+        t.assert_true("success", result.result.success());
+        t.assert_equal("header", "Title", result.tags.at("header"));
+        t.assert_equal("body", "Body content here", result.tags.at("body"));
+    });
+
+    t.test("partial parse", [&](testing & t) {
+        auto parser = build_tagged_peg_parser([](common_peg_parser_builder & p) {
+            return p.tag("prefix", p.until(":")) + ":" + p.tag("value", p.rest()) + p.end();
+        });
+
+        auto result = parser.parse_and_extract("key:val", true);
+        t.assert_true("not fail", !result.result.fail());
+        t.assert_equal("prefix tag", "key", result.tags.at("prefix"));
+        t.assert_equal("value tag", "val", result.tags.at("value"));
+    });
+}