Add a basic universal PEG parser wrapper with a tag-to-dict extractor

This commit is contained in:
Piotr Wilkin 2026-02-14 00:56:22 +01:00
parent 0884aad1c5
commit e501e1dec9
3 changed files with 105 additions and 0 deletions

View File

@ -177,6 +177,31 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
}
}
// Fills `tags` from the AST nodes that `arena.visit` reports for `result`.
//
// Every visited node with a non-empty tag contributes one entry: the key is
// the node's tag and the value is a copy of the node's text. Nodes without a
// tag are ignored. When the same tag is seen more than once, the value from
// the node visited last replaces the earlier one.
void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
    auto record_tag = [this](const common_peg_ast_node & visited) {
        if (visited.tag.empty()) {
            return;  // untagged node: nothing to record
        }
        // Copy the text into an owned string so the map does not alias the node.
        tags[visited.tag] = std::string(visited.text);
    };
    arena.visit(result, record_tag);
}
// Parses `input` with the stored arena and returns the parse result together
// with the tag -> text dictionary extracted from the resulting AST.
//
// `is_partial` is forwarded unchanged to the parse context.
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const {
    common_peg_parse_context parse_ctx(input, is_partial);
    auto outcome = arena.parse(parse_ctx);

    // Extract the tags before `outcome` is moved into the return value.
    tag_based_peg_mapper extractor;
    extractor.from_ast(parse_ctx.ast, outcome);

    tagged_parse_result extracted{ std::move(outcome), std::move(extractor.tags) };
    return extracted;
}
// Builds a tagged_peg_parser from a grammar-definition callback.
//
// `fn` receives a fresh builder and returns the parser to use as the root
// rule; the built arena is wrapped in the returned tagged_peg_parser.
tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
    common_peg_parser_builder grammar;
    grammar.set_root(fn(grammar));
    return tagged_peg_parser{ grammar.build() };
}
common_peg_parser common_chat_peg_builder::tag_with_safe_content(const std::string & tag_name,
const std::string & marker,
const common_peg_parser & p) {

View File

@ -138,6 +138,27 @@ inline common_peg_arena build_chat_peg_unified_parser(
return builder.build();
}
// Collects the text of tagged AST nodes into a tag -> text dictionary.
class tag_based_peg_mapper {
public:
// Extracted values keyed by tag name; filled in by from_ast().
std::map<std::string, std::string> tags;
// Populates `tags` from the AST held in `arena` for the given parse `result`.
void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
};
// Return value of tagged_peg_parser::parse_and_extract().
struct tagged_parse_result {
// The parse result produced by the underlying PEG parse.
common_peg_parse_result result;
// Tag name -> text dictionary extracted from the parsed AST.
std::map<std::string, std::string> tags;
};
// Wraps a PEG arena and offers a single parse-and-extract entry point.
struct tagged_peg_parser {
// The arena used to parse input.
common_peg_arena arena;
// Parses `input` and returns the parse result plus the extracted tag dictionary.
// `is_partial` defaults to false and is passed on to the parse context.
tagged_parse_result parse_and_extract(const std::string & input, bool is_partial = false) const;
};
// Builds a tagged_peg_parser from a callback that defines the grammar.
// `fn` is called with a builder and must return the parser to use as the root.
tagged_peg_parser build_tagged_peg_parser(
const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
class common_chat_peg_unified_mapper : public common_chat_peg_mapper {
std::optional<common_chat_tool_call> pending_tool_call; // Tool call waiting for name
common_chat_tool_call * current_tool = nullptr;

View File

@ -20,6 +20,7 @@ static void test_example_qwen3_coder(testing & t);
static void test_example_qwen3_non_coder(testing & t);
static void test_command7_parser_compare(testing & t);
static void test_prefix_tool_names(testing & t);
static void test_tagged_peg_parser(testing & t);
int main(int argc, char * argv[]) {
testing t(std::cout);
@ -37,6 +38,7 @@ int main(int argc, char * argv[]) {
t.test("qwen3 non-coder", test_example_qwen3_non_coder);
t.test("comparison", test_command7_parser_compare);
t.test("prefix tool names", test_prefix_tool_names);
t.test("tagged peg parser", test_tagged_peg_parser);
return t.summary();
}
@ -878,3 +880,60 @@ static void test_prefix_tool_names(testing & t) {
}
});
}
// Exercises build_tagged_peg_parser() and parse_and_extract() on small
// hand-written grammars, checking both the parse status and the tag dictionary.
static void test_tagged_peg_parser(testing & t) {
    // Two tagged spans separated by a single space.
    t.test("basic tag extraction", [&](testing & sub) {
        auto grammar = [](common_peg_parser_builder & b) {
            return b.tag("greeting", b.until(" ")) + " " + b.tag("name", b.rest()) + b.end();
        };
        auto tagged = build_tagged_peg_parser(grammar);
        const std::string input = "Hello World";
        auto parsed = tagged.parse_and_extract(input);
        sub.assert_true("success", parsed.result.success());
        sub.assert_equal("greeting tag", "Hello", parsed.tags.at("greeting"));
        sub.assert_equal("name tag", "World", parsed.tags.at("name"));
    });

    // The same tag used twice: the later occurrence is the one that is kept.
    t.test("duplicate tags overwrite", [&](testing & sub) {
        auto grammar = [](common_peg_parser_builder & b) {
            return b.tag("item", b.until(",")) + "," + b.tag("item", b.rest()) + b.end();
        };
        auto tagged = build_tagged_peg_parser(grammar);
        const std::string input = "first,second";
        auto parsed = tagged.parse_and_extract(input);
        sub.assert_true("success", parsed.result.success());
        sub.assert_equal("item tag", "second", parsed.tags.at("item"));
    });

    // A grammar without any tag yields an empty dictionary.
    t.test("no tags extracted", [&](testing & sub) {
        auto grammar = [](common_peg_parser_builder & b) {
            return b.rest() + b.end();
        };
        auto tagged = build_tagged_peg_parser(grammar);
        const std::string input = "Hello";
        auto parsed = tagged.parse_and_extract(input);
        sub.assert_true("success", parsed.result.success());
        sub.assert_equal("empty tags", 0u, parsed.tags.size());
    });

    // Header line and body split on the first newline.
    t.test("structured extraction", [&](testing & sub) {
        auto grammar = [](common_peg_parser_builder & b) {
            auto title_part = b.tag("header", b.until("\n"));
            auto body_part  = b.tag("body", b.rest());
            return title_part + "\n" + body_part + b.end();
        };
        auto tagged = build_tagged_peg_parser(grammar);
        const std::string input = "Title\nBody content here";
        auto parsed = tagged.parse_and_extract(input);
        sub.assert_true("success", parsed.result.success());
        sub.assert_equal("header", "Title", parsed.tags.at("header"));
        sub.assert_equal("body", "Body content here", parsed.tags.at("body"));
    });

    // Partial mode: the parse must not fail and both tags must still be extracted.
    t.test("partial parse", [&](testing & sub) {
        auto grammar = [](common_peg_parser_builder & b) {
            return b.tag("prefix", b.until(":")) + ":" + b.tag("value", b.rest()) + b.end();
        };
        auto tagged = build_tagged_peg_parser(grammar);
        const std::string input = "key:val";
        auto parsed = tagged.parse_and_extract(input, true);
        sub.assert_true("not fail", !parsed.result.fail());
        sub.assert_equal("prefix tag", "key", parsed.tags.at("prefix"));
        sub.assert_equal("value tag", "val", parsed.tags.at("value"));
    });
}