From 451ef08432d1f7d3d6071d4006cbbeda21dcfbec Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Sun, 8 Mar 2026 11:17:02 -0500 Subject: [PATCH] common : gracefully handle incomplete output (#20191) * common : handle incomplete UTF-8 at end of input in PEG parser * cont : if reached end prematurely, emit needs_more_input to propagate partial output * cont: refactor peg parse context to add lenient flag * cont : remove partial flag, keep lenient flag --- common/chat-peg-parser.cpp | 9 ++- common/chat-peg-parser.h | 8 +-- common/chat.cpp | 12 ++-- common/peg-parser.cpp | 75 +++++++++----------- common/peg-parser.h | 37 +++++++--- tests/peg-parser/test-basic.cpp | 54 +++++++------- tests/peg-parser/test-json-parser.cpp | 12 ++-- tests/peg-parser/test-python-dict-parser.cpp | 6 +- tests/peg-parser/test-unicode.cpp | 25 +++---- tests/test-chat-auto-parser.cpp | 8 +-- tests/test-chat-peg-parser.cpp | 18 ++--- 11 files changed, 139 insertions(+), 125 deletions(-) diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index ef9dec5935..3448c4f5be 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -167,8 +167,8 @@ void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const co }); } -tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const { - common_peg_parse_context ctx(input, is_partial); +tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const { + common_peg_parse_context ctx(input, flags | extra_flags); auto parse_result = arena.parse(ctx); tag_based_peg_mapper mapper; @@ -179,11 +179,10 @@ tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & inp tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const { if (input.empty()) { - return parse_and_extract(input, false); + return parse_and_extract(input); } for (size_t i = 0; i < input.size(); i++) { - common_peg_parse_context ctx(input, false); - ctx.debug = debug; + common_peg_parse_context ctx(input, flags); auto parse_result = arena.parse(ctx, i); if (parse_result.success() || i == input.size() - 1) { tag_based_peg_mapper mapper; diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index e130ceea5f..fe4c1b648f 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -155,19 +155,19 @@ struct tagged_parse_result { struct tagged_peg_parser { common_peg_arena arena; - bool debug = false; + common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE; tagged_peg_parser & withDebug() { - debug = true; + flags |= COMMON_PEG_PARSE_FLAG_DEBUG; return *this; } tagged_peg_parser & withoutDebug() { - debug = false; + flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG; return *this; } - tagged_parse_result parse_and_extract(const std::string & input, bool is_partial = false) const; + tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const; tagged_parse_result parse_anywhere_and_extract(const std::string & input) const; }; diff --git a/common/chat.cpp b/common/chat.cpp index dff7879af0..d12802bd76 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1527,8 +1527,12 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str()); - common_peg_parse_context ctx(input, is_partial); - ctx.debug = params.debug; + common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT; + if (params.debug) { + flags |= COMMON_PEG_PARSE_FLAG_DEBUG; + } + + common_peg_parse_context ctx(input, flags); auto result = parser.parse(ctx); if (result.fail()) { @@ -1541,7 +1545,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars auto mapper = common_chat_peg_mapper(msg); mapper.from_ast(ctx.ast, result); - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str()); fflush(stderr); } @@ -1557,7 +1561,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars auto mapper = common_chat_peg_mapper(msg); mapper.from_ast(ctx.ast, result); - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str()); fflush(stderr); } diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index 48379f1ec8..81630b68a9 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -349,7 +349,7 @@ struct parser_executor { auto pos = start_pos; for (auto i = 0u; i < p.literal.size(); ++i) { if (pos >= ctx.input.size()) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -364,7 +364,7 @@ struct parser_executor { } common_peg_parse_result operator()(const common_peg_sequence_parser & p) { - if (ctx.debug) { + if (ctx.is_debug()) { LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos, debug_input_snippet(start_pos).c_str(), p.children.size()); } @@ -375,26 +375,19 @@ struct parser_executor { for (size_t i = 0; i < p.children.size(); i++) { const auto & child_id = p.children[i]; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str()); } auto result = arena.parse(child_id, ctx, pos); - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i, common_peg_parse_result_type_name(result.type), result.start, result.end); } if (result.fail()) { ctx.parse_depth--; - if (ctx.is_partial && result.end >= ctx.input.size()) { - if (ctx.debug) { - fprintf(stderr, "%sSEQ -> NEED_MORE (child failed at end)\n", debug_indent().c_str()); - } - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, - std::move(nodes)); - } - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str()); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end); @@ -406,7 +399,7 @@ struct parser_executor { if (result.need_more_input()) { ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str()); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes)); @@ -416,14 +409,14 @@ struct parser_executor { } ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes)); } common_peg_parse_result operator()(const common_peg_choice_parser & p) { - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos, debug_input_snippet(start_pos).c_str(), p.children.size()); } @@ -432,17 +425,17 @@ struct parser_executor { auto pos = start_pos; for (size_t i = 0; i < p.children.size(); i++) { const auto & child_id = p.children[i]; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str()); } auto result = arena.parse(child_id, ctx, pos); - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, common_peg_parse_result_type_name(result.type)); } if (!result.fail()) { ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(), common_peg_parse_result_type_name(result.type), i); } @@ -451,14 +444,14 @@ struct parser_executor { } ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str()); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } common_peg_parse_result operator()(const common_peg_repetition_parser & p) { - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos, debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count); } @@ -471,7 +464,7 @@ struct parser_executor { // Try to match up to max_count times (or unlimited if max_count is -1) while (p.max_count == -1 || match_count < p.max_count) { if (pos >= ctx.input.size()) { - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count); } break; @@ -479,7 +472,7 @@ struct parser_executor { auto result = arena.parse(p.child, ctx, pos); - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count, common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size()); fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str()); @@ -488,7 +481,7 @@ struct parser_executor { if (result.success()) { // Prevent infinite loop on empty matches if (result.end == pos) { - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%s REPEAT: empty match, stopping\n", debug_indent().c_str()); } break; @@ -509,7 +502,7 @@ struct parser_executor { } ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count, nodes.size()); } @@ -517,7 +510,7 @@ struct parser_executor { } // Child failed - stop trying - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str()); } break; @@ -526,14 +519,14 @@ struct parser_executor { // Check if we got enough matches if (p.min_count > 0 && match_count < p.min_count) { ctx.parse_depth--; - if (pos >= ctx.input.size() && ctx.is_partial) { - if (ctx.debug) { + if (pos >= ctx.input.size() && ctx.is_lenient()) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count, p.min_count); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes)); } - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count, p.min_count); } @@ -541,7 +534,7 @@ struct parser_executor { } ctx.parse_depth--; - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count, nodes.size()); } @@ -576,7 +569,7 @@ struct parser_executor { auto result = common_parse_utf8_codepoint(ctx.input, start_pos); if (result.status == utf8_parse_result::INCOMPLETE) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos); @@ -615,7 +608,7 @@ struct parser_executor { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); } // Not enough matches yet - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -656,7 +649,7 @@ struct parser_executor { // Check if we got enough matches if (match_count < p.min_count) { - if (pos >= ctx.input.size() && ctx.is_partial) { + if (pos >= ctx.input.size() && ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); @@ -668,7 +661,7 @@ struct parser_executor { static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) { ++pos; // consume '\' if (pos >= ctx.input.size()) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos); @@ -698,7 +691,7 @@ struct parser_executor { ++pos; // consume 'u' for (int i = 0; i < 4; ++i) { if (pos >= ctx.input.size()) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos); @@ -732,7 +725,7 @@ struct parser_executor { auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos); if (utf8_result.status == utf8_parse_result::INCOMPLETE) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -747,7 +740,7 @@ struct parser_executor { } // Reached end without finding closing quote - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -774,7 +767,7 @@ struct parser_executor { auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos); if (utf8_result.status == utf8_parse_result::INCOMPLETE) { - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -789,7 +782,7 @@ struct parser_executor { } // Reached end without finding closing quote - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); } return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); @@ -807,7 +800,7 @@ struct parser_executor { if (utf8_result.status == utf8_parse_result::INCOMPLETE) { // Incomplete UTF-8 sequence - if (!ctx.is_partial) { + if (!ctx.is_lenient()) { // Input is complete but UTF-8 is incomplete = malformed return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); } @@ -837,7 +830,7 @@ struct parser_executor { last_valid_pos = pos; } - if (last_valid_pos == ctx.input.size() && ctx.is_partial) { + if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) { // Reached the end of a partial stream, there might still be more input that we need to consume. return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos); } @@ -876,7 +869,7 @@ struct parser_executor { common_peg_parse_result operator()(const common_peg_tag_parser & p) { // Parse the child - if (ctx.debug) { + if (ctx.is_debug()) { fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str()); } auto result = arena.parse(p.child, ctx, start_pos); diff --git a/common/peg-parser.h b/common/peg-parser.h index 57d4bcd8ea..9f81df2e9a 100644 --- a/common/peg-parser.h +++ b/common/peg-parser.h @@ -139,22 +139,43 @@ struct common_peg_parse_result { bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; } }; +enum common_peg_parse_flags { + COMMON_PEG_PARSE_FLAG_NONE = 0, + COMMON_PEG_PARSE_FLAG_LENIENT = 1 << 0, + COMMON_PEG_PARSE_FLAG_DEBUG = 1 << 1, +}; + +inline common_peg_parse_flags operator|(common_peg_parse_flags a, common_peg_parse_flags b) { + return static_cast(int(a) | int(b)); +} + +inline common_peg_parse_flags & operator|=(common_peg_parse_flags & a, common_peg_parse_flags b) { + return a = a | b; +} + +inline common_peg_parse_flags operator&(common_peg_parse_flags a, common_peg_parse_flags b) { + return static_cast(int(a) & int(b)); +} + +inline common_peg_parse_flags operator~(common_peg_parse_flags a) { + return static_cast(~int(a)); +} + struct common_peg_parse_context { std::string input; - bool is_partial; - bool debug = false; // Enable debug output for parser tracing + common_peg_parse_flags flags; common_peg_ast_arena ast; int parse_depth; - common_peg_parse_context() - : is_partial(false), parse_depth(0) {} + common_peg_parse_context(common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE) + : flags(flags), parse_depth(0) {} - common_peg_parse_context(const std::string & input) - : input(input), is_partial(false), parse_depth(0) {} + common_peg_parse_context(const std::string & input, common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE) + : input(input), flags(flags), parse_depth(0) {} - common_peg_parse_context(const std::string & input, bool is_partial) - : input(input), is_partial(is_partial), parse_depth(0) {} + bool is_lenient() const { return flags & COMMON_PEG_PARSE_FLAG_LENIENT; } + bool is_debug() const { return flags & COMMON_PEG_PARSE_FLAG_DEBUG; } }; class common_peg_arena; diff --git a/tests/peg-parser/test-basic.cpp b/tests/peg-parser/test-basic.cpp index 872f16a78d..b6af61491d 100644 --- a/tests/peg-parser/test-basic.cpp +++ b/tests/peg-parser/test-basic.cpp @@ -120,7 +120,7 @@ void test_basic(testing & t) { return p.literal("hello") + p.optional(p.literal(" world")); }); - auto ctx = common_peg_parse_context("hello", false); + auto ctx = common_peg_parse_context("hello"); auto result = parser.parse(ctx); t.assert_equal("optional_absent", true, result.success()); t.assert_equal("optional_absent_end", 5u, result.end); @@ -132,7 +132,7 @@ void test_basic(testing & t) { return p.literal("hello") + p.optional(p.literal(" world")); }); - auto ctx = common_peg_parse_context("hello ", true); + auto ctx = common_peg_parse_context("hello ", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("partial_match_need_more", true, result.need_more_input()); }); @@ -215,7 +215,7 @@ void test_basic(testing & t) { t.test("sequence_partial_match_1", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("") + p.literal(""); }); - auto ctx = common_peg_parse_context("") + p.literal(""); }); - auto ctx = common_peg_parse_context("") + p.literal(""); }); - auto ctx = common_peg_parse_context("I am common_chat_combinator_parser", true); + auto ctx = common_peg_parse_context("I am common_chat_combinator_parser", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("sequence_no_match", true, result.fail()); }); @@ -260,7 +260,7 @@ void test_basic(testing & t) { t.test("choices_partial_match_1", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); }); - auto ctx = common_peg_parse_context("opt", true); + auto ctx = common_peg_parse_context("opt", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("choices_partial_match_1", true, result.need_more_input()); }); @@ -270,7 +270,7 @@ void test_basic(testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); }); - auto ctx = common_peg_parse_context("choice", true); + auto ctx = common_peg_parse_context("choice", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("choices_partial_match_2", true, result.need_more_input()); }); @@ -279,7 +279,7 @@ void test_basic(testing & t) { t.test("choices_full_match_1", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); }); - auto ctx = common_peg_parse_context("first", false); + auto ctx = common_peg_parse_context("first"); auto result = parser.parse(ctx); t.assert_equal("choices_full_match_1", true, result.success()); }); @@ -288,7 +288,7 @@ void test_basic(testing & t) { t.test("choices_full_match_2", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); }); - auto ctx = common_peg_parse_context("beta", false); + auto ctx = common_peg_parse_context("beta"); auto result = parser.parse(ctx); t.assert_equal("choices_full_match_2", true, result.success()); }); @@ -297,7 +297,7 @@ void test_basic(testing & t) { t.test("choices_no_match", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); }); - auto ctx = common_peg_parse_context("best", false); + auto ctx = common_peg_parse_context("best"); auto result = parser.parse(ctx); t.assert_equal("choices_no_match", true, result.fail()); }); @@ -306,7 +306,7 @@ void test_basic(testing & t) { t.test("zero_or_more_partial_match_1", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); }); - auto ctx = common_peg_parse_context("a", true); + auto ctx = common_peg_parse_context("a", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input()); }); @@ -315,7 +315,7 @@ void test_basic(testing & t) { t.test("zero_or_more_partial_match_2", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); }); - auto ctx = common_peg_parse_context("xyx", true); + auto ctx = common_peg_parse_context("xyx", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input()); }); @@ -324,7 +324,7 @@ void test_basic(testing & t) { t.test("zero_or_more_full_match", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); }); - auto ctx = common_peg_parse_context("test", false); + auto ctx = common_peg_parse_context("test"); auto result = parser.parse(ctx); t.assert_equal("zero_or_more_full_match", true, result.success()); }); @@ -333,7 +333,7 @@ void test_basic(testing & t) { t.test("one_or_more_partial_match_1", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); }); - auto ctx = common_peg_parse_context("rep", true); + auto ctx = common_peg_parse_context("rep", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input()); }); @@ -342,7 +342,7 @@ void test_basic(testing & t) { t.test("one_or_more_partial_match_2", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); }); - auto ctx = common_peg_parse_context("aba", true); + auto ctx = common_peg_parse_context("aba", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input()); }); @@ -351,7 +351,7 @@ void test_basic(testing & t) { t.test("one_or_more_full_match", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); }); - auto ctx = common_peg_parse_context("single", false); + auto ctx = common_peg_parse_context("single"); auto result = parser.parse(ctx); t.assert_equal("one_or_more_full_match", true, result.success()); }); @@ -360,7 +360,7 @@ void test_basic(testing & t) { t.test("one_or_more_no_match", [&](testing & t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); }); - auto ctx = common_peg_parse_context("success", false); + auto ctx = common_peg_parse_context("success"); auto result = parser.parse(ctx); t.assert_equal("one_or_more_no_match", true, result.fail()); }); @@ -376,7 +376,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("1", false); + common_peg_parse_context ctx("1"); auto result = value_parser.parse(ctx); t.assert_equal("result_is_success", true, result.success()); @@ -390,7 +390,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("[1]", false); + common_peg_parse_context ctx("[1]"); auto result = value_parser.parse(ctx); t.assert_equal("result_is_success", true, result.success()); @@ -404,7 +404,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("[[2]]", false); + common_peg_parse_context ctx("[[2]]"); auto result = value_parser.parse(ctx); t.assert_equal("result_is_success", true, result.success()); @@ -418,7 +418,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("[[[3]]]", false); + common_peg_parse_context ctx("[[[3]]]"); auto result = value_parser.parse(ctx); t.assert_equal("result_is_success", true, result.success()); @@ -432,7 +432,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("[[", true); + common_peg_parse_context ctx("[[", COMMON_PEG_PARSE_FLAG_LENIENT); auto result = value_parser.parse(ctx); t.assert_equal("result_is_need_more_input", true, result.need_more_input()); @@ -446,7 +446,7 @@ void test_basic(testing & t) { return p.rule("value", p.ref("number") | p.ref("list")); }); - common_peg_parse_context ctx("[a]", false); + common_peg_parse_context ctx("[a]"); auto result = value_parser.parse(ctx); t.assert_equal("result_is_fail", true, result.fail()); @@ -458,8 +458,8 @@ void test_basic(testing & t) { return p.marker(); }); - common_peg_parse_context ctx_square("[marker]", false); - common_peg_parse_context ctx_sharp("", false); + common_peg_parse_context ctx_square("[marker]"); + common_peg_parse_context ctx_sharp(""); auto result_square = bracket_parser.parse(ctx_square); auto result_sharp = bracket_parser.parse(ctx_sharp); diff --git a/tests/peg-parser/test-json-parser.cpp b/tests/peg-parser/test-json-parser.cpp index 48351cd66f..5dd00115ce 100644 --- a/tests/peg-parser/test-json-parser.cpp +++ b/tests/peg-parser/test-json-parser.cpp @@ -46,7 +46,7 @@ void test_json_parser(testing &t) { auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); std::string input = R"({"name": "test", "value": )"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = json.parse(ctx); @@ -58,7 +58,7 @@ void test_json_parser(testing &t) { auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); std::string input = R"([1, 2, 3, )"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = json.parse(ctx); @@ -70,7 +70,7 @@ void test_json_parser(testing &t) { auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); std::string input = R"({"data": {"nested": )"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = json.parse(ctx); @@ -84,7 +84,7 @@ void test_json_parser(testing &t) { t.test("success", [&](testing &t) { std::string input = R"("name": "bob")"; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); @@ -92,7 +92,7 @@ void test_json_parser(testing &t) { t.test("partial", [&](testing &t) { std::string input = R"("name": "bo)"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_true("need more input", result.need_more_input()); @@ -100,7 +100,7 @@ void test_json_parser(testing &t) { t.test("failed", [&](testing &t) { std::string input = R"([])"; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); t.assert_true("fail", result.fail()); diff --git a/tests/peg-parser/test-python-dict-parser.cpp b/tests/peg-parser/test-python-dict-parser.cpp index d9946a4916..18e7d901b8 100644 --- a/tests/peg-parser/test-python-dict-parser.cpp +++ b/tests/peg-parser/test-python-dict-parser.cpp @@ -85,7 +85,7 @@ void test_python_dict_parser(testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); }); std::string input = "{'name': 'test', 'value': "; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); @@ -97,7 +97,7 @@ void test_python_dict_parser(testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.python_value(); }); std::string input = "{'name': 'test"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); @@ -229,7 +229,7 @@ void test_python_dict_parser(testing &t) { t.test("incomplete string", [&](testing &t) { std::string input = "'hello"; - common_peg_parse_context ctx(input, true); + common_peg_parse_context ctx(input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); t.assert_true("need_more_input", result.need_more_input()); diff --git a/tests/peg-parser/test-unicode.cpp b/tests/peg-parser/test-unicode.cpp index 19d9b9e41c..9cbdb0d387 100644 --- a/tests/peg-parser/test-unicode.cpp +++ b/tests/peg-parser/test-unicode.cpp @@ -58,7 +58,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); // Assert result type matches @@ -101,7 +101,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); // Assert result type matches @@ -142,7 +142,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); // Assert result type matches @@ -187,7 +187,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); // Assert result type matches @@ -225,7 +225,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -259,7 +259,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -293,7 +293,7 @@ void test_unicode(testing &t) { std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); t.test(test_name, [&](testing &t) { - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -330,7 +330,7 @@ void test_unicode(testing &t) { return p.sequence({p.json_string_content(), p.literal("\"")}); }); - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -367,7 +367,7 @@ void test_unicode(testing &t) { return p.json_string_content(); }); - common_peg_parse_context ctx(tc.input, true); + common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -390,9 +390,6 @@ void test_unicode(testing &t) { // Invalid continuation byte {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, - - // Overlong encoding (security issue) - {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, }; for (size_t i = 0; i < test_cases.size(); i++) { @@ -404,7 +401,7 @@ void test_unicode(testing &t) { return p.json_string_content(); }); - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); @@ -433,7 +430,7 @@ void test_unicode(testing &t) { return p.sequence({p.json_string_content(), p.literal("\"")}); }); - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); assert_result_equal(t, tc.expected_result, result.type); diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index f2364862c5..eaa57872a8 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -1478,7 +1478,7 @@ static void test_standard_json_tools_openai(testing & t) { R"({"id": "call_abc123", "function": {"name": "get_current_weather", "arguments": {"location": "NYC"}}})" ""; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); if (!t.assert_true("parse success", result.success())) { @@ -1524,7 +1524,7 @@ static void test_standard_json_tools_cohere(testing & t) { R"({"tool_call_id": 0, "tool_name": "get_current_weather", "parameters": {"location": "NYC", "unit": "celsius"}})" "]<|END_ACTION|>"; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); if (!t.assert_true("parse success", result.success())) { @@ -1570,7 +1570,7 @@ static void test_standard_json_tools_function_key(testing & t) { R"({"get_current_weather": {"id": "call-0001", "args": {"location": "NYC", "unit": "celsius"}}})" "]"; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); if (!t.assert_true("parse success", result.success())) { @@ -1845,7 +1845,7 @@ static void test_tagged_args_with_embedded_quotes(testing & t) { "\n" ""; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); if (!t.assert_true("parse success", result.success())) { diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp index 7626ca12db..112de1d555 100644 --- a/tests/test-chat-peg-parser.cpp +++ b/tests/test-chat-peg-parser.cpp @@ -361,7 +361,7 @@ static void test_example_native(testing & t) { t.log(line); } - common_peg_parse_context ctx(tc.input, false); + common_peg_parse_context ctx(tc.input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); @@ -458,7 +458,7 @@ static void test_example_qwen3_coder(testing & t) { for (auto it = tokens.begin(); it != tokens.end(); it++) { std::string in = std::accumulate(tokens.begin(), it + 1, std::string()); - common_peg_parse_context ctx(in, it + 1 < tokens.end()); + common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE); auto result = parser.parse(ctx); if (!t.assert_equal("not fail", false, result.fail())) { @@ -523,7 +523,7 @@ static void test_example_qwen3_non_coder(testing & t) { "\"fahrenheit\"}}" ""; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); @@ -556,7 +556,7 @@ static void test_example_qwen3_non_coder(testing & t) { for (auto it = tokens.begin(); it != tokens.end(); it++) { std::string in = std::accumulate(tokens.begin(), it + 1, std::string()); - common_peg_parse_context ctx(in, it + 1 < tokens.end()); + common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE); auto result = parser.parse(ctx); if (!t.assert_equal("not fail", false, result.fail())) { @@ -617,7 +617,7 @@ void test_command7_parser_compare(testing & t) { auto test_current = [&](const common_peg_arena & p, const std::string & input, bool is_partial, bool print_results) { - common_peg_parse_context ctx(input, is_partial); + common_peg_parse_context ctx(input, is_partial ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE); auto result = p.parse(ctx); common_chat_msg msg; @@ -780,7 +780,7 @@ static void test_prefix_tool_names(testing & t) { "" ""; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); @@ -814,7 +814,7 @@ static void test_prefix_tool_names(testing & t) { for (auto it = tokens.begin(); it != tokens.end(); it++) { std::string in = std::accumulate(tokens.begin(), it + 1, std::string()); - common_peg_parse_context ctx(in, it + 1 < tokens.end()); + common_peg_parse_context ctx(in, (it + 1 < tokens.end()) ? COMMON_PEG_PARSE_FLAG_LENIENT : COMMON_PEG_PARSE_FLAG_NONE); auto result = parser.parse(ctx); if (!t.assert_equal("not fail", false, result.fail())) { @@ -864,7 +864,7 @@ static void test_prefix_tool_names(testing & t) { "" ""; - common_peg_parse_context ctx(input, false); + common_peg_parse_context ctx(input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); @@ -931,7 +931,7 @@ static void test_tagged_peg_parser(testing & t) { return p.tag("prefix", p.until(":")) + ":" + p.tag("value", p.rest()) + p.end(); }); - auto result = parser.parse_and_extract("key:val", true); + auto result = parser.parse_and_extract("key:val", COMMON_PEG_PARSE_FLAG_LENIENT); t.assert_true("not fail", !result.result.fail()); t.assert_equal("prefix tag", "key", result.tags.at("prefix")); t.assert_equal("value tag", "val", result.tags.at("value"));