From c96f608d9861f7e8466bc1b6ac2ff4e3c6f96641 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Mon, 9 Mar 2026 18:29:21 -0500 Subject: [PATCH] common: consolidate PEG string parsers (#20263) * common : consolidate PEG string parsers * cont : fix json_string_content() --- common/chat-auto-parser-generator.cpp | 2 +- common/chat-peg-parser.cpp | 16 +- common/peg-parser.cpp | 248 +++++++++---------- common/peg-parser.h | 21 +- tests/peg-parser/test-python-dict-parser.cpp | 2 +- tests/peg-parser/test-unicode.cpp | 8 +- tests/test-chat-peg-parser.cpp | 4 +- 7 files changed, 142 insertions(+), 159 deletions(-) diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index 03dfb8f10f..1c74ad30d9 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -90,7 +90,7 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const // pre-register a json-string rule that accepts both quote styles. This must happen // before any call to p.json() so that all JSON parsing inherits the flexible rule. if (tools.format.uses_python_dicts) { - p.rule("json-string", [&]() { return p.choice({ p.double_quoted_string(), p.single_quoted_string() }); }); + p.rule("json-string", p.quoted_string()); } parser_build_context ctx(p, inputs); diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index e055447e0a..cbdf202f03 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -507,8 +507,8 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls( common_peg_parser arg_value_parser = eps(); auto string_value_parser = choice({ - literal("\"") + tool_arg_string_value(json_string_content()) + literal("\""), - literal("'") + tool_arg_string_value(json_string_content()) + literal("'") + literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""), + literal("'") + tool_arg_string_value(string_content('\'')) + literal("'") }); if (is_string_type) { @@ -577,7 +577,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key( if (!call_id_key.empty()) { auto id_parser = atomic( literal("\"" + call_id_key + "\"") + space() + literal(":") + space() + - literal("\"") + tool_id(json_string_content()) + literal("\"") + literal("\"") + tool_id(string_content('"')) + literal("\"") ); inner_fields.push_back(optional(id_parser + space() + optional(literal(",") + space()))); } @@ -586,7 +586,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key( auto gen_id_parser = atomic( literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() + choice({ - literal("\"") + tool_id(json_string_content()) + literal("\""), + literal("\"") + tool_id(string_content('"')) + literal("\""), tool_id(json_number()) }) ); @@ -675,7 +675,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys( if (id_spec.first.empty()) { auto id_parser = atomic( literal("\"" + call_id_key + "\"") + space() + literal(":") + space() + - literal("\"") + tool_id(json_string_content()) + literal("\"") + literal("\"") + tool_id(string_content('"')) + literal("\"") ); tool_parser_body = tool_parser_body + optional(id_parser + space() + literal(",") + space()); } @@ -687,7 +687,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys( auto gen_id_parser = atomic( literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() + choice({ - literal("\"") + tool_id(json_string_content()) + literal("\""), + literal("\"") + tool_id(string_content('"')) + literal("\""), tool_id(json_number()) }) ); @@ -736,7 +736,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys( id_parser = atomic( literal("\"" + call_id_key + "\"") + space() + literal(":") + space() + choice({ - literal("\"") + tool_id(json_string_content()) + literal("\""), + literal("\"") + tool_id(string_content('"')) + literal("\""), tool_id(json_number()) }) ); @@ -747,7 +747,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys( gen_id_parser = atomic( literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() + choice({ - literal("\"") + tool_id(json_string_content()) + literal("\""), + literal("\"") + tool_id(string_content('"')) + literal("\""), tool_id(json_number()) }) ); diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index 81630b68a9..a6d9a4c27c 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -658,7 +658,7 @@ struct parser_executor { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); } - static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) { + static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos, const char delimiter) { ++pos; // consume '\' if (pos >= ctx.input.size()) { if (!ctx.is_lenient()) { @@ -667,23 +667,14 @@ struct parser_executor { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos); } - switch (ctx.input[pos]) { - case '"': - case '\'': - case '\\': - case '/': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - ++pos; - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos); - case 'u': - return handle_unicode_escape(ctx, start, pos); - default: - // Invalid escape sequence - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); + char c = ctx.input[pos]; + if (c == delimiter || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') { + ++pos; + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos); + } else if (c == 'u') { + return handle_unicode_escape(ctx, start, pos); + } else { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); } } @@ -704,62 +695,20 @@ struct parser_executor { return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos); } - common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) { + common_peg_parse_result operator()(const common_peg_string_parser & p) { auto pos = start_pos; // Parse string content (without quotes) while (pos < ctx.input.size()) { char c = ctx.input[pos]; - if (c == '"') { - // Found closing quote - success (don't consume it) + if (c == p.delimiter) { + // Found closing delimiter - success (don't consume it) return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); } if (c == '\\') { - auto result = handle_escape_sequence(ctx, start_pos, pos); - if (!result.success()) { - return result; - } - } else { - auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos); - - if (utf8_result.status == utf8_parse_result::INCOMPLETE) { - if (!ctx.is_lenient()) { - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); - } - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); - } - - if (utf8_result.status == utf8_parse_result::INVALID) { - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); - } - - pos += utf8_result.bytes_consumed; - } - } - - // Reached end without finding closing quote - if (!ctx.is_lenient()) { - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); - } - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); - } - - common_peg_parse_result operator()(const common_peg_python_dict_string_parser & /* p */) { - auto pos = start_pos; - - // Parse string content (without quotes) - while (pos < ctx.input.size()) { - char c = ctx.input[pos]; - - if (c == '\'') { - // Found closing quote - success (don't consume it) - return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); - } - - if (c == '\\') { - auto result = handle_escape_sequence(ctx, start_pos, pos); + auto result = handle_escape_sequence(ctx, start_pos, pos, p.delimiter); if (!result.success()) { return result; } @@ -988,8 +937,7 @@ void common_peg_arena::resolve_refs() { std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v || + std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v) { @@ -1065,10 +1013,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)"; } return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")"; - } else if constexpr (std::is_same_v) { - return "JsonString()"; - } else if constexpr (std::is_same_v) { - return "PythonDictString()"; + } else if constexpr (std::is_same_v) { + return "String(" + std::string(1, p.delimiter) + ")"; } else if constexpr (std::is_same_v) { return "Until(" + string_join(p.delimiters, " | ") + ")"; } else if constexpr (std::is_same_v) { @@ -1281,47 +1227,25 @@ common_peg_arena common_peg_parser_builder::build() { // String primitives -common_peg_parser common_peg_parser_builder::json_string_content() { - return wrap(arena_.add_parser(common_peg_json_string_parser{})); -} - -common_peg_parser common_peg_parser_builder::single_quoted_string_content() { - return wrap(arena_.add_parser(common_peg_python_dict_string_parser{})); +common_peg_parser common_peg_parser_builder::string_content(char delimiter) { + return wrap(arena_.add_parser(common_peg_string_parser{delimiter})); } common_peg_parser common_peg_parser_builder::double_quoted_string() { - return rule("dq-string", - [this]() { return sequence({ literal("\""), json_string_content(), literal("\""), space() }); }); -} - -common_peg_parser common_peg_parser_builder::single_quoted_string() { - return rule("sq-string", - [this]() { return sequence({ literal("'"), single_quoted_string_content(), literal("'"), space() }); }); -} - -common_peg_parser common_peg_parser_builder::flexible_string() { - return rule("flexible-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); }); -} - -// Generic helpers for object/array structure - -common_peg_parser common_peg_parser_builder::generic_object(const std::string & name, - const common_peg_parser & string_parser, - const common_peg_parser & value_parser) { - return rule(name, [this, string_parser, value_parser]() { - auto ws = space(); - auto member = sequence({ string_parser, ws, literal(":"), ws, value_parser }); - auto members = sequence({ member, zero_or_more(sequence({ ws, literal(","), ws, member })) }); - return sequence({ literal("{"), ws, choice({ literal("}"), sequence({ members, ws, literal("}") }) }) }); + return rule("double-quoted-string", [this]() { + return sequence({literal("\""), string_content('"'), literal("\""), space()}); }); } -common_peg_parser common_peg_parser_builder::generic_array(const std::string & name, - const common_peg_parser & value_parser) { - return rule(name, [this, value_parser]() { - auto ws = space(); - auto elements = sequence({ value_parser, zero_or_more(sequence({ literal(","), ws, value_parser })) }); - return sequence({ literal("["), ws, choice({ literal("]"), sequence({ elements, ws, literal("]") }) }) }); +common_peg_parser common_peg_parser_builder::single_quoted_string() { + return rule("single-quoted-string", [this]() { + return sequence({literal("'"), string_content('\''), literal("'"), space()}); + }); +} + +common_peg_parser common_peg_parser_builder::quoted_string() { + return rule("quoted-string", [this]() { + return choice({double_quoted_string(), single_quoted_string()}); }); } @@ -1344,7 +1268,7 @@ common_peg_parser common_peg_parser_builder::json_number() { common_peg_parser common_peg_parser_builder::json_string() { return rule("json-string", [this]() { - return sequence({literal("\""), json_string_content(), literal("\""), space()}); + return sequence({literal("\""), string_content('"'), literal("\""), space()}); }); } @@ -1361,11 +1285,36 @@ common_peg_parser common_peg_parser_builder::json_null() { } common_peg_parser common_peg_parser_builder::json_object() { - return generic_object("json-object", json_string(), json()); + return rule("json-object", [this]() { + auto ws = space(); + auto member = sequence({json_string(), ws, literal(":"), ws, json()}); + auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))}); + return sequence({ + literal("{"), + ws, + choice({ + literal("}"), + sequence({members, ws, literal("}")}) + }), + ws + }); + }); } common_peg_parser common_peg_parser_builder::json_array() { - return generic_array("json-array", json()); + return rule("json-array", [this]() { + auto ws = space(); + auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))}); + return sequence({ + literal("["), + ws, + choice({ + literal("]"), + sequence({elements, ws, literal("]")}) + }), + ws + }); + }); } common_peg_parser common_peg_parser_builder::json() { @@ -1382,7 +1331,9 @@ common_peg_parser common_peg_parser_builder::json() { } common_peg_parser common_peg_parser_builder::python_string() { - return rule("python-string", [this]() { return choice({ double_quoted_string(), single_quoted_string() }); }); + return rule("python-string", [this]() { + return choice({double_quoted_string(), single_quoted_string()}); + }); } common_peg_parser common_peg_parser_builder::python_number() { @@ -1390,24 +1341,63 @@ common_peg_parser common_peg_parser_builder::python_number() { } common_peg_parser common_peg_parser_builder::python_bool() { - return rule("python-bool", [this]() { return sequence({ choice({ literal("True"), literal("False") }), space() }); }); + return rule("python-bool", [this]() { + return sequence({ + choice({literal("True"), literal("False")}), + space() + }); + }); } common_peg_parser common_peg_parser_builder::python_null() { - return rule("python-none", [this]() { return sequence({ literal("None"), space() }); }); + return rule("python-none", [this]() { + return sequence({literal("None"), space()}); + }); } common_peg_parser common_peg_parser_builder::python_dict() { - return generic_object("python-dict", python_string(), python_value()); + return rule("python-dict", [this]() { + auto ws = space(); + auto member = sequence({python_string(), ws, literal(":"), ws, python_value()}); + auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))}); + return sequence({ + literal("{"), + ws, + choice({ + literal("}"), + sequence({members, ws, literal("}")}) + }), + ws + }); + }); } common_peg_parser common_peg_parser_builder::python_array() { - return generic_array("python-array", python_value()); + return rule("python-array", [this]() { + auto ws = space(); + auto elements = sequence({python_value(), zero_or_more(sequence({literal(","), ws, python_value()}))}); + return sequence({ + literal("["), + ws, + choice({ + literal("]"), + sequence({elements, ws, literal("]")}) + }), + ws + }); + }); } common_peg_parser common_peg_parser_builder::python_value() { return rule("python-value", [this]() { - return choice({ python_dict(), python_array(), python_string(), python_number(), python_bool(), python_null() }); + return choice({ + python_dict(), + python_array(), + python_string(), + python_number(), + python_bool(), + python_null() + }); }); } @@ -1528,8 +1518,7 @@ static std::unordered_set collect_reachable_rules( std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v || - std::is_same_v) { + std::is_same_v) { // These parsers do not have any children } else if constexpr (std::is_same_v) { for (auto child : p.children) { @@ -1665,10 +1654,9 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo return result + "{" + std::to_string(p.min_count) + "}"; } return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}"; - } else if constexpr (std::is_same_v) { - return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)"; - } else if constexpr (std::is_same_v) { - return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)"; + } else if constexpr (std::is_same_v) { + const std::string delim(1, p.delimiter); + return R"(( [^)" + delim + R"(\\] | "\\" ( [)" + delim + R"(\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)"; } else if constexpr (std::is_same_v) { if (p.delimiters.empty()) { return ".*"; @@ -1798,10 +1786,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & {"min_count", p.min_count}, {"max_count", p.max_count} }; - } else if constexpr (std::is_same_v) { - return json{{"type", "json_string"}}; - } else if constexpr (std::is_same_v) { - return json{{ "type", "python_dict_string" }}; + } else if constexpr (std::is_same_v) { + return json{{"type", "string"}, {"delimiter", std::string(1, p.delimiter)}}; } else if constexpr (std::is_same_v) { return json{{"type", "until"}, {"delimiters", p.delimiters}}; } else if constexpr (std::is_same_v) { @@ -1928,11 +1914,15 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json } return parser; } - if (type == "json_string") { - return common_peg_json_string_parser{}; - } - if (type == "python_dict_string") { - return common_peg_python_dict_string_parser{}; + if (type == "string") { + if (!j.contains("delimiter")) { + throw std::runtime_error("string parser missing delimiter field."); + } + std::string delimiter = j["delimiter"]; + if (delimiter.empty()) { + throw std::runtime_error("string parser delimiter is empty."); + } + return common_peg_string_parser{delimiter[0]}; } if (type == "until") { if (!j.contains("delimiters") || !j["delimiters"].is_array()) { diff --git a/common/peg-parser.h b/common/peg-parser.h index 9f81df2e9a..31cdf9ec2d 100644 --- a/common/peg-parser.h +++ b/common/peg-parser.h @@ -231,8 +231,9 @@ struct common_peg_chars_parser { int max_count; // -1 for unbounded }; -struct common_peg_json_string_parser {}; -struct common_peg_python_dict_string_parser {}; +struct common_peg_string_parser { + char delimiter; +}; struct common_peg_until_parser { std::vector delimiters; @@ -280,8 +281,7 @@ using common_peg_parser_variant = std::variant< common_peg_any_parser, common_peg_space_parser, common_peg_chars_parser, - common_peg_json_string_parser, - common_peg_python_dict_string_parser, + common_peg_string_parser, common_peg_until_parser, common_peg_schema_parser, common_peg_rule_parser, @@ -340,10 +340,6 @@ class common_peg_parser_builder { common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); } common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); } - // Generic helpers for building object/array structures with configurable string/value parsers. - common_peg_parser generic_object(const std::string & name, const common_peg_parser & string_parser, const common_peg_parser & value_parser); - common_peg_parser generic_array(const std::string & name, const common_peg_parser & value_parser); - public: common_peg_parser_builder(); @@ -444,13 +440,10 @@ class common_peg_parser_builder { common_peg_parser single_quoted_string(); // Matches a string that accepts both double-quoted and single-quoted styles. - common_peg_parser flexible_string(); + common_peg_parser quoted_string(); - // Matches double-quoted string content without the surrounding quotes. - common_peg_parser json_string_content(); - - // Matches single-quoted string content without the surrounding quotes. - common_peg_parser single_quoted_string_content(); + // Matches string content without the surrounding delimiter. + common_peg_parser string_content(char delimiter); // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null. // value -> object | array | string | number | true | false | null diff --git a/tests/peg-parser/test-python-dict-parser.cpp b/tests/peg-parser/test-python-dict-parser.cpp index 18e7d901b8..1a549106b8 100644 --- a/tests/peg-parser/test-python-dict-parser.cpp +++ b/tests/peg-parser/test-python-dict-parser.cpp @@ -197,7 +197,7 @@ void test_python_dict_parser(testing &t) { // Test single-quoted string content parser directly t.test("single-quoted string content parser", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { - return p.sequence({ p.literal("'"), p.single_quoted_string_content(), p.literal("'"), p.space() }); + return p.sequence({ p.literal("'"), p.string_content('\''), p.literal("'"), p.space() }); }); t.test("simple string", [&](testing &t) { diff --git a/tests/peg-parser/test-unicode.cpp b/tests/peg-parser/test-unicode.cpp index 9cbdb0d387..24663d7017 100644 --- a/tests/peg-parser/test-unicode.cpp +++ b/tests/peg-parser/test-unicode.cpp @@ -327,7 +327,7 @@ void test_unicode(testing &t) { t.test(test_name, [&](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder& p) { - return p.sequence({p.json_string_content(), p.literal("\"")}); + return p.sequence({p.string_content('"'), p.literal("\"")}); }); common_peg_parse_context ctx(tc.input); @@ -364,7 +364,7 @@ void test_unicode(testing &t) { t.test(test_name, [&](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder& p) { - return p.json_string_content(); + return p.string_content('"'); }); common_peg_parse_context ctx(tc.input, COMMON_PEG_PARSE_FLAG_LENIENT); @@ -398,7 +398,7 @@ void test_unicode(testing &t) { t.test(test_name, [&](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder& p) { - return p.json_string_content(); + return p.string_content('"'); }); common_peg_parse_context ctx(tc.input); @@ -427,7 +427,7 @@ void test_unicode(testing &t) { t.test(test_name, [&](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder& p) { - return p.sequence({p.json_string_content(), p.literal("\"")}); + return p.sequence({p.string_content('"'), p.literal("\"")}); }); common_peg_parse_context ctx(tc.input); diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp index 112de1d555..dc8724be34 100644 --- a/tests/test-chat-peg-parser.cpp +++ b/tests/test-chat-peg-parser.cpp @@ -597,9 +597,9 @@ void test_command7_parser_compare(testing & t) { auto response = "<|START_RESPONSE|>" << p.content(p.until("<|END_RESPONSE|>")) << "<|END_RESPONSE|>"; - auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.json_string_content()) + "\""))); + auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.string_content('"')) + "\""))); auto tool_call_name = - p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.json_string_content()) + "\""))); + p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.string_content('"')) + "\""))); auto tool_call_args = "\"parameters\"" << (":" << p.tool_args(p.json())); auto tool_call_fields = p.rule("tool-call-fields", tool_call_id | tool_call_name | tool_call_args);