From e397d3885c40fac0b91bc2784c2d948a259de8a4 Mon Sep 17 00:00:00 2001 From: Adrien Date: Sat, 28 Mar 2026 17:55:38 +0100 Subject: [PATCH] common/json-schema: fix: handle non-capturing groups (?:...) in JSON schema pattern converter (#21124) The regex-to-grammar converter in _visit_pattern() crashes with SIGSEGV when a JSON schema "pattern" field contains a non-capturing group (?:...). Root cause: when the parser sees '(' followed by '?', it pushes a warning but does not advance past '?:'. The recursive transform() call then interprets '?' as a quantifier and calls seq.back() on an empty vector, causing undefined behavior. This commonly occurs when serving OpenAI-compatible tool calls from clients that include complex regex patterns in their JSON schemas (e.g., date validation patterns like ^(?:(?:\d\d[2468][048]|...)-02-29|...)$). The fix: - Skip '?:' after '(' to treat non-capturing groups as regular groups - For unsupported syntax (?=, ?!, etc.), skip to matching ')' safely, handling escaped characters to avoid miscounting parenthesis depth - Adjust the ')' unbalanced-parentheses check using direct char comparisons instead of substr - Add test cases for non-capturing groups (C++ only, as the JS/Python implementations do not yet support this syntax) --- common/json-schema-to-grammar.cpp | 21 ++++++++++++-- tests/test-json-schema-to-grammar.cpp | 41 +++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index c7057d651d..e2c4d6ce22 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -416,15 +416,30 @@ private: i++; } else if (c == '(') { i++; - if (i < length) { - if (sub_pattern[i] == '?') { + if (i < length && sub_pattern[i] == '?') { + if (i + 1 < length && sub_pattern[i + 1] == ':') { + i += 2; // skip "?:" for non-capturing group, treat as regular group + } else { + // lookahead/lookbehind (?=, ?!, ?<=, ?<!) are not supported _warnings.push_back("Unsupported pattern syntax"); + // skip to the matching ')' so the unsupported group is ignored + int depth = 1; + while (i < length && depth > 0) { + if (sub_pattern[i] == '\\' && i + 1 < length) { + i += 2; // skip escaped character + } else { + if (sub_pattern[i] == '(') depth++; + else if (sub_pattern[i] == ')') depth--; + i++; + } + } + continue; } } seq.emplace_back("(" + to_rule(transform()) + ")", false); } else if (c == ')') { i++; - if (start > 0 && sub_pattern[start - 1] != '(') { + if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) { _errors.push_back("Unbalanced parentheses"); } return join_seq(); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index ac697c4d24..85584ef12b 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1525,6 +1525,47 @@ int main() { } }); + // C++ only tests (features not yet supported in JS/Python implementations) + { + fprintf(stderr, "#\n# Testing C++ only features\n#\n"); + auto run = [](const TestCase & tc) { + fprintf(stderr, "- %s\n", tc.name.c_str()); + try { + tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true)); + tc.verify_status(SUCCESS); + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "Error: %s\n", ex.what()); + tc.verify_status(FAILURE); + } + }; + + run({ + SUCCESS, + "regexp with non-capturing group", + R"""({ + "type": "string", + "pattern": "^(?:foo|bar)baz$" + })""", + R"""( + root ::= "\"" (("foo" | "bar") "baz") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + "regexp with nested non-capturing groups", + R"""({ + "type": "string", + "pattern": "^(?:(?:ab)+c)?d$" + })""", + R"""( + root ::= "\"" ((("ab")+ "c")? "d") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + } + if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); } else {