common/json-schema: fix: handle non-capturing groups (?:...) in JSON schema pattern converter (#21124)
The regex-to-grammar converter in _visit_pattern() crashes with SIGSEGV
when a JSON schema "pattern" field contains a non-capturing group (?:...).
Root cause: when the parser sees '(' followed by '?', it pushes a warning
but does not advance past '?:'. The recursive transform() call then
interprets '?' as a quantifier and calls seq.back() on an empty vector,
causing undefined behavior.
This commonly occurs when serving OpenAI-compatible tool calls from
clients that include complex regex patterns in their JSON schemas (e.g.,
date validation patterns like ^(?:(?:\d\d[2468][048]|...)-02-29|...)$).
The fix:
- Skip '?:' after '(' to treat non-capturing groups as regular groups
- For unsupported syntax (?=, ?!, etc.), skip to matching ')' safely,
handling escaped characters to avoid miscounting parenthesis depth
- Adjust the ')' unbalanced-parentheses check using direct char
comparisons instead of substr
- Add test cases for non-capturing groups (C++ only, as the JS/Python
implementations do not yet support this syntax)
This commit is contained in:
parent
e6f2ec01ff
commit
e397d3885c
|
|
@ -416,15 +416,30 @@ private:
|
|||
i++;
|
||||
} else if (c == '(') {
|
||||
i++;
|
||||
if (i < length) {
|
||||
if (sub_pattern[i] == '?') {
|
||||
if (i < length && sub_pattern[i] == '?') {
|
||||
if (i + 1 < length && sub_pattern[i + 1] == ':') {
|
||||
i += 2; // skip "?:" for non-capturing group, treat as regular group
|
||||
} else {
|
||||
// lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
|
||||
_warnings.push_back("Unsupported pattern syntax");
|
||||
// skip to matching ')' to avoid UB on empty seq
|
||||
int depth = 1;
|
||||
while (i < length && depth > 0) {
|
||||
if (sub_pattern[i] == '\\' && i + 1 < length) {
|
||||
i += 2; // skip escaped character
|
||||
} else {
|
||||
if (sub_pattern[i] == '(') depth++;
|
||||
else if (sub_pattern[i] == ')') depth--;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||
} else if (c == ')') {
|
||||
i++;
|
||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||
if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
|
||||
_errors.push_back("Unbalanced parentheses");
|
||||
}
|
||||
return join_seq();
|
||||
|
|
|
|||
|
|
@ -1525,6 +1525,47 @@ int main() {
|
|||
}
|
||||
});
|
||||
|
||||
// C++ only tests (features not yet supported in JS/Python implementations)
|
||||
{
|
||||
fprintf(stderr, "#\n# Testing C++ only features\n#\n");
|
||||
auto run = [](const TestCase & tc) {
|
||||
fprintf(stderr, "- %s\n", tc.name.c_str());
|
||||
try {
|
||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
|
||||
tc.verify_status(SUCCESS);
|
||||
} catch (const std::invalid_argument & ex) {
|
||||
fprintf(stderr, "Error: %s\n", ex.what());
|
||||
tc.verify_status(FAILURE);
|
||||
}
|
||||
};
|
||||
|
||||
run({
|
||||
SUCCESS,
|
||||
"regexp with non-capturing group",
|
||||
R"""({
|
||||
"type": "string",
|
||||
"pattern": "^(?:foo|bar)baz$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" (("foo" | "bar") "baz") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""",
|
||||
});
|
||||
|
||||
run({
|
||||
SUCCESS,
|
||||
"regexp with nested non-capturing groups",
|
||||
R"""({
|
||||
"type": "string",
|
||||
"pattern": "^(?:(?:ab)+c)?d$"
|
||||
})""",
|
||||
R"""(
|
||||
root ::= "\"" ((("ab")+ "c")? "d") "\"" space
|
||||
space ::= | " " | "\n"{1,2} [ \t]{0,20}
|
||||
)""",
|
||||
});
|
||||
}
|
||||
|
||||
if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
|
||||
fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in New Issue