diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index c7057d651d..e2c4d6ce22 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -416,15 +416,30 @@ private: i++; } else if (c == '(') { i++; - if (i < length) { - if (sub_pattern[i] == '?') { + if (i < length && sub_pattern[i] == '?') { + if (i + 1 < length && sub_pattern[i + 1] == ':') { + i += 2; // skip "?:" for non-capturing group, treat as regular group + } else { + // lookahead/lookbehind (?=, ?!, ?<=, ? 0) { + if (sub_pattern[i] == '\\' && i + 1 < length) { + i += 2; // skip escaped character + } else { + if (sub_pattern[i] == '(') depth++; + else if (sub_pattern[i] == ')') depth--; + i++; + } + } + continue; } } seq.emplace_back("(" + to_rule(transform()) + ")", false); } else if (c == ')') { i++; - if (start > 0 && sub_pattern[start - 1] != '(') { + if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) { _errors.push_back("Unbalanced parentheses"); } return join_seq(); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index ac697c4d24..85584ef12b 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1525,6 +1525,47 @@ int main() { } }); + // C++ only tests (features not yet supported in JS/Python implementations) + { + fprintf(stderr, "#\n# Testing C++ only features\n#\n"); + auto run = [](const TestCase & tc) { + fprintf(stderr, "- %s\n", tc.name.c_str()); + try { + tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true)); + tc.verify_status(SUCCESS); + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "Error: %s\n", ex.what()); + tc.verify_status(FAILURE); + } + }; + + run({ + SUCCESS, + "regexp with non-capturing group", + R"""({ + "type": "string", + "pattern": "^(?:foo|bar)baz$" + })""", + R"""( + root ::= "\"" (("foo" | "bar") "baz") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + + run({ + SUCCESS, + "regexp with nested non-capturing groups", + R"""({ + "type": "string", + "pattern": "^(?:(?:ab)+c)?d$" + })""", + R"""( + root ::= "\"" ((("ab")+ "c")? "d") "\"" space + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", + }); + } + if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); } else {