common : expose json-schema functionality to extract type info
This commit is contained in:
parent
2995341730
commit
36524a630b
|
|
@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
|
|||
|
||||
// Externally visible wrapper around the file-local format_literal(): returns
// whatever format_literal() produces for `literal`.
std::string gbnf_format_literal(const std::string & literal) {
    return format_literal(literal);
}
|
||||
|
||||
class SchemaConverter {
|
||||
class common_schema_converter {
|
||||
private:
|
||||
friend class common_schema_info;
|
||||
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
||||
std::function<json(const std::string &)> _fetch_json;
|
||||
bool _dotall;
|
||||
|
|
@ -729,7 +730,7 @@ private:
|
|||
}
|
||||
|
||||
public:
|
||||
SchemaConverter(
|
||||
common_schema_converter(
|
||||
const std::function<json(const std::string &)> & fetch_json,
|
||||
bool dotall)
|
||||
: _fetch_json(fetch_json), _dotall(dotall)
|
||||
|
|
@ -990,6 +991,134 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
// common_schema_info implementation (pimpl)
|
||||
|
||||
common_schema_info::common_schema_info()
|
||||
: impl_(std::make_unique<common_schema_converter>(
|
||||
[](const std::string &) { return json(); },
|
||||
false)) {}
|
||||
|
||||
// Defaulted here rather than in the header: the header only forward-declares
// common_schema_converter, and destroying the std::unique_ptr member needs the
// complete type, which is defined earlier in this file.
common_schema_info::~common_schema_info() = default;
|
||||
|
||||
// Move constructor: defaulted, so it member-wise moves impl_ (the only data member
// the header declares) out of the source object.
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
|
||||
// Move assignment: defaulted, so it member-wise move-assigns impl_ from the source
// object. Copy assignment is deleted in the class declaration.
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
|
||||
|
||||
// Forwards to the converter's resolve_refs(), passing an empty string as the
// second argument. `schema` is handed over by non-const reference.
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
    common_schema_converter & converter = *impl_;
    converter.resolve_refs(schema, "");
}
|
||||
|
||||
// Determines if a JSON schema can resolve to a string type through any path.
|
||||
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
|
||||
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
|
||||
// true, allowing callers to handle the value as a raw string for simplicity.
|
||||
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
|
||||
std::unordered_set<std::string> visited_refs;
|
||||
|
||||
std::function<bool(const json &)> check = [&](const json & s) -> bool {
|
||||
if (!s.is_object()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Handle $ref
|
||||
if (s.contains("$ref")) {
|
||||
const std::string & ref = s["$ref"];
|
||||
if (visited_refs.find(ref) != visited_refs.end()) {
|
||||
// Circular reference, assume not a string to be safe
|
||||
return false;
|
||||
}
|
||||
visited_refs.insert(ref);
|
||||
auto it = impl_->_refs.find(ref);
|
||||
if (it != impl_->_refs.end()) {
|
||||
return check(it->second);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check type field
|
||||
if (s.contains("type")) {
|
||||
const json & schema_type = s["type"];
|
||||
if (schema_type.is_string()) {
|
||||
if (schema_type == "string") {
|
||||
return true;
|
||||
}
|
||||
} else if (schema_type.is_array()) {
|
||||
// Type can be an array like ["string", "null"]
|
||||
for (const auto & t : schema_type) {
|
||||
if (t == "string") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check oneOf/anyOf - if any alternative can be a string
|
||||
if (s.contains("oneOf")) {
|
||||
for (const auto & alt : s["oneOf"]) {
|
||||
if (check(alt)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (s.contains("anyOf")) {
|
||||
for (const auto & alt : s["anyOf"]) {
|
||||
if (check(alt)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check allOf - all components must be compatible with string type
|
||||
if (s.contains("allOf")) {
|
||||
bool all_string = true;
|
||||
for (const auto & component : s["allOf"]) {
|
||||
if (!check(component)) {
|
||||
all_string = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (all_string) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check const - if the constant value is a string
|
||||
if (s.contains("const")) {
|
||||
if (s["const"].is_string()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check enum - if any enum value is a string
|
||||
if (s.contains("enum")) {
|
||||
for (const auto & val : s["enum"]) {
|
||||
if (val.is_string()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// String-specific keywords imply string type
|
||||
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check format - many formats imply string
|
||||
if (s.contains("format")) {
|
||||
const std::string & fmt = s["format"];
|
||||
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
|
||||
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
|
||||
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
|
||||
fmt.find("uuid") == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
return check(schema);
|
||||
}
|
||||
|
||||
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
||||
#ifdef LLAMA_USE_LLGUIDANCE
|
||||
if (!force_gbnf) {
|
||||
|
|
@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
|||
}
|
||||
|
||||
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
|
||||
SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
|
||||
common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
|
||||
common_grammar_builder builder {
|
||||
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
||||
return converter._add_rule(name, rule);
|
||||
|
|
|
|||
|
|
@ -3,11 +3,31 @@
|
|||
#include <nlohmann/json_fwd.hpp>
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
|
||||
bool force_gbnf = false);
|
||||
|
||||
class common_schema_converter;
|
||||
|
||||
// Probes a JSON schema to extract information about its structure and type constraints.
|
||||
class common_schema_info {
|
||||
std::unique_ptr<common_schema_converter> impl_;
|
||||
|
||||
public:
|
||||
common_schema_info();
|
||||
~common_schema_info();
|
||||
|
||||
common_schema_info(const common_schema_info &) = delete;
|
||||
common_schema_info & operator=(const common_schema_info &) = delete;
|
||||
common_schema_info(common_schema_info &&) noexcept;
|
||||
common_schema_info & operator=(common_schema_info &&) noexcept;
|
||||
|
||||
void resolve_refs(nlohmann::ordered_json & schema);
|
||||
bool resolves_to_string(const nlohmann::ordered_json & schema);
|
||||
};
|
||||
|
||||
struct common_grammar_builder {
|
||||
std::function<std::string(const std::string &, const std::string &)> add_rule;
|
||||
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
|
||||
|
|
|
|||
|
|
@ -1367,10 +1367,85 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
|
|||
});
|
||||
}
|
||||
|
||||
static void test_resolves_to_string() {
|
||||
fprintf(stderr, "#\n# Testing resolves_to_string\n#\n");
|
||||
|
||||
auto test = [](const std::string & name, const std::string & schema_str, bool expected) {
|
||||
fprintf(stderr, "- %s\n", name.c_str());
|
||||
common_schema_info info;
|
||||
auto schema = nlohmann::ordered_json::parse(schema_str);
|
||||
info.resolve_refs(schema);
|
||||
bool result = info.resolves_to_string(schema);
|
||||
if (result != expected) {
|
||||
fprintf(stderr, "#\n# Test '%s' failed.\n#\n", name.c_str());
|
||||
fprintf(stderr, "Schema: %s\n", schema_str.c_str());
|
||||
fprintf(stderr, "Expected: %s, Got: %s\n", expected ? "true" : "false", result ? "true" : "false");
|
||||
assert(false);
|
||||
}
|
||||
};
|
||||
|
||||
// Basic type checks
|
||||
test("type string", R"({"type": "string"})", true);
|
||||
test("type integer", R"({"type": "integer"})", false);
|
||||
test("type number", R"({"type": "number"})", false);
|
||||
test("type boolean", R"({"type": "boolean"})", false);
|
||||
test("type object", R"({"type": "object"})", false);
|
||||
test("type array", R"({"type": "array"})", false);
|
||||
|
||||
// Type array (nullable string)
|
||||
test("type array with string", R"({"type": ["string", "null"]})", true);
|
||||
test("type array without string", R"({"type": ["integer", "null"]})", false);
|
||||
|
||||
// String-specific keywords
|
||||
test("minLength implies string", R"({"minLength": 1})", true);
|
||||
test("maxLength implies string", R"({"maxLength": 10})", true);
|
||||
test("pattern implies string", R"({"pattern": "^[a-z]+$"})", true);
|
||||
|
||||
// Format
|
||||
test("format date", R"({"format": "date"})", true);
|
||||
test("format uuid", R"({"format": "uuid"})", true);
|
||||
test("format email", R"({"format": "email"})", true);
|
||||
|
||||
// Const
|
||||
test("const string", R"({"const": "hello"})", true);
|
||||
test("const number", R"({"const": 123})", false);
|
||||
|
||||
// Enum
|
||||
test("enum with strings", R"({"enum": ["a", "b", "c"]})", true);
|
||||
test("enum with numbers", R"({"enum": [1, 2, 3]})", false);
|
||||
test("enum mixed with string", R"({"enum": [1, "a", null]})", true);
|
||||
|
||||
// anyOf
|
||||
test("anyOf with string", R"({"anyOf": [{"type": "string"}, {"type": "integer"}]})", true);
|
||||
test("anyOf without string", R"({"anyOf": [{"type": "integer"}, {"type": "boolean"}]})", false);
|
||||
|
||||
// oneOf
|
||||
test("oneOf with string", R"({"oneOf": [{"type": "string"}, {"type": "number"}]})", true);
|
||||
test("oneOf without string", R"({"oneOf": [{"type": "object"}, {"type": "array"}]})", false);
|
||||
|
||||
// allOf - all must be strings
|
||||
test("allOf all strings", R"({"allOf": [{"type": "string"}, {"minLength": 1}]})", true);
|
||||
test("allOf mixed types", R"({"allOf": [{"type": "string"}, {"type": "integer"}]})", false);
|
||||
|
||||
// $ref
|
||||
test("$ref to string",
|
||||
R"({"$ref": "#/$defs/str", "$defs": {"str": {"type": "string"}}})", true);
|
||||
test("$ref to integer",
|
||||
R"({"$ref": "#/$defs/num", "$defs": {"num": {"type": "integer"}}})", false);
|
||||
|
||||
// Nested
|
||||
test("nested anyOf with string",
|
||||
R"({"anyOf": [{"anyOf": [{"type": "integer"}, {"type": "string"}]}, {"type": "boolean"}]})", true);
|
||||
|
||||
fprintf(stderr, "All resolves_to_string tests passed!\n");
|
||||
}
|
||||
|
||||
int main() {
|
||||
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
|
||||
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
|
||||
|
||||
test_resolves_to_string();
|
||||
|
||||
test_all("C++", [](const TestCase & tc) {
|
||||
try {
|
||||
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
|
||||
|
|
|
|||
Loading…
Reference in New Issue