tests : model metadata loading from huggingface (#19796)

* Add model metadata loading from huggingface for use with other tests

* Add incremental chunking instead of a full redownload, fix a caching issue, and add a warning when caching fails

* Add support for split models, loading metadata from each individual split file; also avoid mmproj files

* Code cleanup, revert incremental downloading

* Only compile when cpp-httplib has SSL support

* Fix formatting
Bartowski 2026-02-28 04:44:38 -05:00 committed by GitHub
parent ecbcb7ea9d
commit d979f2b176
4 changed files with 791 additions and 0 deletions

tests/CMakeLists.txt

@@ -257,6 +257,21 @@ set(LLAMA_TEST_NAME test-mtmd-c-api)
llama_build_and_test(test-mtmd-c-api.c)
target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
# GGUF model data fetcher library for tests that need real model metadata
# Only compile when cpp-httplib has SSL support (CPPHTTPLIB_OPENSSL_SUPPORT)
if (TARGET cpp-httplib)
get_target_property(_cpp_httplib_defs cpp-httplib INTERFACE_COMPILE_DEFINITIONS)
if (_cpp_httplib_defs MATCHES "CPPHTTPLIB_OPENSSL_SUPPORT")
add_library(gguf-model-data STATIC gguf-model-data.cpp)
target_link_libraries(gguf-model-data PRIVATE common cpp-httplib)
target_include_directories(gguf-model-data PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_executable(test-gguf-model-data test-gguf-model-data.cpp)
target_link_libraries(test-gguf-model-data PRIVATE gguf-model-data common)
llama_test(test-gguf-model-data LABEL "model")
endif()
endif()
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)

tests/gguf-model-data.cpp Normal file

@@ -0,0 +1,613 @@
// GGUF binary parser adapted from the huggingface/gguf package.
// Reference: https://github.com/huggingface/huggingface.js
#include "gguf-model-data.h"
#include "common.h"
#include "gguf.h"
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstring>
#include <filesystem>
#include <fstream>
#include "http.h"
#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>
// Equivalent of RangeView
struct gguf_buf_reader {
const char * data;
size_t size;
size_t pos;
gguf_buf_reader(const std::vector<char> & buf) : data(buf.data()), size(buf.size()), pos(0) {}
bool has_n_bytes(size_t n) const {
// pos <= size always holds, so this form cannot overflow even for huge n
return n <= size - pos;
}
template <typename T>
bool read_val(T & out) {
if (!has_n_bytes(sizeof(T))) {
return false;
}
memcpy(&out, data + pos, sizeof(T));
pos += sizeof(T);
return true;
}
bool read_str(std::string & out) {
uint64_t len;
if (!read_val(len)) {
return false;
}
if (!has_n_bytes((size_t)len)) {
return false;
}
out.assign(data + pos, (size_t)len);
pos += (size_t)len;
return true;
}
bool skip(size_t n) {
if (!has_n_bytes(n)) {
return false;
}
pos += n;
return true;
}
};
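// Byte width of each fixed-size GGUF scalar type; returns 0 for the
// variable-length types (string, array), which callers handle separately.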
static size_t gguf_val_type_size(int32_t vtype) {
switch (vtype) {
case GGUF_TYPE_UINT8: return 1;
case GGUF_TYPE_INT8: return 1;
case GGUF_TYPE_UINT16: return 2;
case GGUF_TYPE_INT16: return 2;
case GGUF_TYPE_UINT32: return 4;
case GGUF_TYPE_INT32: return 4;
case GGUF_TYPE_FLOAT32: return 4;
case GGUF_TYPE_BOOL: return 1;
case GGUF_TYPE_UINT64: return 8;
case GGUF_TYPE_INT64: return 8;
case GGUF_TYPE_FLOAT64: return 8;
default: return 0; // string/array handled separately
}
}
// Equivalent of readMetadataValue(); skips unused values rather than storing them
static bool gguf_skip_value(gguf_buf_reader & r, int32_t vtype) {
if (vtype == GGUF_TYPE_STRING) {
std::string tmp;
return r.read_str(tmp);
}
if (vtype == GGUF_TYPE_ARRAY) {
int32_t elem_type;
uint64_t count;
if (!r.read_val(elem_type)) {
return false;
}
if (!r.read_val(count)) {
return false;
}
if (elem_type == GGUF_TYPE_STRING) {
for (uint64_t i = 0; i < count; i++) {
std::string tmp;
if (!r.read_str(tmp)) {
return false;
}
}
return true;
}
if (elem_type == GGUF_TYPE_ARRAY) {
// nested arrays - recurse
for (uint64_t i = 0; i < count; i++) {
if (!gguf_skip_value(r, GGUF_TYPE_ARRAY)) {
return false;
}
}
return true;
}
size_t elem_sz = gguf_val_type_size(elem_type);
if (elem_sz == 0) {
return false;
}
return r.skip((size_t)count * elem_sz);
}
size_t sz = gguf_val_type_size(vtype);
if (sz == 0) {
return false;
}
return r.skip(sz);
}
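// Read any integer-typed GGUF value and narrow/widen it to uint32_t;
// only small counts (split counts, head counts, ...) are read this way.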
static bool gguf_read_uint32_val(gguf_buf_reader & r, int32_t vtype, uint32_t & out) {
if (vtype == GGUF_TYPE_UINT8) {
uint8_t v;
if (!r.read_val(v)) {
return false;
}
out = v;
return true;
}
if (vtype == GGUF_TYPE_INT8) {
int8_t v;
if (!r.read_val(v)) {
return false;
}
out = (uint32_t)v;
return true;
}
if (vtype == GGUF_TYPE_UINT16) {
uint16_t v;
if (!r.read_val(v)) {
return false;
}
out = v;
return true;
}
if (vtype == GGUF_TYPE_INT16) {
int16_t v;
if (!r.read_val(v)) {
return false;
}
out = (uint32_t)v;
return true;
}
if (vtype == GGUF_TYPE_UINT32) {
uint32_t v;
if (!r.read_val(v)) {
return false;
}
out = v;
return true;
}
if (vtype == GGUF_TYPE_INT32) {
int32_t v;
if (!r.read_val(v)) {
return false;
}
out = (uint32_t)v;
return true;
}
if (vtype == GGUF_TYPE_UINT64) {
uint64_t v;
if (!r.read_val(v)) {
return false;
}
out = (uint32_t)v;
return true;
}
if (vtype == GGUF_TYPE_INT64) {
int64_t v;
if (!r.read_val(v)) {
return false;
}
out = (uint32_t)v;
return true;
}
return false;
}
// Follows the same header -> KV -> tensor parsing sequence as gguf() in huggingface/gguf
static std::optional<gguf_remote_model> gguf_parse_meta(const std::vector<char> & buf) {
gguf_buf_reader r(buf);
// Header: magic(4) + version(4) + tensor_count(8) + kv_count(8) = 24 bytes minimum
uint32_t magic_raw;
if (!r.read_val(magic_raw)) {
return std::nullopt;
}
if (memcmp(&magic_raw, "GGUF", 4) != 0) {
fprintf(stderr, "gguf_parse_meta: invalid magic\n");
return std::nullopt;
}
uint32_t version;
if (!r.read_val(version)) {
return std::nullopt;
}
if (version < 2 || version > 3) {
fprintf(stderr, "gguf_parse_meta: unsupported version %u\n", version);
return std::nullopt;
}
int64_t tensor_count_raw;
int64_t kv_count_raw;
if (!r.read_val(tensor_count_raw)) {
return std::nullopt;
}
if (!r.read_val(kv_count_raw)) {
return std::nullopt;
}
uint64_t tensor_count = (uint64_t)tensor_count_raw;
uint64_t kv_count = (uint64_t)kv_count_raw;
gguf_remote_model model;
std::string arch_prefix;
// Parse KV pairs
for (uint64_t i = 0; i < kv_count; i++) {
std::string key;
if (!r.read_str(key)) {
return std::nullopt;
}
int32_t vtype;
if (!r.read_val(vtype)) {
return std::nullopt;
}
if (key == "general.architecture" && vtype == GGUF_TYPE_STRING) {
if (!r.read_str(model.architecture)) {
return std::nullopt;
}
arch_prefix = model.architecture + ".";
continue;
}
// Extract split.count for proper handling of split files
if (key == "split.count") {
uint32_t v;
if (!gguf_read_uint32_val(r, vtype, v)) {
return std::nullopt;
}
model.n_split = (uint16_t)v;
continue;
}
// Extract split.tensors.count so we can verify we have all tensors
if (key == "split.tensors.count") {
uint32_t v;
if (!gguf_read_uint32_val(r, vtype, v)) {
return std::nullopt;
}
model.n_split_tensors = v;
continue;
}
if (!arch_prefix.empty()) {
uint32_t * target = nullptr;
if (key == arch_prefix + "embedding_length") { target = &model.n_embd; }
else if (key == arch_prefix + "feed_forward_length") { target = &model.n_ff; }
else if (key == arch_prefix + "block_count") { target = &model.n_layer; }
else if (key == arch_prefix + "attention.head_count") { target = &model.n_head; }
else if (key == arch_prefix + "attention.head_count_kv") { target = &model.n_head_kv; }
else if (key == arch_prefix + "expert_count") { target = &model.n_expert; }
else if (key == arch_prefix + "attention.key_length") { target = &model.n_embd_head_k; }
else if (key == arch_prefix + "attention.value_length") { target = &model.n_embd_head_v; }
if (target) {
if (!gguf_read_uint32_val(r, vtype, *target)) {
return std::nullopt;
}
continue;
}
}
if (!gguf_skip_value(r, vtype)) {
return std::nullopt;
}
}
// Parse tensor info entries
model.tensors.reserve((size_t)tensor_count);
for (uint64_t i = 0; i < tensor_count; i++) {
gguf_remote_tensor t;
if (!r.read_str(t.name)) {
return std::nullopt;
}
if (!r.read_val(t.n_dims)) {
return std::nullopt;
}
if (t.n_dims > 4) {
fprintf(stderr, "gguf_parse_meta: tensor '%s' has %u dims (max 4)\n", t.name.c_str(), t.n_dims);
return std::nullopt;
}
for (uint32_t d = 0; d < t.n_dims; d++) {
if (!r.read_val(t.ne[d])) {
return std::nullopt;
}
}
int32_t type_raw;
if (!r.read_val(type_raw)) {
return std::nullopt;
}
t.type = (ggml_type)type_raw;
uint64_t offset;
if (!r.read_val(offset)) {
return std::nullopt;
}
// Infer n_vocab from token_embd.weight
if (t.name == "token_embd.weight") {
model.n_vocab = (uint32_t)t.ne[1];
}
model.tensors.push_back(std::move(t));
}
return model;
}
// cache handling for local download
static std::string get_default_cache_dir() {
return fs_get_cache_directory() + "gguf-headers/";
}
static std::string sanitize_for_path(const std::string & s) {
std::string out = s;
for (char & c : out) {
if (c == '/' || c == '\\' || c == ':') {
c = '_';
}
}
return out;
}
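// Read an entire file into `out`; fails on a missing or empty file.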
static bool read_file(const std::string & path, std::vector<char> & out) {
std::ifstream f(path, std::ios::binary | std::ios::ate);
if (!f.good()) {
return false;
}
auto sz = f.tellg();
if (sz <= 0) {
return false;
}
out.resize((size_t)sz);
f.seekg(0);
f.read(out.data(), sz);
return f.good();
}
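// Overwrite `path` with `data`, truncating any previous contents.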
static bool write_file(const std::string & path, const std::vector<char> & data) {
std::ofstream f(path, std::ios::binary | std::ios::trunc);
if (!f.good()) {
return false;
}
f.write(data.data(), (std::streamsize)data.size());
return f.good();
}
// HuggingFace file auto-detection and HTTP download
static std::pair<long, std::vector<char>> gguf_http_get(
const std::string & url,
const httplib::Headers & headers = {},
int timeout_sec = 60) {
try {
auto [cli, parts] = common_http_client(url);
if (timeout_sec > 0) {
cli.set_read_timeout(timeout_sec, 0);
cli.set_write_timeout(timeout_sec, 0);
}
cli.set_connection_timeout(30, 0);
std::vector<char> body;
auto res = cli.Get(parts.path, headers,
[&](const char * data, size_t len) {
body.insert(body.end(), data, data + len);
return true;
}, nullptr);
if (!res) {
fprintf(stderr, "gguf_fetch: HTTP request failed for %s (error %d)\n",
url.c_str(), (int)res.error());
return {-1, {}};
}
return {res->status, std::move(body)};
} catch (const std::exception & e) {
fprintf(stderr, "gguf_fetch: HTTP error: %s\n", e.what());
return {-1, {}};
}
}
// Find the filename for given repo/quant.
// For split models, returns the first shard (the one containing "00001-of-")
// split_prefix is set to the portion before "-00001-of-XXXXX.gguf" when a split file is found
static std::string detect_gguf_filename(const std::string & repo, const std::string & quant,
std::string & split_prefix) {
split_prefix.clear();
std::string api_url = "https://huggingface.co/api/models/" + repo;
auto [code, body] = gguf_http_get(api_url, {}, 30);
if (code != 200 || body.empty()) {
fprintf(stderr, "gguf_fetch: failed to query HF API for %s (HTTP %ld)\n", repo.c_str(), code);
return "";
}
nlohmann::json j;
try {
j = nlohmann::json::parse(body.begin(), body.end());
} catch (...) {
fprintf(stderr, "gguf_fetch: failed to parse HF API response\n");
return "";
}
if (!j.contains("siblings") || !j["siblings"].is_array()) {
fprintf(stderr, "gguf_fetch: unexpected HF API response format\n");
return "";
}
std::vector<std::string> matches;
std::string quant_upper = quant;
for (char & c : quant_upper) { c = (char)toupper((unsigned char)c); }
for (const auto & sibling : j["siblings"]) {
if (!sibling.contains("rfilename")) { continue; }
std::string fname = sibling["rfilename"].get<std::string>();
if (fname.size() < 5 || fname.substr(fname.size() - 5) != ".gguf") {
continue;
}
std::string fname_upper = fname;
for (char & c : fname_upper) { c = (char)toupper((unsigned char)c); }
if (fname_upper.find(quant_upper) != std::string::npos) {
matches.push_back(fname);
}
}
if (matches.empty()) {
fprintf(stderr, "gguf_fetch: no .gguf files matching '%s' in %s\n", quant.c_str(), repo.c_str());
return "";
}
std::sort(matches.begin(), matches.end());
// Prefer non-split, non-supplementary file
for (const auto & m : matches) {
if (m.find("-of-") == std::string::npos && m.find("mmproj") == std::string::npos) {
return m;
}
}
// Return the first shard (00001-of-) and extract the prefix
for (const auto & m : matches) {
auto pos = m.find("-00001-of-");
if (pos != std::string::npos) {
split_prefix = m.substr(0, pos);
return m;
}
}
return matches[0];
}
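// Fetch a byte range from the start of the remote file and try to parse the
// metadata, doubling the requested range until the full GGUF header fits.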
static std::optional<gguf_remote_model> fetch_and_parse(
const std::string & repo,
const std::string & filename,
const std::string & cache_path) {
std::string url = "https://huggingface.co/" + repo + "/resolve/main/" + filename;
// Progressive download inspired by RangeView.fetchChunk()
// Start at 2MB, double each time, cap at 64MB
size_t chunk_size = 2 * 1024 * 1024;
const size_t max_chunk = 64 * 1024 * 1024;
while (chunk_size <= max_chunk) {
fprintf(stderr, "gguf_fetch: downloading %zu bytes from %s\n", chunk_size, filename.c_str());
char range_buf[64];
snprintf(range_buf, sizeof(range_buf), "bytes=0-%zu", chunk_size - 1);
httplib::Headers headers = {{"Range", range_buf}};
auto [code, body] = gguf_http_get(url, headers, 120);
if (code != 200 && code != 206) {
fprintf(stderr, "gguf_fetch: HTTP %ld fetching %s\n", code, url.c_str());
return std::nullopt;
}
if (body.empty()) {
fprintf(stderr, "gguf_fetch: empty response\n");
return std::nullopt;
}
auto result = gguf_parse_meta(body);
if (result.has_value()) {
write_file(cache_path, body);
return result;
}
if (code == 200) {
fprintf(stderr, "gguf_fetch: server returned full response but metadata parse failed\n");
return std::nullopt;
}
// Parse failed, try larger chunk
chunk_size *= 2;
}
fprintf(stderr, "gguf_fetch: metadata exceeds 64MB, giving up\n");
return std::nullopt;
}
// Try cache first, then fetch and parse a single GGUF shard.
static std::optional<gguf_remote_model> fetch_or_cached(
const std::string & repo,
const std::string & filename,
const std::string & cdir,
const std::string & repo_part) {
std::string fname_part = sanitize_for_path(filename);
std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial";
{
std::vector<char> cached;
if (std::filesystem::exists(cache_path) && read_file(cache_path, cached)) {
auto result = gguf_parse_meta(cached);
if (result.has_value()) {
fprintf(stderr, "gguf_fetch: loaded from cache: %s\n", cache_path.c_str());
return result;
}
}
}
fs_create_directory_with_parents(cdir);
return fetch_and_parse(repo, filename, cache_path);
}
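// Public entry point: resolve the filename via the HF API, fetch and parse
// the first shard, then merge the tensor lists of any remaining shards.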
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant,
const std::string & cache_dir) {
std::string cdir = cache_dir.empty() ? get_default_cache_dir() : cache_dir;
std::string repo_part = sanitize_for_path(repo);
std::string split_prefix;
std::string filename = detect_gguf_filename(repo, quant, split_prefix);
if (filename.empty()) {
return std::nullopt;
}
auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part);
if (!model_opt.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str());
return std::nullopt;
}
auto & model = model_opt.value();
// If the model is split across multiple files we need to fetch the remaining shards' metadata
if (model.n_split > 1) {
if (split_prefix.empty()) {
fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split);
return std::nullopt;
}
fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n",
model.n_split, model.n_split - 1);
for (int i = 2; i <= model.n_split; i++) {
char num_buf[6], total_buf[6];
snprintf(num_buf, sizeof(num_buf), "%05d", i);
snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split);
std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf";
auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part);
if (!shard.has_value()) {
fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str());
return std::nullopt;
}
model.tensors.insert(model.tensors.end(),
std::make_move_iterator(shard->tensors.begin()),
std::make_move_iterator(shard->tensors.end()));
}
if (model.n_split_tensors > 0 && model.tensors.size() != model.n_split_tensors) {
fprintf(stderr, "gguf_fetch: WARNING: expected %u tensors from split.tensors.count, got %zu\n",
model.n_split_tensors, model.tensors.size());
}
}
return model_opt;
}

tests/gguf-model-data.h Normal file

@@ -0,0 +1,42 @@
#pragma once
#include "ggml.h"
#include <cstdint>
#include <optional>
#include <string>
#include <vector>
struct gguf_remote_tensor {
std::string name;
ggml_type type = GGML_TYPE_F32;
int64_t ne[4] = {1, 1, 1, 1}; // dimensions, unused dims = 1
uint32_t n_dims = 0;
};
struct gguf_remote_model {
// Selected KV metadata
std::string architecture; // general.architecture
uint32_t n_embd = 0; // <arch>.embedding_length
uint32_t n_ff = 0; // <arch>.feed_forward_length
uint32_t n_vocab = 0; // inferred from token_embd.weight ne[1]
uint32_t n_layer = 0; // <arch>.block_count
uint32_t n_head = 0; // <arch>.attention.head_count
uint32_t n_head_kv = 0; // <arch>.attention.head_count_kv
uint32_t n_expert = 0; // <arch>.expert_count (0 if absent)
uint32_t n_embd_head_k = 0; // <arch>.attention.key_length
uint32_t n_embd_head_v = 0; // <arch>.attention.value_length
uint16_t n_split = 0; // split.count (0 = not split)
uint32_t n_split_tensors = 0; // split.tensors.count (0 if not split)
std::vector<gguf_remote_tensor> tensors;
};
// Fetch model metadata from HuggingFace with local caching.
// repo: e.g., "ggml-org/Qwen3-32B-GGUF"
// quant: e.g., "Q8_0" -- auto-detects filename (including first shard of split models)
// Returns nullopt if download fails or network is unavailable.
std::optional<gguf_remote_model> gguf_fetch_model_meta(
const std::string & repo,
const std::string & quant = "Q8_0",
const std::string & cache_dir = ""); // empty = default
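//
// Minimal usage sketch (repo/quant taken from the test below; adjust as needed):
//
//   auto meta = gguf_fetch_model_meta("ggml-org/Qwen3-0.6B-GGUF", "Q8_0");
//   if (meta.has_value()) {
//       fprintf(stderr, "arch=%s n_layer=%u n_embd=%u tensors=%zu\n",
//               meta->architecture.c_str(), meta->n_layer, meta->n_embd,
//               meta->tensors.size());
//   }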

tests/test-gguf-model-data.cpp Normal file

@@ -0,0 +1,121 @@
#include "gguf-model-data.h"
#include <cstdio>
#define TEST_ASSERT(cond, msg) \
do { \
if (!(cond)) { \
fprintf(stderr, "FAIL: %s (line %d): %s\n", #cond, __LINE__, msg); \
return 1; \
} \
} while (0)
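// (the do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it composes safely with if/else)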
int main() {
fprintf(stderr, "=== test-gguf-model-data ===\n");
// Fetch Qwen3-0.6B Q8_0 metadata
auto result = gguf_fetch_model_meta("ggml-org/Qwen3-0.6B-GGUF", "Q8_0");
if (!result.has_value()) {
fprintf(stderr, "SKIP: could not fetch model metadata (no network or HTTP disabled)\n");
return 0;
}
const auto & model = result.value();
fprintf(stderr, "Architecture: %s\n", model.architecture.c_str());
fprintf(stderr, "n_embd: %u\n", model.n_embd);
fprintf(stderr, "n_ff: %u\n", model.n_ff);
fprintf(stderr, "n_vocab: %u\n", model.n_vocab);
fprintf(stderr, "n_layer: %u\n", model.n_layer);
fprintf(stderr, "n_head: %u\n", model.n_head);
fprintf(stderr, "n_head_kv: %u\n", model.n_head_kv);
fprintf(stderr, "n_expert: %u\n", model.n_expert);
fprintf(stderr, "n_embd_head_k: %u\n", model.n_embd_head_k);
fprintf(stderr, "n_embd_head_v: %u\n", model.n_embd_head_v);
fprintf(stderr, "tensors: %zu\n", model.tensors.size());
// Verify architecture
TEST_ASSERT(model.architecture == "qwen3", "expected architecture 'qwen3'");
// Verify key dimensions (Qwen3-0.6B)
TEST_ASSERT(model.n_layer == 28, "expected n_layer == 28");
TEST_ASSERT(model.n_embd == 1024, "expected n_embd == 1024");
TEST_ASSERT(model.n_head == 16, "expected n_head == 16");
TEST_ASSERT(model.n_head_kv == 8, "expected n_head_kv == 8");
TEST_ASSERT(model.n_expert == 0, "expected n_expert == 0 (not MoE)");
TEST_ASSERT(model.n_vocab == 151936, "expected n_vocab == 151936");
// Verify tensor count
TEST_ASSERT(model.tensors.size() == 311, "expected tensor count == 311");
// Verify known tensor names exist
bool found_attn_q = false;
bool found_token_embd = false;
bool found_output_norm = false;
for (const auto & t : model.tensors) {
if (t.name == "blk.0.attn_q.weight") {
found_attn_q = true;
}
if (t.name == "token_embd.weight") {
found_token_embd = true;
}
if (t.name == "output_norm.weight") {
found_output_norm = true;
}
}
TEST_ASSERT(found_attn_q, "expected tensor 'blk.0.attn_q.weight'");
TEST_ASSERT(found_token_embd, "expected tensor 'token_embd.weight'");
TEST_ASSERT(found_output_norm, "expected tensor 'output_norm.weight'");
// Verify token_embd.weight shape
for (const auto & t : model.tensors) {
if (t.name == "token_embd.weight") {
TEST_ASSERT(t.ne[0] == 1024, "expected token_embd.weight ne[0] == 1024");
TEST_ASSERT(t.n_dims == 2, "expected token_embd.weight to be 2D");
break;
}
}
// Test that a second call uses the cache (just call again; it should succeed)
auto result2 = gguf_fetch_model_meta("ggml-org/Qwen3-0.6B-GGUF", "Q8_0");
TEST_ASSERT(result2.has_value(), "cached fetch should succeed");
TEST_ASSERT(result2->tensors.size() == model.tensors.size(), "cached result should match");
// Test a split MoE model without specifying quant (should default to Q8_0)
auto result3 = gguf_fetch_model_meta("ggml-org/GLM-4.6V-GGUF");
if (!result3.has_value()) {
fprintf(stderr, "SKIP: could not fetch GLM-4.6V metadata (no network?)\n");
return 0;
}
const auto & model3 = result3.value();
fprintf(stderr, "Architecture: %s\n", model3.architecture.c_str());
fprintf(stderr, "n_embd: %u\n", model3.n_embd);
fprintf(stderr, "n_ff: %u\n", model3.n_ff);
fprintf(stderr, "n_vocab: %u\n", model3.n_vocab);
fprintf(stderr, "n_layer: %u\n", model3.n_layer);
fprintf(stderr, "n_head: %u\n", model3.n_head);
fprintf(stderr, "n_head_kv: %u\n", model3.n_head_kv);
fprintf(stderr, "n_expert: %u\n", model3.n_expert);
fprintf(stderr, "n_embd_head_k: %u\n", model3.n_embd_head_k);
fprintf(stderr, "n_embd_head_v: %u\n", model3.n_embd_head_v);
fprintf(stderr, "tensors: %zu\n", model3.tensors.size());
// Verify architecture
TEST_ASSERT(model3.architecture == "glm4moe", "expected architecture 'glm4moe'");
// Verify key dimensions (GLM-4.6V)
TEST_ASSERT(model3.n_layer == 46, "expected n_layer == 46");
TEST_ASSERT(model3.n_embd == 4096, "expected n_embd == 4096");
TEST_ASSERT(model3.n_head == 96, "expected n_head == 96");
TEST_ASSERT(model3.n_head_kv == 8, "expected n_head_kv == 8");
TEST_ASSERT(model3.n_expert == 128, "expected n_expert == 128 (MoE)");
TEST_ASSERT(model3.n_vocab == 151552, "expected n_vocab == 151552");
// Verify tensor count
TEST_ASSERT(model3.tensors.size() == 780, "expected tensor count == 780");
fprintf(stderr, "=== ALL TESTS PASSED ===\n");
return 0;
}