#include "common.h" #include "log.h" #include "ggml-backend.h" #include "ggml.h" #include "gguf.h" #include "ggml-cpp.h" #include "llama.h" #include "llama-cpp.h" #include "../src/llama-arch.h" #include "../src/llama-model-saver.h" #include #include #include #include #include #include #include #include #include // normalized mean squared error = mse(a, b) / mse(a, 0) static double nmse(const std::vector & a, const std::vector & b) { GGML_ASSERT(a.size() == b.size()); double mse_a_b = 0.0; double mse_a_0 = 0.0; for (size_t i = 0; i < a.size(); i++) { float a_i = a[i]; float b_i = b[i]; mse_a_b += (a_i - b_i) * (a_i - b_i); mse_a_0 += a_i * a_i; } return mse_a_b / mse_a_0; } static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) { std::hash hasher; std::mt19937 gen(hasher(tensor->name) + *(const size_t *) userdata); std::normal_distribution dis(0.0f, 1.0e-2f); const int64_t ne = ggml_nelements(tensor); if (tensor->type == GGML_TYPE_F32) { std::vector tmp(ne); for (int64_t i = 0; i < ne; i++) { tmp[i] = dis(gen); } ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor)); } else if (tensor->type == GGML_TYPE_F16) { std::vector tmp(ne); for (int64_t i = 0; i < ne; i++) { tmp[i] = ggml_fp32_to_fp16(dis(gen)); } ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor)); } else { GGML_ABORT("fatal error"); } } static void usage(char ** argv) { printf("Usage: %s [-a/--arch arch] [-s/--seed seed] [-v/--verbose]\n", argv[0]); } static std::vector get_tokens(const uint32_t n_tokens, const uint32_t n_vocab, const size_t seed){ std::mt19937 gen(seed); std::uniform_int_distribution<> dis(0, n_vocab - 1); std::vector ret; ret.reserve(n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { ret.push_back(dis(gen)); } return ret; } static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { gguf_context_ptr ret(gguf_init_empty()); llama_model_saver ms(arch, ret.get()); const uint32_t n_ctx = 128; uint32_t n_vocab = 128; uint32_t n_embd = 256; uint32_t n_head = 2; uint32_t n_ff = 384; uint32_t n_layer = 2; if (arch == LLM_ARCH_LLAMA4) { n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4 } else if (arch == LLM_ARCH_GEMMA3N) { n_embd = 64; n_head = 1; n_ff = 96; } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) { n_embd = 128; n_head = 1; n_ff = 192; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { n_layer = 3; } else if (arch == LLM_ARCH_CHAMELEON) { n_vocab = 10240; } else if (arch == LLM_ARCH_GEMMA3N) { n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } const uint32_t n_embd_head = n_embd / n_head; ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch)); ms.add_kv(LLM_KV_VOCAB_SIZE, n_vocab); ms.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx); ms.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd); ms.add_kv(LLM_KV_FEATURES_LENGTH, n_embd); ms.add_kv(LLM_KV_BLOCK_COUNT, n_layer); ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1)); if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { std::vector n_ff_per_layer; n_ff_per_layer.reserve(n_layer); for (uint32_t il = 0; il < n_layer; il++) { n_ff_per_layer.push_back(il <= 1 ? 
static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
    gguf_context_ptr ret(gguf_init_empty());
    llama_model_saver ms(arch, ret.get());

    const uint32_t n_ctx = 128;

    uint32_t n_vocab = 128;
    uint32_t n_embd  = 256;
    uint32_t n_head  = 2;
    uint32_t n_ff    = 384;
    uint32_t n_layer = 2;

    if (arch == LLM_ARCH_LLAMA4) {
        n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
    } else if (arch == LLM_ARCH_GEMMA3N) {
        n_embd  = 64;
        n_head  = 1;
        n_ff    = 96;
        n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hard-coded
    } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
        n_embd = 128;
        n_head = 1;
        n_ff   = 192;
    } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
        n_layer = 3;
    } else if (arch == LLM_ARCH_CHAMELEON) {
        n_vocab = 10240;
    }

    const uint32_t n_embd_head = n_embd / n_head;

    ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
    ms.add_kv(LLM_KV_VOCAB_SIZE,       n_vocab);
    ms.add_kv(LLM_KV_CONTEXT_LENGTH,   n_ctx);
    ms.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
    ms.add_kv(LLM_KV_FEATURES_LENGTH,  n_embd);
    ms.add_kv(LLM_KV_BLOCK_COUNT,      n_layer);
    ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1));

    if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
        // the first two layers get n_ff == 0, i.e. no feed-forward block
        std::vector<uint32_t> n_ff_per_layer;
        n_ff_per_layer.reserve(n_layer);
        for (uint32_t il = 0; il < n_layer; il++) {
            n_ff_per_layer.push_back(il <= 1 ? 0 : n_ff);
        }
        ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff_per_layer);
    } else {
        ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
    }

    ms.add_kv(LLM_KV_USE_PARALLEL_RESIDUAL,   false);
    ms.add_kv(LLM_KV_LOGIT_SCALE,             1.0f);
    ms.add_kv(LLM_KV_TIME_MIX_EXTRA_DIM,      uint32_t(64));
    ms.add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM,    uint32_t(128));
    ms.add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, uint32_t(2));

    if (arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE ||
            arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_LFM2 || arch == LLM_ARCH_LFM2MOE || arch == LLM_ARCH_KIMI_LINEAR) {
        // hybrid architectures: layer 1 gets 0 attention heads and is treated as a recurrent layer
        GGML_ASSERT(n_layer >= 2);
        std::vector<uint32_t> n_head_per_layer;
        n_head_per_layer.reserve(n_layer);
        for (uint32_t il = 0; il < n_layer; il++) {
            n_head_per_layer.push_back(il == 1 ? 0 : n_head);
        }
        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT,    n_head_per_layer);
        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_per_layer);
    } else {
        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT,    n_head);
        ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head);
    }

    ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);

    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
        // dimensions for multi-head latent attention (MLA)
        ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH,       uint32_t(576));
        ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,     uint32_t(512));
        ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT,       uint32_t(64));
        ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   uint32_t(192));
        ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, uint32_t(128));
    }

    ms.add_kv(LLM_KV_ATTENTION_CLAMP_KQV,              1.0f);
    ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,          1e-5f);
    ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,      1e-5f);
    ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS,          1e-5f);
    ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS,       uint32_t(8));
    ms.add_kv(LLM_KV_ATTENTION_Q_LORA_RANK,            uint32_t(512));
    ms.add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,           uint32_t(512));
    ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
    ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,         n_ctx/8);

    if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
        // alternate the window type from layer to layer
        std::vector<uint32_t> pattern;
        pattern.reserve(n_layer);
        for (uint32_t il = 0; il < n_layer; il++) {
            pattern.push_back(il % 2);
        }
        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern);
    } else {
        ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(2));
    }

    ms.add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, uint32_t(1));
    ms.add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, uint32_t(64));
    ms.add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K,      uint32_t(8));
    ms.add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS,
        std::vector<uint32_t>({n_embd_head/4, n_embd_head/4, n_embd_head/4, n_embd_head/4}));
    ms.add_kv(LLM_KV_TOKENIZER_MODEL, "no_vocab");
    // ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT, n_embd);
    // ms.add_kv(LLM_KV_DENSE_3_FEAT_IN,  n_embd);

    if (moe) {
        ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff);
        ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP,  uint32_t(2));
        ms.add_kv(LLM_KV_EXPERT_COUNT,               uint32_t(2));
        ms.add_kv(LLM_KV_EXPERT_USED_COUNT,          uint32_t(1));
        ms.add_kv(LLM_KV_EXPERT_SHARED_COUNT,        uint32_t(1));
        ms.add_kv(LLM_KV_EXPERT_GATING_FUNC,         uint32_t(2)); // sigmoid
        ms.add_kv(LLM_KV_EXPERT_GROUP_SCALE,         1.0f);
        ms.add_kv(LLM_KV_EXPERTS_PER_GROUP,          uint32_t(1));
    }

    ms.add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH,   n_embd);
    ms.add_kv(LLM_KV_POSNET_BLOCK_COUNT,        n_layer);
    ms.add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, n_embd);
    ms.add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT,      n_layer);

    ms.add_kv(LLM_KV_XIELU_ALPHA_N, 1.0f);
    ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
    ms.add_kv(LLM_KV_XIELU_BETA,    1.0f);
    ms.add_kv(LLM_KV_XIELU_EPS,     1.0e-7f);

    ms.add_kv(LLM_KV_SSM_INNER_SIZE,
        arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
    ms.add_kv(LLM_KV_SSM_CONV_KERNEL,    uint32_t(4));
    ms.add_kv(LLM_KV_SSM_STATE_SIZE,     uint32_t(32));
    ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
    ms.add_kv(LLM_KV_SSM_GROUP_COUNT,    arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
    ms.add_kv(LLM_KV_KDA_HEAD_DIM,       uint32_t(128));
    ms.add_kv(LLM_KV_WKV_HEAD_SIZE,      n_embd/n_head);
    ms.add_kv(LLM_KV_SHORTCONV_L_CACHE,  uint32_t(3));
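
    // Register placeholder F16 tensors for per-layer convolution weights; only
    // the names and types matter here. Presumably this is so that architectures
    // which look up these tensors by name (e.g. the WavTokenizer posnet/convnext
    // blocks) find an entry in the metadata.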
    for (uint32_t il = 0; il < n_layer; il++) {
        ggml_tensor t;
        memset(&t, 0, sizeof(ggml_tensor));
        t.type = GGML_TYPE_F16;

        ggml_format_name(&t, "conv%" PRIu32 "d.weight", il);
        gguf_add_tensor(ms.gguf_ctx, &t);
        ggml_format_name(&t, "posnet.%" PRIu32 ".conv1.weight", il);
        gguf_add_tensor(ms.gguf_ctx, &t);
        ggml_format_name(&t, "posnet.%" PRIu32 ".conv2.weight", il);
        gguf_add_tensor(ms.gguf_ctx, &t);
        ggml_format_name(&t, "convnext.%" PRIu32 ".dw.weight", il);
        gguf_add_tensor(ms.gguf_ctx, &t);
    }

    return ret;
}

// Create a model and context from GGUF metadata alone: instead of being read
// from disk, the tensor data is generated on the fly by set_tensor_data.
static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
        struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
    llama_model_params model_params = llama_model_default_params();
    std::vector<ggml_backend_dev_t> devs_copy = devs; // the device list needs to be nullptr-terminated
    devs_copy.push_back(nullptr);
    model_params.devices = devs_copy.data();

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx           = 0; // 0 == use the context length from the model
    ctx_params.n_threads       = 4;
    ctx_params.n_threads_batch = 4;

    size_t tmp = seed; // copied so that a mutable pointer can be passed along
    llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
    if (!model) {
        throw std::runtime_error("failed to create llama model");
    }

    llama_context_ptr lctx(llama_init_from_model(model.get(), ctx_params));
    if (!lctx) {
        throw std::runtime_error("failed to create llama context");
    }

    return std::make_pair(std::move(model), std::move(lctx));
}

// Evaluate the token sequence and return the logits for all positions.
static std::vector<float> get_logits(
        llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens, bool encode = false) {
    const uint32_t n_vocab  = llama_vocab_n_tokens(llama_model_get_vocab(model));
    const uint32_t n_ctx    = llama_n_ctx(lctx);
    const uint32_t n_tokens = tokens.size();

    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
    GGML_ASSERT(n_tokens <= n_ctx);
    for (uint32_t pos = 0; pos < n_tokens; pos++) {
        common_batch_add(batch, tokens[pos], pos, {0}, true);
    }
    batch.n_tokens = n_tokens;

    if (encode) {
        if (llama_encode(lctx, batch)) {
            llama_batch_free(batch);
            throw std::runtime_error("failed to encode batch");
        }
    }
    if (llama_decode(lctx, batch)) {
        llama_batch_free(batch);
        throw std::runtime_error("failed to decode batch");
    }

    std::vector<float> ret;
    ret.reserve(n_tokens*n_vocab);
    for (uint32_t i = 0; i < n_tokens; i++) {
        const float * logits_ith = llama_get_logits_ith(lctx, i);
        for (uint32_t j = 0; j < n_vocab; j++) {
            ret.push_back(logits_ith[j]);
        }
    }

    llama_batch_free(batch);
    return ret;
}
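
// Architectures whose implementations only support a MoE variant;
// for these no dense model is tested.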
static bool moe_mandatory(const llm_arch arch) {
    switch (arch) {
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_GROK:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_QWEN3VLMOE:
        case LLM_ARCH_QWEN35MOE:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_GLM_DSA:
        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_BAILINGMOE:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_LFM2MOE:
        case LLM_ARCH_SMALLTHINKER:
        case LLM_ARCH_LLADA_MOE:
        case LLM_ARCH_GROVEMOE:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_RND1:
        case LLM_ARCH_PADDLEOCR:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_KIMI_LINEAR:
        case LLM_ARCH_STEP35:
            return true;
        default:
            return false;
    }
}

// Architectures for which a MoE variant is implemented
// (a superset of those where MoE is mandatory).
static bool moe_implemented(const llm_arch arch) {
    if (moe_mandatory(arch)) {
        return true;
    }
    switch (arch) {
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
            return true;
        default:
            return false;
    }
}

// Save one tiny test model per (architecture, MoE) combination to dir.
static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
    GGML_ABORT("llama_model_save_to_file is broken");

    struct user_data_t {
        struct {
            ggml_log_callback callback;
            void * user_data;
        } original_logger;
        ggml_log_level min_level; // prints below this log level go to debug log
    };
    user_data_t ud;
    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
    ud.min_level = log_level;
    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
        const user_data_t * ud = (const user_data_t *) user_data;
        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
    }, &ud);

    for (const llm_arch & arch : llm_arch_all()) {
        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
            continue;
        }
        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
            continue; // These models don't have usable implementations.
        }

        for (bool moe : {false, true}) {
            if (moe && !moe_implemented(arch)) {
                continue;
            }
            if (!moe && moe_mandatory(arch)) {
                continue;
            }

            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
            auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});

            const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
            LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
            llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
        }
    }

    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
    return 0;
}

// For every architecture, compare the logits computed by each non-CPU device
// against the CPU backend and check that the NMSE is within tolerance.
static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
    struct user_data_t {
        struct {
            ggml_log_callback callback;
            void * user_data;
        } original_logger;
        ggml_log_level min_level; // prints below this log level go to debug log
    };
    user_data_t ud;
    llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
    ud.min_level = log_level;
    llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
        const user_data_t * ud = (const user_data_t *) user_data;
        const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
        ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
    }, &ud);

    const std::vector<llama_token> tokens = get_tokens(128, 128, seed);

    bool all_ok = true;

    common_log_flush(common_log_main());
    printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
    printf("|---------------|------------------------------|------|--------|------|\n");
    for (const llm_arch & arch : llm_arch_all()) {
        if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
            continue;
        }
        if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
            continue; // These models don't have usable implementations.
        }
        if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
            continue; // FIXME CUDA backend crashes.
        }
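
// Example invocations ("test-arch" stands in for the actual binary name):
//   test-arch                  # compare all architectures on all non-CPU devices
//   test-arch -a llama -s 42   # test only the llama architecture, with a fixed seed
//   test-arch -v               # forward info-level logs instead of demoting them to debug
//   test-arch -o /tmp/models   # save the test models as GGUF files (currently aborts, see above)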
        if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
            continue; // FIXME Embedding (?) models produce inconsistent results.
        }
        if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
            continue; // FIXME RWKV models hang indefinitely.
        }
        if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
                arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
            continue; // TODO vocab
        }
        if (arch == LLM_ARCH_PLM) {
            continue; // TODO tensor shapes
        }

        // FIXME some models are segfaulting with WebGPU:
#ifdef GGML_USE_WEBGPU
        if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
            continue;
        }
#endif // GGML_USE_WEBGPU

        const bool encode = arch == LLM_ARCH_T5; // encoder-decoder models need an explicit encode pass

        for (bool moe : {false, true}) {
            if (moe && !moe_implemented(arch)) {
                continue;
            }
            if (!moe && moe_mandatory(arch)) {
                continue;
            }

            gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);

            auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
            const std::vector<float> logits_cpu = get_logits(
                model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);

            for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
                ggml_backend_dev_t dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                    continue;
                }

                auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
                const std::vector<float> logits_dev = get_logits(
                    model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);

                const double nmse_val = nmse(logits_cpu, logits_dev);
                const bool ok = nmse_val <= 1e-4;
                all_ok = all_ok && ok;

                char nmse_str[10];
                snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
                // the Status column is padded to 17 characters to compensate for the invisible ANSI color codes
                printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
                    moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
            }
        }
    }

    llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
    return all_ok ? 0 : 1;
}

int main(int argc, char ** argv) {
    // FIXME these tests are disabled in the CI for macOS-latest-cmake-arm64 because they are segfaulting
    common_init();

    std::random_device rd;

    llm_arch       arch      = LLM_ARCH_UNKNOWN;
    size_t         seed      = rd();
    ggml_log_level log_level = GGML_LOG_LEVEL_ERROR;
    std::string    out;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--arch") == 0) {
            if (i + 1 < argc) {
                const std::string arch_name = argv[++i];
                arch = llm_arch_from_string(arch_name);
                if (arch == LLM_ARCH_UNKNOWN) {
                    LOG_ERR("%s: unknown LLM architecture: %s\n", __func__, arch_name.c_str());
                    return 1;
                }
            } else {
                usage(argv);
                return 1;
            }
            continue;
        }
        if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
            if (i + 1 < argc) {
                seed = std::stoull(argv[++i]);
            } else {
                usage(argv);
                return 1;
            }
            continue;
        }
        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
            log_level = GGML_LOG_LEVEL_INFO;
            continue;
        }
        if (strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "--out") == 0) {
            if (i + 1 < argc) {
                out = argv[++i];
            } else {
                usage(argv);
                return 1;
            }
            continue;
        }
    }

    try {
        if (!out.empty()) {
            return save_models(arch, seed, log_level, out);
        }
        return test_backends(arch, seed, log_level);
    } catch (const std::exception & err) {
        fprintf(stderr, "encountered runtime error: %s\n", err.what());
        return -1;
    }
}