533 lines
21 KiB
C++
533 lines
21 KiB
C++
#include "common.h"
|
|
#include "log.h"
|
|
#include "ggml-backend.h"
|
|
#include "ggml.h"
|
|
#include "gguf.h"
|
|
#include "ggml-cpp.h"
|
|
#include "llama.h"
|
|
#include "llama-cpp.h"
|
|
#include "../src/llama-arch.h"
|
|
#include "../src/llama-model-saver.h"
|
|
|
|
#include <cinttypes>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <cstdint>
|
|
#include <random>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
// normalized mean squared error = mse(a, b) / mse(a, 0)
|
|
static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
|
|
GGML_ASSERT(a.size() == b.size());
|
|
double mse_a_b = 0.0;
|
|
double mse_a_0 = 0.0;
|
|
|
|
for (size_t i = 0; i < a.size(); i++) {
|
|
float a_i = a[i];
|
|
float b_i = b[i];
|
|
|
|
mse_a_b += (a_i - b_i) * (a_i - b_i);
|
|
mse_a_0 += a_i * a_i;
|
|
}
|
|
|
|
return mse_a_b / mse_a_0;
|
|
}
|
|
|
|
static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
|
|
std::hash<std::string> hasher;
|
|
std::mt19937 gen(hasher(tensor->name) + *(const size_t *) userdata);
|
|
std::normal_distribution<float> dis(0.0f, 1.0e-2f);
|
|
|
|
const int64_t ne = ggml_nelements(tensor);
|
|
if (tensor->type == GGML_TYPE_F32) {
|
|
std::vector<float> tmp(ne);
|
|
for (int64_t i = 0; i < ne; i++) {
|
|
tmp[i] = dis(gen);
|
|
}
|
|
ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
|
|
} else if (tensor->type == GGML_TYPE_F16) {
|
|
std::vector<ggml_fp16_t> tmp(ne);
|
|
for (int64_t i = 0; i < ne; i++) {
|
|
tmp[i] = ggml_fp32_to_fp16(dis(gen));
|
|
}
|
|
ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
|
|
} else {
|
|
GGML_ABORT("fatal error");
|
|
}
|
|
}
|
|
|
|
static void usage(char ** argv) {
|
|
printf("Usage: %s [-a/--arch arch] [-s/--seed seed] [-v/--verbose]\n", argv[0]);
|
|
}
|
|
|
|
static std::vector<llama_token> get_tokens(const uint32_t n_tokens, const uint32_t n_vocab, const size_t seed){
|
|
std::mt19937 gen(seed);
|
|
std::uniform_int_distribution<> dis(0, n_vocab - 1);
|
|
std::vector<llama_token> ret;
|
|
ret.reserve(n_tokens);
|
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
ret.push_back(dis(gen));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
|
|
gguf_context_ptr ret(gguf_init_empty());
|
|
llama_model_saver ms(arch, ret.get());
|
|
const uint32_t n_ctx = 128;
|
|
|
|
uint32_t n_vocab = 128;
|
|
uint32_t n_embd = 256;
|
|
uint32_t n_head = 2;
|
|
uint32_t n_ff = 384;
|
|
uint32_t n_layer = 2;
|
|
if (arch == LLM_ARCH_LLAMA4) {
|
|
n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
|
|
} else if (arch == LLM_ARCH_GEMMA3N) {
|
|
n_embd = 64;
|
|
n_head = 1;
|
|
n_ff = 96;
|
|
} else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
|
n_embd = 128;
|
|
n_head = 1;
|
|
n_ff = 192;
|
|
} else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
|
n_layer = 3;
|
|
} else if (arch == LLM_ARCH_CHAMELEON) {
|
|
n_vocab = 10240;
|
|
} else if (arch == LLM_ARCH_GEMMA3N) {
|
|
n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
|
|
}
|
|
|
|
const uint32_t n_embd_head = n_embd / n_head;
|
|
|
|
ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
|
|
ms.add_kv(LLM_KV_VOCAB_SIZE, n_vocab);
|
|
ms.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
|
|
ms.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
|
|
ms.add_kv(LLM_KV_FEATURES_LENGTH, n_embd);
|
|
ms.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
|
|
ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1));
|
|
|
|
if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
|
|
std::vector<uint32_t> n_ff_per_layer;
|
|
n_ff_per_layer.reserve(n_layer);
|
|
for (uint32_t il = 0; il < n_layer; il++) {
|
|
n_ff_per_layer.push_back(il <= 1 ? 0 : n_ff);
|
|
}
|
|
ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff_per_layer);
|
|
} else {
|
|
ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
|
|
}
|
|
|
|
ms.add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, false);
|
|
ms.add_kv(LLM_KV_LOGIT_SCALE, 1.0f);
|
|
ms.add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, uint32_t(64));
|
|
ms.add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, uint32_t(128));
|
|
ms.add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, uint32_t(2));
|
|
|
|
if (arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE ||
|
|
arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_LFM2 || arch == LLM_ARCH_LFM2MOE || arch == LLM_ARCH_KIMI_LINEAR) {
|
|
GGML_ASSERT(n_layer >= 2);
|
|
std::vector<uint32_t> n_head_per_layer;
|
|
n_head_per_layer.reserve(n_layer);
|
|
for (uint32_t il = 0; il < n_layer; il++) {
|
|
n_head_per_layer.push_back(il == 1 ? 0 : n_head);
|
|
}
|
|
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head_per_layer);
|
|
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_per_layer);
|
|
} else {
|
|
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
|
|
ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head);
|
|
}
|
|
|
|
ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
|
|
if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
|
|
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH, uint32_t(576));
|
|
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, uint32_t(512));
|
|
ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, uint32_t(64));
|
|
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, uint32_t(192));
|
|
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, uint32_t(128));
|
|
}
|
|
ms.add_kv(LLM_KV_ATTENTION_CLAMP_KQV, 1.0f);
|
|
ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, 1e-5f);
|
|
ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
|
|
ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, 1e-5f);
|
|
ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, uint32_t(8));
|
|
ms.add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, uint32_t(512));
|
|
ms.add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, uint32_t(512));
|
|
ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
|
|
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8);
|
|
|
|
if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
|
|
std::vector<uint32_t> pattern;
|
|
pattern.reserve(n_layer);
|
|
for (uint32_t il = 0; il < n_layer; il++) {
|
|
pattern.push_back(il % 2);
|
|
}
|
|
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern);
|
|
} else {
|
|
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(2));
|
|
}
|
|
|
|
ms.add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, uint32_t(1));
|
|
ms.add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, uint32_t(64));
|
|
ms.add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, uint32_t(8));
|
|
ms.add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, std::vector<uint32_t>({n_embd_head/4, n_embd_head/4, n_embd_head/4, n_embd_head/4}));
|
|
ms.add_kv(LLM_KV_TOKENIZER_MODEL, "no_vocab");
|
|
// ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT, n_embd);
|
|
// ms.add_kv(LLM_KV_DENSE_3_FEAT_IN, n_embd);
|
|
|
|
if (moe) {
|
|
ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff);
|
|
ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, uint32_t(2));
|
|
ms.add_kv(LLM_KV_EXPERT_COUNT, uint32_t(2));
|
|
ms.add_kv(LLM_KV_EXPERT_USED_COUNT, uint32_t(1));
|
|
ms.add_kv(LLM_KV_EXPERT_SHARED_COUNT, uint32_t(1));
|
|
ms.add_kv(LLM_KV_EXPERT_GATING_FUNC, uint32_t(2)); // sigmoid
|
|
ms.add_kv(LLM_KV_EXPERT_GROUP_SCALE, 1.0f);
|
|
ms.add_kv(LLM_KV_EXPERTS_PER_GROUP, uint32_t(1));
|
|
}
|
|
|
|
ms.add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, n_embd);
|
|
ms.add_kv(LLM_KV_POSNET_BLOCK_COUNT, n_layer);
|
|
ms.add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, n_embd);
|
|
ms.add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, n_layer);
|
|
ms.add_kv(LLM_KV_XIELU_ALPHA_N, 1.0f);
|
|
ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
|
|
ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
|
|
ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
|
|
ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
|
|
ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
|
|
ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
|
|
ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
|
|
ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
|
|
ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
|
|
ms.add_kv(LLM_KV_WKV_HEAD_SIZE, n_embd/n_head);
|
|
ms.add_kv(LLM_KV_SHORTCONV_L_CACHE, uint32_t(3));
|
|
|
|
for (uint32_t il = 0; il < n_layer; il++) {
|
|
ggml_tensor t;
|
|
memset(&t, 0, sizeof(ggml_tensor));
|
|
t.type = GGML_TYPE_F16;
|
|
ggml_format_name(&t, "conv%" PRIu32 "d.weight", il);
|
|
gguf_add_tensor(ms.gguf_ctx, &t);
|
|
ggml_format_name(&t, "posnet.%" PRIu32 ".conv1.weight", il);
|
|
gguf_add_tensor(ms.gguf_ctx, &t);
|
|
ggml_format_name(&t, "posnet.%" PRIu32 ".conv2.weight", il);
|
|
gguf_add_tensor(ms.gguf_ctx, &t);
|
|
ggml_format_name(&t, "convnext.%" PRIu32 ".dw.weight", il);
|
|
gguf_add_tensor(ms.gguf_ctx, &t);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
|
|
struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
|
|
llama_model_params model_params = llama_model_default_params();
|
|
std::vector<ggml_backend_dev_t> devs_copy = devs;
|
|
devs_copy.push_back(nullptr);
|
|
model_params.devices = devs_copy.data();
|
|
|
|
llama_context_params ctx_params = llama_context_default_params();
|
|
ctx_params.n_ctx = 0;
|
|
ctx_params.n_threads = 4;
|
|
ctx_params.n_threads_batch = 4;
|
|
|
|
size_t tmp = seed;
|
|
llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
|
|
if (!model) {
|
|
throw std::runtime_error("failed to create llama model");
|
|
}
|
|
llama_context_ptr lctx(llama_init_from_model(model.get(), ctx_params));
|
|
if (!lctx) {
|
|
throw std::runtime_error("failed to create llama context");
|
|
}
|
|
return std::make_pair(std::move(model), std::move(lctx));
|
|
}
|
|
|
|
static std::vector<float> get_logits(
|
|
llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens, bool encode = false) {
|
|
const uint32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
|
|
const uint32_t n_ctx = llama_n_ctx(lctx);
|
|
const uint32_t n_tokens = tokens.size();
|
|
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
|
|
GGML_ASSERT(n_tokens <= n_ctx);
|
|
for (uint32_t pos = 0; pos < n_tokens; pos++) {
|
|
common_batch_add(batch, tokens[pos], pos, {0}, true);
|
|
}
|
|
batch.n_tokens = n_tokens;
|
|
if (encode) {
|
|
if (llama_encode(lctx, batch)) {
|
|
llama_batch_free(batch);
|
|
throw std::runtime_error("failed to encode batch");
|
|
}
|
|
}
|
|
if (llama_decode(lctx, batch)) {
|
|
llama_batch_free(batch);
|
|
throw std::runtime_error("failed to decode batch");
|
|
}
|
|
|
|
std::vector<float> ret;
|
|
ret.reserve(n_tokens*n_vocab);
|
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
const float * logits_ith = llama_get_logits_ith(lctx, i);
|
|
for (uint32_t j = 0; j < n_vocab; j++) {
|
|
ret.push_back(logits_ith[j]);
|
|
}
|
|
}
|
|
llama_batch_free(batch);
|
|
return ret;
|
|
}
|
|
|
|
static bool moe_mandatory(const llm_arch arch) {
|
|
switch (arch) {
|
|
case LLM_ARCH_LLAMA4:
|
|
case LLM_ARCH_GROK:
|
|
case LLM_ARCH_QWEN2MOE:
|
|
case LLM_ARCH_QWEN3MOE:
|
|
case LLM_ARCH_QWEN3NEXT:
|
|
case LLM_ARCH_QWEN3VLMOE:
|
|
case LLM_ARCH_QWEN35MOE:
|
|
case LLM_ARCH_PHIMOE:
|
|
case LLM_ARCH_DBRX:
|
|
case LLM_ARCH_OLMOE:
|
|
case LLM_ARCH_ARCTIC:
|
|
case LLM_ARCH_DEEPSEEK:
|
|
case LLM_ARCH_DEEPSEEK2:
|
|
case LLM_ARCH_GLM4_MOE:
|
|
case LLM_ARCH_GLM_DSA:
|
|
case LLM_ARCH_EXAONE_MOE:
|
|
case LLM_ARCH_BAILINGMOE:
|
|
case LLM_ARCH_BAILINGMOE2:
|
|
case LLM_ARCH_DOTS1:
|
|
case LLM_ARCH_AFMOE:
|
|
case LLM_ARCH_ERNIE4_5:
|
|
case LLM_ARCH_ERNIE4_5_MOE:
|
|
case LLM_ARCH_HUNYUAN_MOE:
|
|
case LLM_ARCH_OPENAI_MOE:
|
|
case LLM_ARCH_LFM2MOE:
|
|
case LLM_ARCH_SMALLTHINKER:
|
|
case LLM_ARCH_LLADA_MOE:
|
|
case LLM_ARCH_GROVEMOE:
|
|
case LLM_ARCH_MINIMAX_M2:
|
|
case LLM_ARCH_RND1:
|
|
case LLM_ARCH_PADDLEOCR:
|
|
case LLM_ARCH_MIMO2:
|
|
case LLM_ARCH_KIMI_LINEAR:
|
|
case LLM_ARCH_STEP35:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static bool moe_implemented(const llm_arch arch) {
|
|
if (moe_mandatory(arch)) {
|
|
return true;
|
|
}
|
|
switch (arch) {
|
|
case LLM_ARCH_LLAMA:
|
|
case LLM_ARCH_REFACT:
|
|
case LLM_ARCH_MINICPM:
|
|
case LLM_ARCH_GRANITE:
|
|
case LLM_ARCH_GRANITE_MOE:
|
|
case LLM_ARCH_MISTRAL3:
|
|
case LLM_ARCH_LLAMA_EMBED:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
|
|
GGML_ABORT("llama_model_save_to_file is broken");
|
|
struct user_data_t {
|
|
struct {
|
|
ggml_log_callback callback;
|
|
void * user_data;
|
|
} original_logger;
|
|
ggml_log_level min_level; // prints below this log level go to debug log
|
|
};
|
|
user_data_t ud;
|
|
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
|
ud.min_level = log_level;
|
|
|
|
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
|
const user_data_t * ud = (const user_data_t *) user_data;
|
|
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
|
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
|
}, &ud);
|
|
|
|
for (const llm_arch & arch : llm_arch_all()) {
|
|
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
|
continue;
|
|
}
|
|
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
|
|
continue; // These models don't have usable implementations.
|
|
}
|
|
for (bool moe : {false, true}) {
|
|
if (moe && !moe_implemented(arch)) {
|
|
continue;
|
|
}
|
|
if (!moe && moe_mandatory(arch)) {
|
|
continue;
|
|
}
|
|
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
|
auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
|
const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
|
|
LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
|
|
llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
|
|
}
|
|
}
|
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
return 0;
|
|
}
|
|
|
|
static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
|
|
struct user_data_t {
|
|
struct {
|
|
ggml_log_callback callback;
|
|
void * user_data;
|
|
} original_logger;
|
|
ggml_log_level min_level; // prints below this log level go to debug log
|
|
};
|
|
user_data_t ud;
|
|
llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
|
|
ud.min_level = log_level;
|
|
|
|
llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
|
|
const user_data_t * ud = (const user_data_t *) user_data;
|
|
const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
|
ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
|
|
}, &ud);
|
|
|
|
const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
|
|
|
|
bool all_ok = true;
|
|
common_log_flush(common_log_main());
|
|
printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
|
|
printf("|---------------|------------------------------|------|--------|------|\n");
|
|
for (const llm_arch & arch : llm_arch_all()) {
|
|
if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
|
|
continue;
|
|
}
|
|
if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
|
|
continue; // These models don't have usable implementations.
|
|
}
|
|
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
|
|
continue; // FIXME CUDA backend crashes.
|
|
}
|
|
if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
|
|
continue; // FIXME Embedding (?) models produce inconsistent results.
|
|
}
|
|
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
|
|
continue; // FIXME RWKV models hang indefinitely.
|
|
}
|
|
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
|
|
arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
|
|
continue; // TODO vocab
|
|
}
|
|
if (arch == LLM_ARCH_PLM) {
|
|
continue; // TODO tensor shapes
|
|
}
|
|
|
|
// FIXME some models are segfaulting with WebGPU:
|
|
#ifdef GGML_USE_WEBGPU
|
|
if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
|
|
continue;
|
|
}
|
|
#endif // GGML_USE_WEBGPU
|
|
|
|
const bool encode = arch == LLM_ARCH_T5;
|
|
for (bool moe : {false, true}) {
|
|
if (moe && !moe_implemented(arch)) {
|
|
continue;
|
|
}
|
|
if (!moe && moe_mandatory(arch)) {
|
|
continue;
|
|
}
|
|
gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
|
|
auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
|
|
const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
|
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
|
continue;
|
|
}
|
|
auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
|
|
const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
|
|
const double nmse_val = nmse(logits_cpu, logits_dev);
|
|
const bool ok = nmse_val <= 1e-4;
|
|
all_ok = all_ok && ok;
|
|
char nmse_str[10];
|
|
snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
|
|
printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
|
|
moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
|
|
}
|
|
}
|
|
}
|
|
llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
|
|
return all_ok ? 0 : 1;
|
|
}
|
|
|
|
int main(int argc, char ** argv) {
|
|
// FIXME these tests are disabled in the CI for macOS-latest-cmake-arm64 because they are segfaulting
|
|
common_init();
|
|
std::random_device rd;
|
|
|
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
|
size_t seed = rd();
|
|
ggml_log_level log_level = GGML_LOG_LEVEL_ERROR;
|
|
std::string out;
|
|
|
|
for (int i = 1; i < argc; i++) {
|
|
if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--arch") == 0) {
|
|
if (i + 1 < argc) {
|
|
const std::string arch_name = argv[++i];
|
|
arch = llm_arch_from_string(arch_name);
|
|
if (arch == LLM_ARCH_UNKNOWN) {
|
|
LOG_ERR("%s: unkown LLM architecture: %s\n", __func__, arch_name.c_str());
|
|
return 1;
|
|
}
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
}
|
|
if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
|
|
if (i + 1 < argc) {
|
|
seed = std::stoull(argv[++i]);
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
}
|
|
if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
|
|
log_level = GGML_LOG_LEVEL_INFO;
|
|
continue;
|
|
}
|
|
if (strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "--out") == 0) {
|
|
if (i + 1 < argc) {
|
|
out = argv[++i];
|
|
} else {
|
|
usage(argv);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
try {
|
|
if (!out.empty()) {
|
|
return save_models(arch, seed, log_level, out);
|
|
}
|
|
return test_backends(arch, seed, log_level);
|
|
} catch (const std::exception & err) {
|
|
fprintf(stderr, "encountered runtime error: %s\n", err.what());
|
|
return -1;
|
|
}
|
|
}
|