diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062..02d5f221c0 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -77,6 +77,7 @@ extern "C" { }; GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); @@ -189,6 +190,7 @@ extern "C" { // // write the entire context to a binary file + GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta); GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 9256865595..0639db362e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph, // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); -GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & buf, bool only_meta); #endif // __cplusplus diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index cbeedf6c4b..ab3cc97486 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector & bu gguf_write_out(ctx, gw, only_meta); } +bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) { + GGML_ASSERT(file); + + try { + gguf_writer_file gw(file); + gguf_write_out(ctx, gw, only_meta); + } catch (const std::runtime_error& ex) { + GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what()); + return false; + } + return true; +} + bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { FILE * file = ggml_fopen(fname, "wb"); @@ -1516,17 +1533,13 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo return false; } - try { - gguf_writer_file gw(file); - gguf_write_out(ctx, gw, only_meta); - } catch (const std::runtime_error& ex) { - GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what()); - fclose(file); - return false; + const bool success = gguf_write_to_file_ptr(ctx, file, only_meta); + if (!success) { + GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname); } fclose(file); - return true; + return success; } size_t gguf_get_meta_size(const struct gguf_context * ctx) { diff --git a/include/llama.h b/include/llama.h index 6e72db7e3c..60e4b6b2ef 100644 --- a/include/llama.h +++ b/include/llama.h @@ -465,6 +465,11 @@ extern "C" { const char * path_model, struct llama_model_params params); + // Load a model from an open FILE pointer + LLAMA_API struct llama_model * llama_model_load_from_file_ptr( + FILE * file, + struct llama_model_params params); + // Load a model from multiple splits (support custom naming scheme) // The paths must be in the correct order LLAMA_API struct llama_model * llama_model_load_from_splits( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 84dc6d8f1b..322a66bc05 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -544,6 +544,10 @@ static 
std::set llm_get_tensor_names(llm_arch arch) { case LLM_ARCH_CLIP: return {}; case LLM_ARCH_LLAMA: + case LLM_ARCH_REFACT: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_DECI: case LLM_ARCH_MISTRAL3: case LLM_ARCH_LLAMA_EMBED: @@ -744,11 +748,9 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, }; - case LLM_ARCH_REFACT: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2VL: case LLM_ARCH_INTERNLM2: - case LLM_ARCH_GRANITE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_PADDLEOCR: case LLM_ARCH_SMOLLM3: @@ -759,6 +761,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, @@ -1232,29 +1235,6 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; - case LLM_ARCH_MINICPM: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ROPE_FREQS, - LLM_TENSOR_ROPE_FACTORS_LONG, - LLM_TENSOR_ROPE_FACTORS_SHORT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_GATE_EXP, - LLM_TENSOR_FFN_DOWN_EXP, - LLM_TENSOR_FFN_UP_EXP, - }; case LLM_ARCH_MINICPM3: return { LLM_TENSOR_TOKEN_EMBD, @@ -1442,6 +1422,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, @@ -1657,7 +1638,9 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -2061,30 +2044,12 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; - case LLM_ARCH_GRANITE_MOE: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_GATE_EXPS, - LLM_TENSOR_FFN_DOWN_EXPS, - LLM_TENSOR_FFN_UP_EXPS, - LLM_TENSOR_FFN_GATE_SHEXP, - LLM_TENSOR_FFN_DOWN_SHEXP, - LLM_TENSOR_FFN_UP_SHEXP, - }; case LLM_ARCH_GRANITE_HYBRID: return { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, @@ -2412,6 +2377,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_OUT, @@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const { } if (model_tensors.find(tensor) == model_tensors.end()) { - return LLM_TENSOR_NAMES.at(tensor); + const char * name = LLM_TENSOR_NAMES.at(tensor); + if (suffix != nullptr || bid != -1 || xid != -1) { + LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n", + __func__, name, suffix, bid, xid); + } + return name; } std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid); diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 
c03228e9ce..ccc29c1302 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -86,6 +86,14 @@ struct llama_file::impl { seek(0, SEEK_SET); } + impl(FILE * file) : owns_fp(false) { + fp = file; + fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp)); + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { LARGE_INTEGER li; li.QuadPart = 0; @@ -159,7 +167,7 @@ struct llama_file::impl { } ~impl() { - if (fp) { + if (fp && owns_fp) { std::fclose(fp); } } @@ -209,6 +217,13 @@ struct llama_file::impl { seek(0, SEEK_SET); } + impl(FILE * file) : fname("(file*)"), owns_fp(false) { + fp = file; + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { if (fd == -1) { long ret = std::ftell(fp); @@ -353,7 +368,7 @@ struct llama_file::impl { ~impl() { if (fd != -1) { close(fd); - } else { + } else if (owns_fp) { std::fclose(fp); } } @@ -369,10 +384,14 @@ struct llama_file::impl { FILE * fp{}; size_t size{}; + bool owns_fp = true; }; llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {} + +llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {} + llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 29ce4d2468..b7d5c61e95 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -15,6 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>; struct llama_file { llama_file(const char * fname, const char * mode, bool use_direct_io = false); + llama_file(FILE * file); ~llama_file(); size_t tell() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 413f34c226..2457a7ed4b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector<std::string> & splits, + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, @@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } + } else if (file != nullptr) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + metadata_ptr.reset(gguf_init_from_file_ptr(file, params)); + metadata = metadata_ptr.get(); + if (metadata == nullptr) { + throw std::runtime_error(format("%s: failed to load model from file pointer", __func__)); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + files.emplace_back(new llama_file(file)); + contexts.emplace_back(ctx); + + // Save tensors data offset info of the main file.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there are no duplicate tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur)); + } } else { get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); @@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader( fver = (enum llama_fver) gguf_get_version(metadata); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index ed5de729ca..7b3d6703c0 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,6 +125,7 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 6f6538aecc..26864c18e9 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -1,7 +1,9 @@ #include "llama-model-saver.h" +#include "ggml.h" #include "gguf.h" +#include "llama-arch.h" #include "llama.h" #include "llama-hparams.h" #include "llama-model.h" @@ -10,8 +12,33 @@ #include #include +bool llama_model_saver_supports_arch(llm_arch arch) { + switch (arch) { + case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN35: + case LLM_ARCH_QWEN35MOE: + case LLM_ARCH_PLAMO3: + case LLM_ARCH_GEMMA3: + case LLM_ARCH_GEMMA3N: + case LLM_ARCH_COHERE2: + case LLM_ARCH_OLMO2: + case LLM_ARCH_BITNET: + case LLM_ARCH_T5: + case LLM_ARCH_EXAONE_MOE: + case LLM_ARCH_AFMOE: + case LLM_ARCH_APERTUS: + case LLM_ARCH_MIMO2: + case LLM_ARCH_STEP35: + return false; + default: + return true; + } +} + llama_model_saver::llama_model_saver(const struct llama_model * model) : - gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {} + gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) { + GGML_ASSERT(llama_model_saver_supports_arch(model->arch)); +} llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) : gguf_ctx(gguf_ctx == nullptr ?
gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {} @@ -105,7 +132,10 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) { return; } if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) { - GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME + const std::string tensor_name = tensor->name; + GGML_ASSERT( + tensor_name == "rope_freqs.weight" || tensor_name == "rope_factors_long.weight" || + tensor_name == "rope_factors_short.weight"); // FIXME return; } gguf_add_tensor(gguf_ctx, tensor); @@ -127,6 +157,7 @@ void llama_model_saver::add_kv_from_model() { tokens[id] = token_data.text; scores[id] = token_data.score; + // FIXME should this be treated as flags? switch(token_data.attr) { case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break; case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break; @@ -134,6 +165,9 @@ void llama_model_saver::add_kv_from_model() { case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break; case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break; case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break; + // case LLAMA_TOKEN_ATTR_NORMALIZED: ??? + // case LLAMA_TOKEN_ATTR_LSTRIP: ??? + // case LLAMA_TOKEN_ATTR_RSTRIP: ??? case LLAMA_TOKEN_ATTR_UNDEFINED: default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break; } @@ -144,6 +178,19 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name()); // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???); // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???); + // add_kv(LLM_KV_GENERAL_FILE_TYPE, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_SEQUENCE, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_K, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_P, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIN_P, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TEMP, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, ???); add_kv(LLM_KV_GENERAL_NAME, model->name); // add_kv(LLM_KV_GENERAL_AUTHOR, ???); // add_kv(LLM_KV_GENERAL_VERSION, ???); @@ -163,17 +210,31 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); - add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_chexp); + add_kv(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp); + add_kv(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp); add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???); add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert); add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + add_kv(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups); + add_kv(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used); add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, 
hparams.expert_weights_scale); + add_kv(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm); + add_kv(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); + add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); + add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); + add_kv(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer); add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping); + add_kv(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping); add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping); add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm); add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers); @@ -181,6 +242,9 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim); add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count); + add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step); + // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); @@ -188,22 +252,39 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full); add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full); - add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); - add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); + add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay); + add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr); + add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix); + add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate); add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + // add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, ???); add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale); + add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length); + add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); + add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, 
hparams.indexer_n_head); + add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); + add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train; add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full); add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa); + add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections); add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); + add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); @@ -211,6 +292,10 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn); add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned); add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + add_kv(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor); + add_kv(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor); + add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast); + add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow); // TODO: implement split file support // add_kv(LLM_KV_SPLIT_NO, ???); @@ -221,8 +306,11 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + add_kv(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms); + add_kv(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); + add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model()); @@ -260,15 +348,39 @@ void llama_model_saver::add_kv_from_model() { // TODO: implement LoRA support // add_kv(LLM_KV_ADAPTER_TYPE, ???); // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???); + // add_kv(LLM_KV_ADAPTER_LORA_TASK_NAME, ???); + // add_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, ???); + // add_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, ???); + + add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); + add_kv(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); + + add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); + add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); + + add_kv(LLM_KV_CLASSIFIER_OUTPUT_LABELS, model->classifier_labels); + + add_kv(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); + + add_kv(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n); + add_kv(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p); + add_kv(LLM_KV_XIELU_BETA, hparams.xielu_beta); + add_kv(LLM_KV_XIELU_EPS, hparams.xielu_eps); // deprecated // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???); // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???); // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???); + + add_kv(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in); + add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out); + add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in); + add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out); } void llama_model_saver::add_tensors_from_model() { - if (std::string(model->output->name) != std::string(model->tok_embd->name)) { + if (model->output != nullptr && + std::string(model->output->name) != std::string(model->tok_embd->name)) { 
add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output } add_tensor(model->type_embd); @@ -297,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) { gguf_write_to_file(gguf_ctx, path_model.c_str(), false); } +void llama_model_saver::save(FILE * file) { + gguf_write_to_file_ptr(gguf_ctx, file, false); +} diff --git a/src/llama-model-saver.h b/src/llama-model-saver.h index 2b3541ce6c..36a715e2b6 100644 --- a/src/llama-model-saver.h +++ b/src/llama-model-saver.h @@ -6,6 +6,9 @@ #include +// FIXME temporary function for better error messages +bool llama_model_saver_supports_arch(llm_arch arch); + struct llama_model_saver { struct gguf_context * gguf_ctx = nullptr; const bool gguf_ctx_owned; @@ -37,4 +40,5 @@ struct llama_model_saver { void add_tensors_from_model(); void save(const std::string & path_model); + void save(FILE * file); }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f8caad2889..490e8f336b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1624,7 +1624,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // (optional) temperature tuning - used by mistral-large ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); - ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); + ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length? hparams.f_attn_temp_offset = 0.0f; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8e8ce23124..c414656e0a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 872e659edc..1810a59d8e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -828,7 +828,7 @@ int64_t llama_time_us(void) { // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, - const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + const std::string & fname, std::vector & splits, FILE * file, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -889,8 +889,24 @@ static struct 
llama_model * llama_model_load_from_file_impl( void * set_tensor_data_ud, const std::string & path_model, std::vector<std::string> & splits, + FILE * file, struct llama_model_params params) { - GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined"); + { + int n_sources_defined = 0; + if (metadata != nullptr) { + n_sources_defined++; + } + if (!path_model.empty()) { + n_sources_defined++; + } + if (file != nullptr) { + n_sources_defined++; + } + if (n_sources_defined != 1) { + LLAMA_LOG_ERROR("%s: exactly one of metadata, path_model, and file must be defined\n", __func__); + return nullptr; + } + } ggml_time_init(); if (!params.vocab_only && ggml_backend_reg_count() == 0) { @@ -1011,7 +1027,7 @@ static struct llama_model * llama_model_load_from_file_impl( props.memory_free/1024/1024); } - const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params); + const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -1037,7 +1053,7 @@ struct llama_model * llama_model_init_from_user( std::vector<std::string> splits = {}; params.use_mmap = false; params.use_extra_bufts = false; - return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); + return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params); } // deprecated struct llama_model * llama_load_model_from_file( @@ -1050,7 +1066,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector<std::string> splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params); } struct llama_model * llama_model_load_from_splits( @@ -1066,7 +1082,17 @@ struct llama_model * llama_model_load_from_splits( for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params); +} + +struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) { + if (!file) { + LLAMA_LOG_ERROR("%s: file is NULL\n", __func__); + return nullptr; + } + std::string path_model; + std::vector<std::string> splits = {}; + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 8ebd16ba82..ed3070dc4d 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -742,7 +742,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) { /*ctx =*/ hft >= offset_has_data ?
&ctx : nullptr, }; - struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params); if (expect_context_not_null(hft)) { printf("%s: - context_not_null: ", __func__); @@ -1125,19 +1125,15 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned GGML_ASSERT(file); #endif // _WIN32 - { - std::vector buf; - gguf_write_to_buf(gguf_ctx_0, buf, only_meta); - GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size()); - rewind(file); - } + gguf_write_to_file_ptr(gguf_ctx_0, file, only_meta); + rewind(file); struct ggml_context * ctx_1 = nullptr; struct gguf_init_params gguf_params = { /*no_alloc =*/ false, /*ctx =*/ only_meta ? nullptr : &ctx_1, }; - struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); printf("%s: same_version: ", __func__); if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index d51c09e99f..593d370e19 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -90,6 +90,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_embd = 64; n_head = 1; n_ff = 96; + n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR @@ -101,8 +102,6 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_layer = 3; } else if (arch == LLM_ARCH_CHAMELEON) { n_vocab = 10240; - } else if (arch == LLM_ARCH_GEMMA3N) { - n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } const uint32_t n_embd_head = n_embd / n_head; @@ -231,9 +230,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { return ret; } +static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/) { + return true; +} + static std::pair get_model_and_ctx( - struct gguf_context * gguf_ctx, const size_t seed, const std::vector & devs) { + struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector & devs) { + GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr)); llama_model_params model_params = llama_model_default_params(); + model_params.progress_callback = silent_model_load_progress; std::vector devs_copy = devs; devs_copy.push_back(nullptr); model_params.devices = devs_copy.data(); @@ -244,7 +249,9 @@ static std::pair get_model_and_ctx( ctx_params.n_threads_batch = 4; size_t tmp = seed; - llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params)); + llama_model_ptr model(gguf_ctx != nullptr ? 
+ llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params) : + llama_model_load_from_file_ptr(file, model_params)); if (!model) { throw std::runtime_error("failed to create llama model"); } @@ -351,7 +358,6 @@ static bool moe_implemented(const llm_arch arch) { } static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) { - GGML_ABORT("llama_model_save_to_file is broken"); struct user_data_t { struct { ggml_log_callback callback; @@ -376,6 +382,16 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) { continue; // These models don't have usable implementations. } + if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) { + continue; // FIXME + } + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE || + arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) { + continue; // TODO vocab + } + if (arch == LLM_ARCH_PLM) { + continue; // TODO tensor shapes + } for (bool moe : {false, true}) { if (moe && !moe_implemented(arch)) { continue; @@ -383,8 +399,12 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (!moe && moe_mandatory(arch)) { continue; } + if (!llama_model_saver_supports_arch(arch)) { + LOG_INF("%s: %s model (%s) is unsupported, skipping\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense"); + continue; + } gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe); - auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {}); + auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}); const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf"); LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str()); llama_model_save_to_file(model_and_ctx.first.get(), path.c_str()); @@ -416,8 +436,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg bool all_ok = true; common_log_flush(common_log_main()); - printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); - printf("|---------------|------------------------------|------|--------|------|\n"); + printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. 
CPU", "Roundtrip"); + printf("|---------------|------------------------------|------|---------------|---------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; @@ -458,22 +478,50 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg continue; } gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe); - auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {}); + auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}); const std::vector logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode); for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { continue; } - auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev}); + auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev}); + std::string config_name = moe ? "MoE" : "Dense"; const std::vector logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); const double nmse_val = nmse(logits_cpu, logits_dev); - const bool ok = nmse_val <= 1e-4; - all_ok = all_ok && ok; char nmse_str[10]; snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); - printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), - moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + std::string status_nmse = "\033[1;32mOK\033[0m"; + if (nmse_val > 1e-4) { + all_ok = false; + status_nmse = "\033[1;31mFAIL\033[0m"; + } + + std::string status_roundtrip = "\033[1;33mSKIP\033[0m"; + FILE * file = tmpfile(); // Can be null on Windows without administrator privileges. + if (file != nullptr && llama_model_saver_supports_arch(arch)) { + llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get()); + ms.add_kv_from_model(); + ms.add_tensors_from_model(); + ms.save(file); + rewind(file); + + auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev}); + const std::vector logits_roundtrip = get_logits( + model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode); + status_roundtrip = "\033[1;32mOK\033[0m"; + GGML_ASSERT(logits_roundtrip.size() == logits_dev.size()); + for (size_t i = 0; i < logits_roundtrip.size(); i++) { + if (logits_roundtrip[i] != logits_dev[i]) { + all_ok = false; + status_roundtrip = "\033[1;31mFAIL\033[0m"; + break; + } + } + } + + printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str()); } } }
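Usage sketch (not part of this patch): the new `gguf_write_to_file_ptr()` / `gguf_init_from_file_ptr()` entry points operate on a caller-owned `FILE *`, much like `tests/test-gguf.cpp` uses `tmpfile()` above; the path-based wrappers still call `fclose()` themselves after delegating to the `*_ptr` variants. The helper name `roundtrip_meta` below is made up for illustration and error handling is kept minimal.

```c
#include <stdio.h>

#include "gguf.h"

// Round-trip GGUF metadata through a temporary stream using the FILE*-based API.
// The caller keeps ownership of the FILE* and is responsible for closing it.
static bool roundtrip_meta(const struct gguf_context * ctx_in) {
    FILE * file = tmpfile(); // can be NULL on Windows without administrator privileges
    if (file == NULL) {
        return false;
    }

    if (!gguf_write_to_file_ptr(ctx_in, file, /*only_meta =*/ true)) {
        fclose(file);
        return false;
    }
    rewind(file); // reading starts at the current stream position

    struct gguf_init_params params = {
        /*no_alloc =*/ true,
        /*ctx      =*/ NULL,
    };
    struct gguf_context * ctx_out = gguf_init_from_file_ptr(file, params);
    fclose(file);

    if (ctx_out == NULL) {
        return false;
    }
    gguf_free(ctx_out);
    return true;
}
```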
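Similarly, a minimal sketch of loading a model from an already-open stream via `llama_model_load_from_file_ptr()`. The `"model.gguf"` path is a placeholder; the snippet assumes backends are already registered (the loader errors out when `ggml_backend_reg_count()` is 0, see above). Because `llama_file(FILE *)` sets `owns_fp = false`, closing the stream remains the caller's job.

```c
#include <stdio.h>

#include "llama.h"

int main(void) {
    FILE * file = fopen("model.gguf", "rb"); // placeholder path; any seekable stream works
    if (file == NULL) {
        return 1;
    }

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_file_ptr(file, mparams);
    if (model == NULL) {
        fclose(file);
        return 1;
    }

    // ... create a llama_context, run inference, etc. ...

    llama_model_free(model);
    fclose(file); // the llama_file wrapper does not take ownership of the stream
    return 0;
}
```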