diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 9d8e321ba0..42763b86f9 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -191,6 +191,7 @@ extern "C" { // write the entire context to a binary file GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta); // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 49afeacae3..407b514f71 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1520,16 +1520,25 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo return false; } + const bool success = gguf_write_to_file_ptr(ctx, file, only_meta); + if (!success) { + GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname); + } + + fclose(file); + return success; +} + +bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) { + GGML_ASSERT(file); + try { gguf_writer_file gw(file); gguf_write_out(ctx, gw, only_meta); } catch (const std::runtime_error& ex) { - GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what()); - fclose(file); + GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what()); return false; } - - fclose(file); return true; } diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 637be9b7ce..26864c18e9 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -1,5 +1,6 @@ #include "llama-model-saver.h" +#include "ggml.h" #include "gguf.h" #include "llama-arch.h" @@ -11,8 +12,33 @@ #include #include +bool llama_model_saver_supports_arch(llm_arch arch) { + switch (arch) { + case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN35: + case LLM_ARCH_QWEN35MOE: + case 
LLM_ARCH_PLAMO3: + case LLM_ARCH_GEMMA3: + case LLM_ARCH_GEMMA3N: + case LLM_ARCH_COHERE2: + case LLM_ARCH_OLMO2: + case LLM_ARCH_BITNET: + case LLM_ARCH_T5: + case LLM_ARCH_EXAONE_MOE: + case LLM_ARCH_AFMOE: + case LLM_ARCH_APERTUS: + case LLM_ARCH_MIMO2: + case LLM_ARCH_STEP35: + return false; + default: + return true; + } +} + llama_model_saver::llama_model_saver(const struct llama_model * model) : - gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {} + gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) { + GGML_ASSERT(llama_model_saver_supports_arch(model->arch)); +} llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) : gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {} @@ -383,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) { gguf_write_to_file(gguf_ctx, path_model.c_str(), false); } +void llama_model_saver::save(FILE * file) { + gguf_write_to_file_ptr(gguf_ctx, file, false); +} diff --git a/src/llama-model-saver.h b/src/llama-model-saver.h index 2b3541ce6c..36a715e2b6 100644 --- a/src/llama-model-saver.h +++ b/src/llama-model-saver.h @@ -6,6 +6,9 @@ #include +// FIXME temporary function for better error messages +bool llama_model_saver_supports_arch(llm_arch arch); + struct llama_model_saver { struct gguf_context * gguf_ctx = nullptr; const bool gguf_ctx_owned; @@ -37,4 +40,5 @@ struct llama_model_saver { void add_tensors_from_model(); void save(const std::string & path_model); + void save(FILE * file); }; diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 78ca95dcbd..ed3070dc4d 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -1125,12 +1125,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned GGML_ASSERT(file); #endif // _WIN32 - { - std::vector<int8_t> buf; - gguf_write_to_buf(gguf_ctx_0, buf, only_meta); - 
GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size()); - rewind(file); - } + gguf_write_to_file_ptr(gguf_ctx_0, file, only_meta); + rewind(file); struct ggml_context * ctx_1 = nullptr; struct gguf_init_params gguf_params = { diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 8ee0fe1e7c..add340d22c 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -424,8 +424,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg bool all_ok = true; common_log_flush(common_log_main()); - printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); - printf("|---------------|------------------------------|------|--------|------|\n"); + printf("|%15s|%30s|%16s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); + printf("|---------------|------------------------------|----------------|--------|------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } @@ -474,14 +474,57 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg continue; } auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev}); - const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); - const double nmse_val = nmse(logits_cpu, logits_dev); - const bool ok = nmse_val <= 1e-4; - all_ok = all_ok && ok; - char nmse_str[10]; - snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); - printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), - moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + std::string config_name = moe ? 
"MoE" : "Dense"; + { + const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); + const double nmse_val = nmse(logits_cpu, logits_dev); + const bool ok = nmse_val <= 1e-4; + all_ok = all_ok && ok; + char nmse_str[10]; + snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); + printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + } + if (llama_model_saver_supports_arch(arch)) { + FILE * file = tmpfile(); +#ifdef _WIN32 + if (!file) { + continue; + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get()); + ms.add_kv_from_model(); + ms.add_tensors_from_model(); + ms.save(file); + rewind(file); + llama_model_params model_params = llama_model_default_params(); + std::vector<ggml_backend_dev_t> devs_copy = {dev}; + devs_copy.push_back(nullptr); + model_params.devices = devs_copy.data(); + llama_model_ptr model_roundtrip(llama_model_load_from_file_ptr(file, model_params)); + GGML_ASSERT(model_roundtrip); + config_name += ",roundtrip"; + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = 0; + ctx_params.n_threads = 4; + ctx_params.n_threads_batch = 4; + llama_context_ptr lctx_roundtrip(llama_init_from_model(model_roundtrip.get(), ctx_params)); + if (!lctx_roundtrip) { + throw std::runtime_error("failed to create llama context"); + } + + const std::vector<float> logits_dev = get_logits(model_roundtrip.get(), lctx_roundtrip.get(), tokens, encode); + const double nmse_val = nmse(logits_cpu, logits_dev); + const bool ok = nmse_val <= 1e-4; + all_ok = all_ok && ok; + char nmse_str[10]; + snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); + printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), nmse_str, ok ? 
"\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + } } } }