From 4101758ab64bb3696cd8bc784f95e34a976543b4 Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Wed, 11 Mar 2026 19:17:31 +0530 Subject: [PATCH 01/12] llama : add fd-based model loading via llama_model_load_from_fd --- ggml/include/gguf.h | 1 + ggml/src/gguf.cpp | 32 ++++++++++++++++++++++++ include/llama.h | 4 +++ src/llama-mmap.cpp | 26 ++++++++++++++++++++ src/llama-mmap.h | 3 ++- src/llama-model-loader.cpp | 33 ++++++++++++++++++++++++- src/llama-model-loader.h | 1 + src/llama-quant.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++++++------ tests/CMakeLists.txt | 1 + tests/test-model-load-fd.cpp | 47 ++++++++++++++++++++++++++++++++++++ 11 files changed, 171 insertions(+), 10 deletions(-) create mode 100644 tests/test-model-load-fd.cpp diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062..bd12997372 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,6 +78,7 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index cbeedf6c4b..8eea785404 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,6 +15,10 @@ #include #include +#ifndef _WIN32 +#include +#endif + #define GGUF_MAX_STRING_LENGTH (1024*1024*1024) #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024) @@ -853,6 +857,34 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } +#ifndef _WIN32 +struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { + const int fd_duped = dup(fd); + if (fd_duped < 0) { + GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno)); + return nullptr; + } + + FILE * file = fdopen(fd_duped, "rb"); + if (!file) { + close(fd_duped); + GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno)); + return nullptr; + } + + struct gguf_context * result = gguf_init_from_file_impl(file, params); + fclose(file); + return result; +} +#else +struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { + GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); + GGML_UNUSED(fd); + GGML_UNUSED(params); + return nullptr; +} +#endif + void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { return; diff --git a/include/llama.h b/include/llama.h index 6e72db7e3c..1864c53b88 100644 --- a/include/llama.h +++ b/include/llama.h @@ -465,6 +465,10 @@ extern "C" { const char * path_model, struct llama_model_params params); + // Load a model from a POSIX file descriptor + // Not supported on Windows + LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params); + // Load a model from multiple splits (support custom naming scheme) // The paths must be in the correct order LLAMA_API struct llama_model * llama_model_load_from_splits( diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index c03228e9ce..5ea00d3fa7 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -86,6 +86,10 @@ struct llama_file::impl { seek(0, SEEK_SET); } + impl(int /*fd_src*/) { + throw std::runtime_error("fd-based loading is not supported on Windows"); + } + size_t tell() const { LARGE_INTEGER li; li.QuadPart = 0; @@ -209,6 +213,25 @@ struct 
llama_file::impl { seek(0, SEEK_SET); } + impl(int fd_src) : fname("(fd:" + std::to_string(fd_src) + ")") { + init_from_fd(fd_src); + } + + void init_from_fd(int fd_src) { + const int fd_duped = dup(fd_src); + if (fd_duped < 0) { + throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno))); + } + fp = fdopen(fd_duped, "rb"); + if (!fp) { + close(fd_duped); + throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { if (fd == -1) { long ret = std::ftell(fp); @@ -373,6 +396,9 @@ struct llama_file::impl { llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : pimpl(std::make_unique(fname, mode, use_direct_io)) {} + +llama_file::llama_file(int fd) : pimpl(std::make_unique(fd)) {} + llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 29ce4d2468..2d1eac91a3 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -15,12 +15,13 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode, bool use_direct_io = false); + llama_file(int fd); ~llama_file(); size_t tell() const; size_t size() const; - int file_id() const; // fileno overload + int file_id() const; void seek(size_t offset, int whence) const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 413f34c226..1b38ee08a6 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -511,6 +511,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, + int fd, bool use_mmap, bool use_direct_io, bool check_tensors, @@ -658,6 +659,36 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } + } else if (fd >= 0) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + metadata_ptr.reset(gguf_init_from_fd(fd, params)); + metadata = metadata_ptr.get(); + if (metadata == nullptr) { + throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + contexts.emplace_back(ctx); + files.emplace_back(new llama_file(fd)); + + // Save tensors data offset info of the main file. 
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
+        }
     } else {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
@@ -669,7 +700,7 @@ llama_model_loader::llama_model_loader(
     fver = (enum llama_fver) gguf_get_version(metadata);
     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+            __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver));
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index ed5de729ca..6e5a5a4712 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -125,6 +125,7 @@ struct llama_model_loader {
             void * set_tensor_data_ud,
             const std::string & fname,
             std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+            int fd,
             bool use_mmap,
             bool use_direct_io,
             bool check_tensors,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 8e8ce23124..d047944dd6 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::vector<std::string> splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index 872e659edc..daf3c3bd8d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -828,7 +828,7 @@ int64_t llama_time_us(void) {
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
-        const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+        const std::string & fname, std::vector<std::string> & splits, int fd, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
     model.t_start_us = tm.t_start_us;
     try {
-        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
+        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io,
                               params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info(); @@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl( void * set_tensor_data_ud, const std::string & path_model, std::vector & splits, + int fd, struct llama_model_params params) { - GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined"); + if (metadata == nullptr && path_model.empty() && fd < 0) { + LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); + return nullptr; + } ggml_time_init(); if (!params.vocab_only && ggml_backend_reg_count() == 0) { @@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl( props.memory_free/1024/1024); } - const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params); + const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user( std::vector splits = {}; params.use_mmap = false; params.use_extra_bufts = false; - return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); + return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params); } // deprecated struct llama_model * llama_load_model_from_file( @@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params); } struct llama_model * llama_model_load_from_splits( @@ -1066,7 +1070,20 @@ struct llama_model * llama_model_load_from_splits( for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params); +} + +struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) { +#ifdef _WIN32 + LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); + GGML_UNUSED(fd); + GGML_UNUSED(params); + return nullptr; +#else + std::string path_model; + std::vector splits = {}; + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); +#endif } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9582164b58..c333fe7e40 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -240,6 +240,7 @@ llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-model-load-fd.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") llama_build_and_test(test-backend-sampler.cpp LABEL "model") diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp new file mode 100644 index 0000000000..d5102942d0 --- /dev/null +++ b/tests/test-model-load-fd.cpp @@ -0,0 +1,47 @@ +#include "llama.h" +#include "get-model.h" + +#include +#include + +#ifdef _WIN32 +int main(int /*argc*/, char ** /*argv*/) 
{ + fprintf(stderr, "skipping on Windows\n"); + return EXIT_SUCCESS; +} +#else +# include +# include + +int main(int argc, char ** argv) { + auto * model_path = get_model_or_exit(argc, argv); + + llama_backend_init(); + + const int fd = open(model_path, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", model_path); + return EXIT_FAILURE; + } + + auto params = llama_model_default_params(); + params.use_mmap = true; + params.vocab_only = true; + + struct llama_model * model = llama_model_load_from_fd(fd, params); + close(fd); + + if (model == nullptr) { + fprintf(stderr, "load from fd failed\n"); + return EXIT_FAILURE; + } + + const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + fprintf(stderr, "loaded %d tokens from fd\n", n_vocab); + + llama_model_free(model); + llama_backend_free(); + + return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} +#endif From 2c3223177df02267400474f0e109c4deb8ab8bee Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 00:44:33 +0530 Subject: [PATCH 02/12] llama : address review feedback for fd-based model loading --- ggml/include/gguf.h | 2 +- ggml/src/gguf.cpp | 29 ++--------------------------- src/llama-mmap.cpp | 4 ++-- src/llama-model-loader.cpp | 20 ++++++++++++++++++-- src/llama.cpp | 2 +- tests/test-model-load-fd.cpp | 2 +- 6 files changed, 25 insertions(+), 34 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index bd12997372..9d8e321ba0 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,7 +78,7 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 8eea785404..bf28dabb06 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,10 +15,6 @@ #include #include -#ifndef _WIN32 -#include -#endif - #define GGUF_MAX_STRING_LENGTH (1024*1024*1024) #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024) @@ -857,33 +853,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } -#ifndef _WIN32 -struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { - const int fd_duped = dup(fd); - if (fd_duped < 0) { - GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno)); - return nullptr; - } - - FILE * file = fdopen(fd_duped, "rb"); +struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) { if (!file) { - close(fd_duped); - GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno)); return nullptr; } - - struct gguf_context * result = gguf_init_from_file_impl(file, params); - fclose(file); - return result; + return gguf_init_from_file_impl(file, params); } -#else -struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { - GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); - GGML_UNUSED(fd); - GGML_UNUSED(params); - return nullptr; -} -#endif void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 5ea00d3fa7..706d72f1e0 100644 --- 
a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -220,12 +220,12 @@ struct llama_file::impl { void init_from_fd(int fd_src) { const int fd_duped = dup(fd_src); if (fd_duped < 0) { - throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno))); + throw std::runtime_error(format("failed to dup fd %d: %s", fd_src, strerror(errno))); } fp = fdopen(fd_duped, "rb"); if (!fp) { close(fd_duped); - throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno))); + throw std::runtime_error(format("failed to fdopen fd %d: %s", fd_src, strerror(errno))); } seek(0, SEEK_END); size = tell(); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1b38ee08a6..4d09f5ebbf 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -13,6 +13,10 @@ #include #include +#ifndef _WIN32 +#include +#endif // _WIN32 + static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -660,13 +664,25 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } } else if (fd >= 0) { + const int fd_duped = dup(fd); + if (fd_duped < 0) { + throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno))); + } + + FILE * f = fdopen(fd_duped, "rb"); + if (!f) { + close(fd_duped); + throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno))); + } + struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - metadata_ptr.reset(gguf_init_from_fd(fd, params)); + metadata_ptr.reset(gguf_init_from_file_ptr(f, params)); + fclose(f); metadata = metadata_ptr.get(); if (metadata == nullptr) { throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); @@ -675,8 +691,8 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - contexts.emplace_back(ctx); files.emplace_back(new llama_file(fd)); + contexts.emplace_back(ctx); // Save tensors data offset info of the main file. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { diff --git a/src/llama.cpp b/src/llama.cpp index daf3c3bd8d..c40d5c9d51 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1083,7 +1083,7 @@ struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params std::string path_model; std::vector splits = {}; return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); -#endif +#endif // _WIN32 } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp index d5102942d0..dd982ba907 100644 --- a/tests/test-model-load-fd.cpp +++ b/tests/test-model-load-fd.cpp @@ -44,4 +44,4 @@ int main(int argc, char ** argv) { return n_vocab > 0 ? 
EXIT_SUCCESS : EXIT_FAILURE; } -#endif +#endif // _WIN32 From c44d34ee736aa060900902af23bd09b52baac8f6 Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 01:20:50 +0530 Subject: [PATCH 03/12] llama : use FILE pointer instead of fd in public API --- include/llama.h | 5 ++--- src/llama-model-loader.cpp | 28 ++++++---------------------- src/llama-model-loader.h | 2 +- src/llama-quant.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++----------------- tests/test-model-load-fd.cpp | 15 +++++++++++---- 6 files changed, 35 insertions(+), 48 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1864c53b88..5c3c50c132 100644 --- a/include/llama.h +++ b/include/llama.h @@ -465,9 +465,8 @@ extern "C" { const char * path_model, struct llama_model_params params); - // Load a model from a POSIX file descriptor - // Not supported on Windows - LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params); + // Load a model from an open FILE pointer + LLAMA_API struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params); // Load a model from multiple splits (support custom naming scheme) // The paths must be in the correct order diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 4d09f5ebbf..d29207f735 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -13,10 +13,6 @@ #include #include -#ifndef _WIN32 -#include -#endif // _WIN32 - static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -515,7 +511,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, - int fd, + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, @@ -663,35 +659,23 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } - } else if (fd >= 0) { - const int fd_duped = dup(fd); - if (fd_duped < 0) { - throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno))); - } - - FILE * f = fdopen(fd_duped, "rb"); - if (!f) { - close(fd_duped); - throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno))); - } - + } else if (file) { struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - metadata_ptr.reset(gguf_init_from_file_ptr(f, params)); - fclose(f); + metadata_ptr.reset(gguf_init_from_file_ptr(file, params)); metadata = metadata_ptr.get(); if (metadata == nullptr) { - throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); + throw std::runtime_error(format("%s: failed to load model from file pointer", __func__)); } get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fd)); + files.emplace_back(new llama_file(fileno(file))); contexts.emplace_back(ctx); // Save tensors data offset info of the main file. @@ -716,7 +700,7 @@ llama_model_loader::llama_model_loader( fver = (enum llama_fver) gguf_get_version(metadata); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, fname.empty() ? 
"(file*)" : fname.c_str(), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 6e5a5a4712..7b3d6703c0 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,7 +125,7 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme - int fd, + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d047944dd6..c414656e0a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index c40d5c9d51..d35fb2cbe6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -828,7 +828,7 @@ int64_t llama_time_us(void) { // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, - const std::string & fname, std::vector & splits, int fd, llama_model & model, llama_model_params & params) { + const std::string & fname, std::vector & splits, FILE * file, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -889,9 +889,9 @@ static struct llama_model * llama_model_load_from_file_impl( void * set_tensor_data_ud, const std::string & path_model, std::vector & splits, - int fd, + FILE * file, struct llama_model_params params) { - if (metadata == nullptr && path_model.empty() && fd < 0) { + if (metadata == nullptr && path_model.empty() && !file) { LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); return nullptr; } @@ -1015,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl( props.memory_free/1024/1024); } - const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params); + const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -1041,7 +1041,7 @@ struct llama_model * llama_model_init_from_user( 
std::vector splits = {}; params.use_mmap = false; params.use_extra_bufts = false; - return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params); } // deprecated struct llama_model * llama_load_model_from_file( @@ -1054,7 +1054,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params); } struct llama_model * llama_model_load_from_splits( @@ -1070,20 +1070,17 @@ struct llama_model * llama_model_load_from_splits( for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params); } -struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) { -#ifdef _WIN32 - LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); - GGML_UNUSED(fd); - GGML_UNUSED(params); - return nullptr; -#else +struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) { + if (!file) { + LLAMA_LOG_ERROR("%s: file is NULL\n", __func__); + return nullptr; + } std::string path_model; std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); -#endif // _WIN32 + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp index dd982ba907..b7ff237ee5 100644 --- a/tests/test-model-load-fd.cpp +++ b/tests/test-model-load-fd.cpp @@ -24,20 +24,27 @@ int main(int argc, char ** argv) { return EXIT_FAILURE; } + FILE * f = fdopen(dup(fd), "rb"); + close(fd); + if (!f) { + fprintf(stderr, "failed to fdopen\n"); + return EXIT_FAILURE; + } + auto params = llama_model_default_params(); params.use_mmap = true; params.vocab_only = true; - struct llama_model * model = llama_model_load_from_fd(fd, params); - close(fd); + struct llama_model * model = llama_model_load_from_file_ptr(f, params); + fclose(f); if (model == nullptr) { - fprintf(stderr, "load from fd failed\n"); + fprintf(stderr, "load from file pointer failed\n"); return EXIT_FAILURE; } const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - fprintf(stderr, "loaded %d tokens from fd\n", n_vocab); + fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab); llama_model_free(model); llama_backend_free(); From 6de1857936736a394a6c229c8e9584c1aaf69119 Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 21:40:01 +0530 Subject: [PATCH 04/12] llama : use FILE pointer consistently, address review feedback --- ggml/src/ggml-impl.h | 1 - ggml/src/gguf.cpp | 15 ++++++--------- include/llama.h | 4 +++- src/llama-mmap.cpp | 31 ++++++++++++------------------- src/llama-mmap.h | 2 +- src/llama-model-loader.cpp | 2 +- tests/test-gguf.cpp | 4 ++-- 7 files changed, 25 
insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 9256865595..0639db362e 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -773,6 +773,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph, // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); -GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & buf, bool only_meta); #endif // __cplusplus diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index bf28dabb06..49afeacae3 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector(fname, mode, use_direct_io)) {} -llama_file::llama_file(int fd) : pimpl(std::make_unique(fd)) {} +llama_file::llama_file(FILE * file) : pimpl(std::make_unique(file)) {} llama_file::~llama_file() = default; diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 2d1eac91a3..32fab23119 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -15,7 +15,7 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode, bool use_direct_io = false); - llama_file(int fd); + llama_file(FILE * file); ~llama_file(); size_t tell() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index d29207f735..3ad8a51d9b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -675,7 +675,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fileno(file))); + files.emplace_back(new llama_file(file)); contexts.emplace_back(ctx); // Save tensors data offset info of the main file. diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 8ebd16ba82..78ca95dcbd 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -742,7 +742,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) { /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, }; - struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params); if (expect_context_not_null(hft)) { printf("%s: - context_not_null: ", __func__); @@ -1137,7 +1137,7 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned /*no_alloc =*/ false, /*ctx =*/ only_meta ? 
nullptr : &ctx_1, }; - struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); printf("%s: same_version: ", __func__); if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { From f76e53108c14d81a42f089b52d7a7625bf7d95f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 22 Mar 2026 20:51:41 +0100 Subject: [PATCH 05/12] fixup --- src/llama-mmap.h | 2 +- src/llama-model-loader.cpp | 2 +- src/llama.cpp | 18 ++++++++++-- tests/CMakeLists.txt | 1 - tests/test-model-load-fd.cpp | 54 ------------------------------------ 5 files changed, 17 insertions(+), 60 deletions(-) delete mode 100644 tests/test-model-load-fd.cpp diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 32fab23119..b7d5c61e95 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -21,7 +21,7 @@ struct llama_file { size_t tell() const; size_t size() const; - int file_id() const; + int file_id() const; // fileno overload void seek(size_t offset, int whence) const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 3ad8a51d9b..2457a7ed4b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -659,7 +659,7 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } - } else if (file) { + } else if (file != nullptr) { struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, diff --git a/src/llama.cpp b/src/llama.cpp index d35fb2cbe6..1810a59d8e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -891,9 +891,21 @@ static struct llama_model * llama_model_load_from_file_impl( std::vector & splits, FILE * file, struct llama_model_params params) { - if (metadata == nullptr && path_model.empty() && !file) { - LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); - return nullptr; + { + int n_sources_defined = 0; + if (metadata != nullptr) { + n_sources_defined++; + } + if (!path_model.empty()) { + n_sources_defined++; + } + if (file != nullptr) { + n_sources_defined++; + } + if (n_sources_defined != 1) { + LLAMA_LOG_ERROR("%s: exactly one out metadata, path_model, and file must be defined\n", __func__); + return nullptr; + } } ggml_time_init(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c333fe7e40..9582164b58 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -240,7 +240,6 @@ llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") -llama_build_and_test(test-model-load-fd.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") llama_build_and_test(test-backend-sampler.cpp LABEL "model") diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp deleted file mode 100644 index b7ff237ee5..0000000000 --- a/tests/test-model-load-fd.cpp +++ /dev/null @@ -1,54 +0,0 @@ -#include "llama.h" -#include "get-model.h" - -#include -#include - -#ifdef _WIN32 -int main(int /*argc*/, char ** /*argv*/) { - fprintf(stderr, "skipping on Windows\n"); - return EXIT_SUCCESS; -} -#else -# include -# include - -int main(int argc, char ** argv) { - auto * model_path = get_model_or_exit(argc, argv); - - llama_backend_init(); - - const int fd = open(model_path, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "failed to open %s\n", model_path); - return EXIT_FAILURE; - } - - FILE * f = fdopen(dup(fd), "rb"); - close(fd); - 
if (!f) { - fprintf(stderr, "failed to fdopen\n"); - return EXIT_FAILURE; - } - - auto params = llama_model_default_params(); - params.use_mmap = true; - params.vocab_only = true; - - struct llama_model * model = llama_model_load_from_file_ptr(f, params); - fclose(f); - - if (model == nullptr) { - fprintf(stderr, "load from file pointer failed\n"); - return EXIT_FAILURE; - } - - const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab); - - llama_model_free(model); - llama_backend_free(); - - return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE; -} -#endif // _WIN32 From e0ee16ce77c3a4d8526b804b2a3b1fbff66a6ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 8 Mar 2026 21:07:53 +0100 Subject: [PATCH 06/12] fix tensor names --- src/llama-arch.cpp | 61 ++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 45 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 84dc6d8f1b..f6084fa84a 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -544,6 +544,10 @@ static std::set llm_get_tensor_names(llm_arch arch) { case LLM_ARCH_CLIP: return {}; case LLM_ARCH_LLAMA: + case LLM_ARCH_REFACT: + case LLM_ARCH_MINICPM: + case LLM_ARCH_GRANITE: + case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_DECI: case LLM_ARCH_MISTRAL3: case LLM_ARCH_LLAMA_EMBED: @@ -744,11 +748,9 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, }; - case LLM_ARCH_REFACT: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2VL: case LLM_ARCH_INTERNLM2: - case LLM_ARCH_GRANITE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_PADDLEOCR: case LLM_ARCH_SMOLLM3: @@ -759,6 +761,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, @@ -1232,29 +1235,6 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; - case LLM_ARCH_MINICPM: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ROPE_FREQS, - LLM_TENSOR_ROPE_FACTORS_LONG, - LLM_TENSOR_ROPE_FACTORS_SHORT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_ROT_EMBD, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_FFN_GATE_EXP, - LLM_TENSOR_FFN_DOWN_EXP, - LLM_TENSOR_FFN_UP_EXP, - }; case LLM_ARCH_MINICPM3: return { LLM_TENSOR_TOKEN_EMBD, @@ -1442,6 +1422,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, @@ -1657,7 +1638,9 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -2061,30 +2044,12 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; - case LLM_ARCH_GRANITE_MOE: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_FFN_NORM, - LLM_TENSOR_FFN_GATE_INP, - 
LLM_TENSOR_FFN_GATE_EXPS, - LLM_TENSOR_FFN_DOWN_EXPS, - LLM_TENSOR_FFN_UP_EXPS, - LLM_TENSOR_FFN_GATE_SHEXP, - LLM_TENSOR_FFN_DOWN_SHEXP, - LLM_TENSOR_FFN_UP_SHEXP, - }; case LLM_ARCH_GRANITE_HYBRID: return { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, @@ -2412,6 +2377,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_OUT, @@ -2789,7 +2755,12 @@ std::string LLM_TN_IMPL::str() const { } if (model_tensors.find(tensor) == model_tensors.end()) { - return LLM_TENSOR_NAMES.at(tensor); + const char * name = LLM_TENSOR_NAMES.at(tensor); + if (suffix != nullptr || bid != -1 || xid != -1) { + LLAMA_LOG_ERROR("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n", + __func__, name, suffix, bid, xid); + } + return name; } std::string name = ::format(LLM_TENSOR_NAMES.at(tensor), bid, xid); From e8e2f634e7df8f892e8f1b73fae9c7c5979b60e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 13 Mar 2026 12:32:05 +0100 Subject: [PATCH 07/12] fix llama-model-saver --- src/llama-model-saver.cpp | 96 ++++++++++++++++++++++++++++++++++++-- src/llama-model.cpp | 4 +- tests/test-llama-archs.cpp | 14 ++++-- 3 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 6f6538aecc..637be9b7ce 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -2,6 +2,7 @@ #include "gguf.h" +#include "llama-arch.h" #include "llama.h" #include "llama-hparams.h" #include "llama-model.h" @@ -105,7 +106,10 @@ void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) { return; } if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) { - GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME + const std::string tensor_name = tensor->name; + GGML_ASSERT( + tensor_name == "rope_freqs.weight" || tensor_name == "rope_factors_long.weight" || + tensor_name == "rope_factors_short.weight"); // FIXME return; } gguf_add_tensor(gguf_ctx, tensor); @@ -127,6 +131,7 @@ void llama_model_saver::add_kv_from_model() { tokens[id] = token_data.text; scores[id] = token_data.score; + // FIXME should this be treated as flags? switch(token_data.attr) { case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break; case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break; @@ -134,6 +139,9 @@ void llama_model_saver::add_kv_from_model() { case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break; case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break; case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break; + // case LLAMA_TOKEN_ATTR_NORMALIZED: ??? + // case LLAMA_TOKEN_ATTR_LSTRIP: ??? + // case LLAMA_TOKEN_ATTR_RSTRIP: ??? 
case LLAMA_TOKEN_ATTR_UNDEFINED: default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break; } @@ -144,6 +152,19 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name()); // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???); // add_kv(LLM_KV_GENERAL_ALIGNMENT, ???); + // add_kv(LLM_KV_GENERAL_FILE_TYPE, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_SEQUENCE, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_K, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TOP_P, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIN_P, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_PROBABILITY, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_XTC_THRESHOLD, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_TEMP, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_LAST_N, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_PENALTY_REPEAT, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_TAU, ???); + // add_kv(LLM_KV_GENERAL_SAMPLING_MIROSTAT_ETA, ???); add_kv(LLM_KV_GENERAL_NAME, model->name); // add_kv(LLM_KV_GENERAL_AUTHOR, ???); // add_kv(LLM_KV_GENERAL_VERSION, ???); @@ -163,17 +184,31 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); - add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_chexp); + add_kv(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp); + add_kv(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp); add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); // add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???); add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert); add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + add_kv(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups); + add_kv(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used); add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + add_kv(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm); + add_kv(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); + add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); + add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id); + add_kv(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer); add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping); + add_kv(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping); add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping); add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm); add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers); @@ -181,6 +216,9 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim); add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count); + 
add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step); + // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); @@ -188,22 +226,39 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full); add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full); - add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); - add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); + add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + add_kv(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay); + add_kv(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr); + add_kv(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix); + add_kv(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate); add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts); add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + // add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, ???); add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); + add_kv(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale); + add_kv(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length); + add_kv(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); + add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); + add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); + add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 
0.0f : 1.0f/hparams.rope_freq_scale_train; add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full); add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa); + add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections); add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); + add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor); @@ -211,6 +266,10 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn); add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned); add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul); + add_kv(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor); + add_kv(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor); + add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast); + add_kv(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow); // TODO: implement split file support // add_kv(LLM_KV_SPLIT_NO, ???); @@ -221,8 +280,11 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + add_kv(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms); + add_kv(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); + add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model()); @@ -260,15 +322,39 @@ void llama_model_saver::add_kv_from_model() { // TODO: implement LoRA support // add_kv(LLM_KV_ADAPTER_TYPE, ???); // add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???); + // add_kv(LLM_KV_ADAPTER_LORA_TASK_NAME, ???); + // add_kv(LLM_KV_ADAPTER_LORA_PROMPT_PREFIX, ???); + // add_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS, ???); + + add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); + add_kv(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); + + add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); + add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); + + add_kv(LLM_KV_CLASSIFIER_OUTPUT_LABELS, model->classifier_labels); + + add_kv(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); + + add_kv(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n); + add_kv(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p); + add_kv(LLM_KV_XIELU_BETA, hparams.xielu_beta); + add_kv(LLM_KV_XIELU_EPS, hparams.xielu_eps); // deprecated // add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???); // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???); // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???); + + add_kv(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in); + add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out); + add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in); + add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out); } void llama_model_saver::add_tensors_from_model() { - if (std::string(model->output->name) != std::string(model->tok_embd->name)) { + if (model->output != nullptr && + std::string(model->output->name) != std::string(model->tok_embd->name)) { add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output } add_tensor(model->type_embd); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f8caad2889..6d63d58fa7 100644 --- a/src/llama-model.cpp +++ 
b/src/llama-model.cpp @@ -1624,7 +1624,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // (optional) temperature tuning - used by mistral-large ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); - ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); + ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); // FIXME why not use temperature_length? hparams.f_attn_temp_offset = 0.0f; @@ -7453,6 +7453,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2) // this avoids having to add scale loading to every architecture + if (arch != LLM_ARCH_T5) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -7522,6 +7523,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } + } ml.done_getting_tensors(); diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index d51c09e99f..8ee0fe1e7c 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -90,6 +90,7 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_embd = 64; n_head = 1; n_ff = 96; + n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR @@ -101,8 +102,6 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { n_layer = 3; } else if (arch == LLM_ARCH_CHAMELEON) { n_vocab = 10240; - } else if (arch == LLM_ARCH_GEMMA3N) { - n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded } const uint32_t n_embd_head = n_embd / n_head; @@ -351,7 +350,6 @@ static bool moe_implemented(const llm_arch arch) { } static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) { - GGML_ABORT("llama_model_save_to_file is broken"); struct user_data_t { struct { ggml_log_callback callback; @@ -376,6 +374,16 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) { continue; // These models don't have usable implementations. 
} + if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) { + continue; // FIXME + } + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE || + arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) { + continue; // TODO vocab + } + if (arch == LLM_ARCH_PLM) { + continue; // TODO tensor shapes + } for (bool moe : {false, true}) { if (moe && !moe_implemented(arch)) { continue; From c66fd8a22730840ebb4d44df61b91f6cc10a8085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 22 Mar 2026 23:45:29 +0100 Subject: [PATCH 08/12] roundtrip tests --- ggml/include/gguf.h | 1 + ggml/src/gguf.cpp | 17 +++++++--- src/llama-model-saver.cpp | 31 ++++++++++++++++++- src/llama-model-saver.h | 4 +++ tests/test-gguf.cpp | 8 ++--- tests/test-llama-archs.cpp | 63 ++++++++++++++++++++++++++++++++------ 6 files changed, 103 insertions(+), 21 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 9d8e321ba0..42763b86f9 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -191,6 +191,7 @@ extern "C" { // write the entire context to a binary file GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta); // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 49afeacae3..407b514f71 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1520,16 +1520,25 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo return false; } + const bool success = gguf_write_to_file_ptr(ctx, file, only_meta); + if (!success) { + GGML_LOG_ERROR("%s: failed to write GGUF data into '%s'\n", __func__, fname); + } + + fclose(file); + return success; +} + +bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) { + GGML_ASSERT(file); + try { gguf_writer_file gw(file); gguf_write_out(ctx, gw, only_meta); } catch (const std::runtime_error& ex) { - GGML_LOG_ERROR("%s: failed to write GGUF data into '%s': %s\n", __func__, fname, ex.what()); - fclose(file); + GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what()); return false; } - - fclose(file); return true; } diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 637be9b7ce..26864c18e9 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -1,5 +1,6 @@ #include "llama-model-saver.h" +#include "ggml.h" #include "gguf.h" #include "llama-arch.h" @@ -11,8 +12,33 @@ #include #include +bool llama_model_saver_supports_arch(llm_arch arch) { + switch (arch) { + case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN35: + case LLM_ARCH_QWEN35MOE: + case LLM_ARCH_PLAMO3: + case LLM_ARCH_GEMMA3: + case LLM_ARCH_GEMMA3N: + case LLM_ARCH_COHERE2: + case LLM_ARCH_OLMO2: + case LLM_ARCH_BITNET: + case LLM_ARCH_T5: + case LLM_ARCH_EXAONE_MOE: + case LLM_ARCH_AFMOE: + case LLM_ARCH_APERTUS: + case LLM_ARCH_MIMO2: + case LLM_ARCH_STEP35: + return false; + default: + return true; + } +} + llama_model_saver::llama_model_saver(const struct llama_model * model) : - gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {} 
+ gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) { + GGML_ASSERT(llama_model_saver_supports_arch(model->arch)); +} llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) : gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {} @@ -383,3 +409,6 @@ void llama_model_saver::save(const std::string & path_model) { gguf_write_to_file(gguf_ctx, path_model.c_str(), false); } +void llama_model_saver::save(FILE * file) { + gguf_write_to_file_ptr(gguf_ctx, file, false); +} diff --git a/src/llama-model-saver.h b/src/llama-model-saver.h index 2b3541ce6c..36a715e2b6 100644 --- a/src/llama-model-saver.h +++ b/src/llama-model-saver.h @@ -6,6 +6,9 @@ #include +// FIXME temporary function for better error messages +bool llama_model_saver_supports_arch(llm_arch arch); + struct llama_model_saver { struct gguf_context * gguf_ctx = nullptr; const bool gguf_ctx_owned; @@ -37,4 +40,5 @@ struct llama_model_saver { void add_tensors_from_model(); void save(const std::string & path_model); + void save(FILE * file); }; diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 78ca95dcbd..ed3070dc4d 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -1125,12 +1125,8 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned GGML_ASSERT(file); #endif // _WIN32 - { - std::vector buf; - gguf_write_to_buf(gguf_ctx_0, buf, only_meta); - GGML_ASSERT(fwrite(buf.data(), 1, buf.size(), file) == buf.size()); - rewind(file); - } + gguf_write_to_file_ptr(gguf_ctx_0, file, only_meta); + rewind(file); struct ggml_context * ctx_1 = nullptr; struct gguf_init_params gguf_params = { diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 8ee0fe1e7c..add340d22c 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -424,8 +424,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg bool all_ok = true; common_log_flush(common_log_main()); - printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); - printf("|---------------|------------------------------|------|--------|------|\n"); + printf("|%15s|%30s|%16s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); + printf("|---------------|------------------------------|----------------|--------|------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; @@ -474,14 +474,57 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg continue; } auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev}); - const std::vector logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); - const double nmse_val = nmse(logits_cpu, logits_dev); - const bool ok = nmse_val <= 1e-4; - all_ok = all_ok && ok; - char nmse_str[10]; - snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); - printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), - moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + std::string config_name = moe ? 
"MoE" : "Dense"; + { + const std::vector logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); + const double nmse_val = nmse(logits_cpu, logits_dev); + const bool ok = nmse_val <= 1e-4; + all_ok = all_ok && ok; + char nmse_str[10]; + snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); + printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + } + if (llama_model_saver_supports_arch(arch)) { + FILE * file = tmpfile(); +#ifdef _WIN32 + if (!file) { + continue; + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get()); + ms.add_kv_from_model(); + ms.add_tensors_from_model(); + ms.save(file); + rewind(file); + llama_model_params model_params = llama_model_default_params(); + std::vector devs_copy = {dev}; + devs_copy.push_back(nullptr); + model_params.devices = devs_copy.data(); + llama_model_ptr model_roundtrip(llama_model_load_from_file_ptr(file, model_params)); + GGML_ASSERT(model_roundtrip); + config_name += ",roundtrip"; + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = 0; + ctx_params.n_threads = 4; + ctx_params.n_threads_batch = 4; + llama_context_ptr lctx_roundtrip(llama_init_from_model(model_roundtrip.get(), ctx_params)); + if (!lctx_roundtrip) { + throw std::runtime_error("failed to create llama context"); + } + + const std::vector logits_dev = get_logits(model_roundtrip.get(), lctx_roundtrip.get(), tokens, encode); + const double nmse_val = nmse(logits_cpu, logits_dev); + const bool ok = nmse_val <= 1e-4; + all_ok = all_ok && ok; + char nmse_str[10]; + snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); + printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), nmse_str, ok ? 
"\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + } } } } From e7f31055b33e99682217b2e0753cf4d993391d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 23 Mar 2026 21:37:32 +0100 Subject: [PATCH 09/12] fixup --- ggml/include/gguf.h | 4 ++-- ggml/src/gguf.cpp | 26 +++++++++++++------------- src/llama-arch.cpp | 2 +- src/llama-model.cpp | 2 -- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 42763b86f9..02d5f221c0 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -77,8 +77,8 @@ extern "C" { }; GGML_API struct gguf_context * gguf_init_empty(void); - GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); @@ -190,8 +190,8 @@ extern "C" { // // write the entire context to a binary file - GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta); + GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 407b514f71..ab3cc97486 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -1512,6 +1512,19 @@ void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & bu gguf_write_out(ctx, gw, only_meta); } +bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) { + GGML_ASSERT(file); + + try { + gguf_writer_file gw(file); + gguf_write_out(ctx, gw, only_meta); + } catch (const std::runtime_error& ex) { + GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what()); + return false; + } + return true; +} + bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { FILE * file = ggml_fopen(fname, "wb"); @@ -1529,19 +1542,6 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo return success; } -bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta) { - GGML_ASSERT(file); - - try { - gguf_writer_file gw(file); - gguf_write_out(ctx, gw, only_meta); - } catch (const std::runtime_error& ex) { - GGML_LOG_ERROR("%s: failed to write GGUF data: %s\n", __func__, ex.what()); - return false; - } - return true; -} - size_t gguf_get_meta_size(const struct gguf_context * ctx) { // only return size std::vector buf; diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f6084fa84a..322a66bc05 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2757,7 +2757,7 @@ std::string LLM_TN_IMPL::str() const { if (model_tensors.find(tensor) == model_tensors.end()) { const char * name = LLM_TENSOR_NAMES.at(tensor); if (suffix != nullptr || bid != -1 || xid != -1) { - LLAMA_LOG_ERROR("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n", + LLAMA_LOG_WARN("%s: cannot properly format tensor name %s with suffix=%s bid=%d xid=%d\n", __func__, name, suffix, bid, xid); } return 
name; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6d63d58fa7..490e8f336b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7453,7 +7453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2) // this avoids having to add scale loading to every architecture - if (arch != LLM_ARCH_T5) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -7523,7 +7522,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } - } ml.done_getting_tensors(); From 445fc0bf2119a5b5431c055421b0aafcc4f06718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 23 Mar 2026 22:35:19 +0100 Subject: [PATCH 10/12] refactor tests --- tests/test-llama-archs.cpp | 84 ++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 48 deletions(-) diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index add340d22c..2bff2d787f 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -231,7 +231,8 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { } static std::pair get_model_and_ctx( - struct gguf_context * gguf_ctx, const size_t seed, const std::vector & devs) { + struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector & devs) { + GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr)); llama_model_params model_params = llama_model_default_params(); std::vector devs_copy = devs; devs_copy.push_back(nullptr); @@ -243,7 +244,9 @@ static std::pair get_model_and_ctx( ctx_params.n_threads_batch = 4; size_t tmp = seed; - llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params)); + llama_model_ptr model(gguf_ctx != nullptr ? + llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params) : + llama_model_load_from_file_ptr(file, model_params)); if (!model) { throw std::runtime_error("failed to create llama model"); } @@ -392,7 +395,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml continue; } gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe); - auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {}); + auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}); const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf"); LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str()); llama_model_save_to_file(model_and_ctx.first.get(), path.c_str()); @@ -424,8 +427,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg bool all_ok = true; common_log_flush(common_log_main()); - printf("|%15s|%30s|%16s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status"); - printf("|---------------|------------------------------|----------------|--------|------|\n"); + printf("|%15s|%30s|%16s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. 
CPU", "Roundtrip"); + printf("|---------------|------------------------------|----------------|---------------|---------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; @@ -466,65 +469,50 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg continue; } gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe); - auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {}); + auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}); const std::vector logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode); for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { continue; } - auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev}); + auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {dev}); std::string config_name = moe ? "MoE" : "Dense"; - { - const std::vector logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); - const double nmse_val = nmse(logits_cpu, logits_dev); - const bool ok = nmse_val <= 1e-4; - all_ok = all_ok && ok; - char nmse_str[10]; - snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); - printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), - config_name.c_str(), nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); + const std::vector logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode); + const double nmse_val = nmse(logits_cpu, logits_dev); + char nmse_str[10]; + snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); + std::string status_nmse = "\033[1;32mOK\033[0m"; + if (nmse_val > 1e-4) { + all_ok = false; + status_nmse = "\033[1;31mFAIL\033[0m"; } - if (llama_model_saver_supports_arch(arch)) { - FILE * file = tmpfile(); -#ifdef _WIN32 - if (!file) { - continue; - } -#else - GGML_ASSERT(file); -#endif // _WIN32 + + std::string status_roundtrip = "\033[1;33mSKIP\033[0m"; + FILE * file = tmpfile(); // Can be null on Windows without administrator privileges. 
+ if (file != nullptr && llama_model_saver_supports_arch(arch)) { llama_model_saver ms = llama_model_saver(model_and_ctx_dev.first.get()); ms.add_kv_from_model(); ms.add_tensors_from_model(); ms.save(file); rewind(file); - llama_model_params model_params = llama_model_default_params(); - std::vector devs_copy = {dev}; - devs_copy.push_back(nullptr); - model_params.devices = devs_copy.data(); - llama_model_ptr model_roundtrip(llama_model_load_from_file_ptr(file, model_params)); - GGML_ASSERT(model_roundtrip); - config_name += ",roundtrip"; - llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = 0; - ctx_params.n_threads = 4; - ctx_params.n_threads_batch = 4; - llama_context_ptr lctx_roundtrip(llama_init_from_model(model_roundtrip.get(), ctx_params)); - if (!lctx_roundtrip) { - throw std::runtime_error("failed to create llama context"); + auto model_and_ctx_roundtrip = get_model_and_ctx(nullptr, file, seed, {dev}); + const std::vector logits_roundtrip = get_logits( + model_and_ctx_roundtrip.first.get(), model_and_ctx_roundtrip.second.get(), tokens, encode); + status_roundtrip = "\033[1;32mOK\033[0m"; + GGML_ASSERT(logits_roundtrip.size() == logits_dev.size()); + for (size_t i = 0; i < logits_roundtrip.size(); i++) { + if (logits_roundtrip[i] != logits_dev[i]) { + all_ok = false; + status_roundtrip = "\033[1;31mFAIL\033[0m"; + break; + } } - - const std::vector logits_dev = get_logits(model_roundtrip.get(), lctx_roundtrip.get(), tokens, encode); - const double nmse_val = nmse(logits_cpu, logits_dev); - const bool ok = nmse_val <= 1e-4; - all_ok = all_ok && ok; - char nmse_str[10]; - snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val); - printf("|%15s|%30s|%16s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), - config_name.c_str(), nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m"); } + + printf("|%15s|%30s|%16s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str()); } } } From dd2564bc388e04dd124ad269a98eda384ab34f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 23 Mar 2026 22:45:44 +0100 Subject: [PATCH 11/12] fix prints --- tests/test-llama-archs.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 2bff2d787f..8fcc8a211d 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -230,10 +230,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { return ret; } +static bool silent_model_load_progress(float /*progress*/, void * /*user_data*/) { + return true; +} + static std::pair get_model_and_ctx( struct gguf_context * gguf_ctx, FILE * file, const size_t seed, const std::vector & devs) { GGML_ASSERT((gguf_ctx == nullptr) != (file == nullptr)); llama_model_params model_params = llama_model_default_params(); + model_params.progress_callback = silent_model_load_progress; std::vector devs_copy = devs; devs_copy.push_back(nullptr); model_params.devices = devs_copy.data(); @@ -427,8 +432,8 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg bool all_ok = true; common_log_flush(common_log_main()); - printf("|%15s|%30s|%16s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. 
CPU", "Roundtrip"); - printf("|---------------|------------------------------|----------------|---------------|---------|\n"); + printf("|%15s|%30s|%6s|%15s|%9s|\n", "Model arch.", "Device", "Config", "NMSE vs. CPU", "Roundtrip"); + printf("|---------------|------------------------------|------|---------------|---------|\n"); for (const llm_arch & arch : llm_arch_all()) { if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; @@ -511,7 +516,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg } } - printf("|%15s|%30s|%16s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), + printf("|%15s|%30s|%6s|%15s (%8s)|%20s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev), config_name.c_str(), status_nmse.c_str(), nmse_str, status_roundtrip.c_str()); } } From 60312f6a466648e4f5bf5408c213dcbdbe95f046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 23 Mar 2026 22:56:47 +0100 Subject: [PATCH 12/12] fix model saving --- tests/test-llama-archs.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 8fcc8a211d..593d370e19 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -399,6 +399,10 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (!moe && moe_mandatory(arch)) { continue; } + if (!llama_model_saver_supports_arch(arch)) { + LOG_INF("%s: %s model (%s) is unsupported, skipping\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense"); + continue; + } gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe); auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), nullptr, seed, {}); const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");