From 158239a2b142d709a19f19c66d19588f3b846dd0 Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Wed, 11 Mar 2026 19:17:31 +0530 Subject: [PATCH 1/4] llama : add fd-based model loading via llama_model_load_from_fd --- ggml/include/gguf.h | 1 + ggml/src/gguf.cpp | 32 ++++++++++++++++++++++++ include/llama.h | 4 +++ src/llama-mmap.cpp | 26 ++++++++++++++++++++ src/llama-mmap.h | 3 ++- src/llama-model-loader.cpp | 33 ++++++++++++++++++++++++- src/llama-model-loader.h | 1 + src/llama-quant.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++++++------ tests/CMakeLists.txt | 1 + tests/test-model-load-fd.cpp | 47 ++++++++++++++++++++++++++++++++++++ 11 files changed, 171 insertions(+), 10 deletions(-) create mode 100644 tests/test-model-load-fd.cpp diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062..bd12997372 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,6 +78,7 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index cbeedf6c4b..8eea785404 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,6 +15,10 @@ #include #include +#ifndef _WIN32 +#include +#endif + #define GGUF_MAX_STRING_LENGTH (1024*1024*1024) #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024) @@ -853,6 +857,34 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } +#ifndef _WIN32 +struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { + const int fd_duped = dup(fd); + if (fd_duped < 0) { + GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno)); + return nullptr; + } + + FILE * file = fdopen(fd_duped, "rb"); + if (!file) { + close(fd_duped); + GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno)); + return nullptr; + } + + struct gguf_context * result = gguf_init_from_file_impl(file, params); + fclose(file); + return result; +} +#else +struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { + GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); + GGML_UNUSED(fd); + GGML_UNUSED(params); + return nullptr; +} +#endif + void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { return; diff --git a/include/llama.h b/include/llama.h index 0bd10294cb..f05e9bd247 100644 --- a/include/llama.h +++ b/include/llama.h @@ -464,6 +464,10 @@ extern "C" { const char * path_model, struct llama_model_params params); + // Load a model from a POSIX file descriptor + // Not supported on Windows + LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params); + // Load a model from multiple splits (support custom naming scheme) // The paths must be in the correct order LLAMA_API struct llama_model * llama_model_load_from_splits( diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index c03228e9ce..5ea00d3fa7 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -86,6 +86,10 @@ struct llama_file::impl { seek(0, SEEK_SET); } + impl(int /*fd_src*/) { + throw std::runtime_error("fd-based loading is not supported on Windows"); + } + size_t tell() const { LARGE_INTEGER li; li.QuadPart = 0; @@ -209,6 +213,25 @@ struct llama_file::impl { seek(0, SEEK_SET); } + impl(int fd_src) : fname("(fd:" + std::to_string(fd_src) + ")") { + init_from_fd(fd_src); + } + + void init_from_fd(int fd_src) { + const int fd_duped = dup(fd_src); + if (fd_duped < 0) { + throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno))); + } + fp = fdopen(fd_duped, "rb"); + if (!fp) { + close(fd_duped); + throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + size_t tell() const { if (fd == -1) { long ret = std::ftell(fp); @@ -373,6 +396,9 @@ struct llama_file::impl { llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : pimpl(std::make_unique(fname, mode, use_direct_io)) {} + +llama_file::llama_file(int fd) : pimpl(std::make_unique(fd)) {} + llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 29ce4d2468..2d1eac91a3 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -15,12 +15,13 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode, bool use_direct_io = false); + llama_file(int fd); ~llama_file(); size_t tell() const; size_t size() const; - int file_id() const; // fileno overload + int file_id() const; void seek(size_t offset, int whence) const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 623a3455dd..c0e1e754e7 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -510,6 +510,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, + int fd, bool use_mmap, bool use_direct_io, bool check_tensors, @@ -657,6 +658,36 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } + } else if (fd >= 0) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + metadata_ptr.reset(gguf_init_from_fd(fd, params)); + metadata = metadata_ptr.get(); + if (metadata == nullptr) { + throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + contexts.emplace_back(ctx); + files.emplace_back(new llama_file(fd)); + + // Save tensors data offset info of the main file. + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there is no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur)); + } } else { get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); @@ -668,7 +699,7 @@ llama_model_loader::llama_model_loader( fver = (enum llama_fver) gguf_get_version(metadata); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index ed5de729ca..6e5a5a4712 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,6 +125,7 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme + int fd, bool use_mmap, bool use_direct_io, bool check_tensors, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8e8ce23124..d047944dd6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 872e659edc..daf3c3bd8d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -828,7 +828,7 @@ int64_t llama_time_us(void) { // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, - const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + const std::string & fname, std::vector & splits, int fd, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl( void * set_tensor_data_ud, const std::string & path_model, std::vector & splits, + int fd, struct llama_model_params params) { - GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined"); + if (metadata == nullptr && path_model.empty() && fd < 0) { + LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); + return nullptr; + } ggml_time_init(); if (!params.vocab_only && ggml_backend_reg_count() == 0) { @@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl( props.memory_free/1024/1024); } - const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params); + const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user( std::vector splits = {}; params.use_mmap = false; params.use_extra_bufts = false; - return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); + return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params); } // deprecated struct llama_model * llama_load_model_from_file( @@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params); } struct llama_model * llama_model_load_from_splits( @@ -1066,7 +1070,20 @@ struct llama_model * llama_model_load_from_splits( for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params); +} + +struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) { +#ifdef _WIN32 + LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); + GGML_UNUSED(fd); + GGML_UNUSED(params); + return nullptr; +#else + std::string path_model; + std::vector splits = {}; + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); +#endif } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bb0f0ef0ed..cb31fb2f4a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -240,6 +240,7 @@ llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-model-load-fd.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model") llama_build_and_test(test-backend-sampler.cpp LABEL "model") diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp new file mode 100644 index 0000000000..d5102942d0 --- /dev/null +++ b/tests/test-model-load-fd.cpp @@ -0,0 +1,47 @@ +#include "llama.h" +#include "get-model.h" + +#include +#include + +#ifdef _WIN32 +int main(int /*argc*/, char ** /*argv*/) { + fprintf(stderr, "skipping on Windows\n"); + return EXIT_SUCCESS; +} +#else +# include +# include + +int main(int argc, char ** argv) { + auto * model_path = get_model_or_exit(argc, argv); + + llama_backend_init(); + + const int fd = open(model_path, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "failed to open %s\n", model_path); + return EXIT_FAILURE; + } + + auto params = llama_model_default_params(); + params.use_mmap = true; + params.vocab_only = true; + + struct llama_model * model = llama_model_load_from_fd(fd, params); + close(fd); + + if (model == nullptr) { + fprintf(stderr, "load from fd failed\n"); + return EXIT_FAILURE; + } + + const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + fprintf(stderr, "loaded %d tokens from fd\n", n_vocab); + + llama_model_free(model); + llama_backend_free(); + + return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} +#endif From a4cfaf07c4be80a02a0a3b354fccf6955edbc14b Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 00:44:33 +0530 Subject: [PATCH 2/4] llama : address review feedback for fd-based model loading --- ggml/include/gguf.h | 2 +- ggml/src/gguf.cpp | 29 ++--------------------------- src/llama-mmap.cpp | 4 ++-- src/llama-model-loader.cpp | 20 ++++++++++++++++++-- src/llama.cpp | 2 +- tests/test-model-load-fd.cpp | 2 +- 6 files changed, 25 insertions(+), 34 deletions(-) diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index bd12997372..9d8e321ba0 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,7 +78,7 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); //GGML_API struct gguf_context * gguf_init_from_buffer(..); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 8eea785404..bf28dabb06 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -15,10 +15,6 @@ #include #include -#ifndef _WIN32 -#include -#endif - #define GGUF_MAX_STRING_LENGTH (1024*1024*1024) #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024) @@ -857,33 +853,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return result; } -#ifndef _WIN32 -struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { - const int fd_duped = dup(fd); - if (fd_duped < 0) { - GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno)); - return nullptr; - } - - FILE * file = fdopen(fd_duped, "rb"); +struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) { if (!file) { - close(fd_duped); - GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno)); return nullptr; } - - struct gguf_context * result = gguf_init_from_file_impl(file, params); - fclose(file); - return result; + return gguf_init_from_file_impl(file, params); } -#else -struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) { - GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); - GGML_UNUSED(fd); - GGML_UNUSED(params); - return nullptr; -} -#endif void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 5ea00d3fa7..706d72f1e0 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -220,12 +220,12 @@ struct llama_file::impl { void init_from_fd(int fd_src) { const int fd_duped = dup(fd_src); if (fd_duped < 0) { - throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno))); + throw std::runtime_error(format("failed to dup fd %d: %s", fd_src, strerror(errno))); } fp = fdopen(fd_duped, "rb"); if (!fp) { close(fd_duped); - throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno))); + throw std::runtime_error(format("failed to fdopen fd %d: %s", fd_src, strerror(errno))); } seek(0, SEEK_END); size = tell(); diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index c0e1e754e7..358b505f18 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -13,6 +13,10 @@ #include #include +#ifndef _WIN32 +#include +#endif // _WIN32 + static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -659,13 +663,25 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } } else if (fd >= 0) { + const int fd_duped = dup(fd); + if (fd_duped < 0) { + throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno))); + } + + FILE * f = fdopen(fd_duped, "rb"); + if (!f) { + close(fd_duped); + throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno))); + } + struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - metadata_ptr.reset(gguf_init_from_fd(fd, params)); + metadata_ptr.reset(gguf_init_from_file_ptr(f, params)); + fclose(f); metadata = metadata_ptr.get(); if (metadata == nullptr) { throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); @@ -674,8 +690,8 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - contexts.emplace_back(ctx); files.emplace_back(new llama_file(fd)); + contexts.emplace_back(ctx); // Save tensors data offset info of the main file. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { diff --git a/src/llama.cpp b/src/llama.cpp index daf3c3bd8d..c40d5c9d51 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1083,7 +1083,7 @@ struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params std::string path_model; std::vector splits = {}; return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); -#endif +#endif // _WIN32 } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp index d5102942d0..dd982ba907 100644 --- a/tests/test-model-load-fd.cpp +++ b/tests/test-model-load-fd.cpp @@ -44,4 +44,4 @@ int main(int argc, char ** argv) { return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE; } -#endif +#endif // _WIN32 From 626823b2d9aa7603bdcf01ed6ec073e730f6eb86 Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 01:20:50 +0530 Subject: [PATCH 3/4] llama : use FILE pointer instead of fd in public API --- include/llama.h | 5 ++--- src/llama-model-loader.cpp | 28 ++++++---------------------- src/llama-model-loader.h | 2 +- src/llama-quant.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++----------------- tests/test-model-load-fd.cpp | 15 +++++++++++---- 6 files changed, 35 insertions(+), 48 deletions(-) diff --git a/include/llama.h b/include/llama.h index f05e9bd247..df2ab4ab4b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -464,9 +464,8 @@ extern "C" { const char * path_model, struct llama_model_params params); - // Load a model from a POSIX file descriptor - // Not supported on Windows - LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params); + // Load a model from an open FILE pointer + LLAMA_API struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params); // Load a model from multiple splits (support custom naming scheme) // The paths must be in the correct order diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 358b505f18..6af0ee1fe5 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -13,10 +13,6 @@ #include #include -#ifndef _WIN32 -#include -#endif // _WIN32 - static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; @@ -514,7 +510,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, - int fd, + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, @@ -662,35 +658,23 @@ llama_model_loader::llama_model_loader( LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); } - } else if (fd >= 0) { - const int fd_duped = dup(fd); - if (fd_duped < 0) { - throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno))); - } - - FILE * f = fdopen(fd_duped, "rb"); - if (!f) { - close(fd_duped); - throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno))); - } - + } else if (file) { struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - metadata_ptr.reset(gguf_init_from_file_ptr(f, params)); - fclose(f); + metadata_ptr.reset(gguf_init_from_file_ptr(file, params)); metadata = metadata_ptr.get(); if (metadata == nullptr) { - throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd)); + throw std::runtime_error(format("%s: failed to load model from file pointer", __func__)); } get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fd)); + files.emplace_back(new llama_file(fileno(file))); contexts.emplace_back(ctx); // Save tensors data offset info of the main file. @@ -715,7 +699,7 @@ llama_model_loader::llama_model_loader( fver = (enum llama_fver) gguf_get_version(metadata); LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 6e5a5a4712..7b3d6703c0 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,7 +125,7 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme - int fd, + FILE * file, bool use_mmap, bool use_direct_io, bool check_tensors, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d047944dd6..c414656e0a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index c40d5c9d51..d35fb2cbe6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -828,7 +828,7 @@ int64_t llama_time_us(void) { // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud, - const std::string & fname, std::vector & splits, int fd, llama_model & model, llama_model_params & params) { + const std::string & fname, std::vector & splits, FILE * file, llama_model & model, llama_model_params & params) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -889,9 +889,9 @@ static struct llama_model * llama_model_load_from_file_impl( void * set_tensor_data_ud, const std::string & path_model, std::vector & splits, - int fd, + FILE * file, struct llama_model_params params) { - if (metadata == nullptr && path_model.empty() && fd < 0) { + if (metadata == nullptr && path_model.empty() && !file) { LLAMA_LOG_ERROR("%s: no model source provided\n", __func__); return nullptr; } @@ -1015,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl( props.memory_free/1024/1024); } - const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params); + const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -1041,7 +1041,7 @@ struct llama_model * llama_model_init_from_user( std::vector splits = {}; params.use_mmap = false; params.use_extra_bufts = false; - return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params); } // deprecated struct llama_model * llama_load_model_from_file( @@ -1054,7 +1054,7 @@ struct llama_model * llama_model_load_from_file( const char * path_model, struct llama_model_params params) { std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params); } struct llama_model * llama_model_load_from_splits( @@ -1070,20 +1070,17 @@ struct llama_model * llama_model_load_from_splits( for (size_t i = 0; i < n_paths; ++i) { splits.push_back(paths[i]); } - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params); + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params); } -struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) { -#ifdef _WIN32 - LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__); - GGML_UNUSED(fd); - GGML_UNUSED(params); - return nullptr; -#else +struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) { + if (!file) { + LLAMA_LOG_ERROR("%s: file is NULL\n", __func__); + return nullptr; + } std::string path_model; std::vector splits = {}; - return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params); -#endif // _WIN32 + return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params); } void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp index dd982ba907..b7ff237ee5 100644 --- a/tests/test-model-load-fd.cpp +++ b/tests/test-model-load-fd.cpp @@ -24,20 +24,27 @@ int main(int argc, char ** argv) { return EXIT_FAILURE; } + FILE * f = fdopen(dup(fd), "rb"); + close(fd); + if (!f) { + fprintf(stderr, "failed to fdopen\n"); + return EXIT_FAILURE; + } + auto params = llama_model_default_params(); params.use_mmap = true; params.vocab_only = true; - struct llama_model * model = llama_model_load_from_fd(fd, params); - close(fd); + struct llama_model * model = llama_model_load_from_file_ptr(f, params); + fclose(f); if (model == nullptr) { - fprintf(stderr, "load from fd failed\n"); + fprintf(stderr, "load from file pointer failed\n"); return EXIT_FAILURE; } const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - fprintf(stderr, "loaded %d tokens from fd\n", n_vocab); + fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab); llama_model_free(model); llama_backend_free(); From 26c04d4b31f265589e81605eca7accae7e7d22bc Mon Sep 17 00:00:00 2001 From: Siddhesh2377 Date: Sat, 14 Mar 2026 21:40:01 +0530 Subject: [PATCH 4/4] llama : use FILE pointer consistently, address review feedback --- ggml/src/ggml-impl.h | 1 - ggml/src/gguf.cpp | 15 ++++++--------- include/llama.h | 4 +++- src/llama-mmap.cpp | 31 ++++++++++++------------------- src/llama-mmap.h | 2 +- src/llama-model-loader.cpp | 2 +- tests/test-gguf.cpp | 4 ++-- 7 files changed, 25 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index e3714b38a6..ba0730ead2 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -718,6 +718,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph * cgraph, // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); -GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector & buf, bool only_meta); #endif // __cplusplus diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index bf28dabb06..49afeacae3 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector(fname, mode, use_direct_io)) {} -llama_file::llama_file(int fd) : pimpl(std::make_unique(fd)) {} +llama_file::llama_file(FILE * file) : pimpl(std::make_unique(file)) {} llama_file::~llama_file() = default; diff --git a/src/llama-mmap.h b/src/llama-mmap.h index 2d1eac91a3..32fab23119 100644 --- a/src/llama-mmap.h +++ b/src/llama-mmap.h @@ -15,7 +15,7 @@ using llama_mlocks = std::vector>; struct llama_file { llama_file(const char * fname, const char * mode, bool use_direct_io = false); - llama_file(int fd); + llama_file(FILE * file); ~llama_file(); size_t tell() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 6af0ee1fe5..8046df0194 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -674,7 +674,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fileno(file))); + files.emplace_back(new llama_file(file)); contexts.emplace_back(ctx); // Save tensors data offset info of the main file. diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp index 8ebd16ba82..78ca95dcbd 100644 --- a/tests/test-gguf.cpp +++ b/tests/test-gguf.cpp @@ -742,7 +742,7 @@ static std::pair test_handcrafted_file(const unsigned int seed) { /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, }; - struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params); if (expect_context_not_null(hft)) { printf("%s: - context_not_null: ", __func__); @@ -1137,7 +1137,7 @@ static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned /*no_alloc =*/ false, /*ctx =*/ only_meta ? nullptr : &ctx_1, }; - struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params); + struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params); printf("%s: same_version: ", __func__); if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {