From 158239a2b142d709a19f19c66d19588f3b846dd0 Mon Sep 17 00:00:00 2001
From: Siddhesh2377 <siddheshsonar2377@gmail.com>
Date: Wed, 11 Mar 2026 19:17:31 +0530
Subject: [PATCH 1/4] llama : add fd-based model loading via
 llama_model_load_from_fd

---
 ggml/include/gguf.h          |  1 +
 ggml/src/gguf.cpp            | 32 ++++++++++++++++++++++++
 include/llama.h              |  4 +++
 src/llama-mmap.cpp           | 26 ++++++++++++++++++++
 src/llama-mmap.h             |  3 ++-
 src/llama-model-loader.cpp   | 33 ++++++++++++++++++++++++-
 src/llama-model-loader.h     |  1 +
 src/llama-quant.cpp          |  2 +-
 src/llama.cpp                | 31 ++++++++++++++++++------
 tests/CMakeLists.txt         |  1 +
 tests/test-model-load-fd.cpp | 47 ++++++++++++++++++++++++++++++++++++
 11 files changed, 171 insertions(+), 10 deletions(-)
 create mode 100644 tests/test-model-load-fd.cpp
diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index 79ee202062..bd12997372 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -78,6 +78,7 @@ extern "C" {
 
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index cbeedf6c4b..8eea785404 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -15,6 +15,10 @@
 #include <string>
 #include <vector>
 
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
 #define GGUF_MAX_STRING_LENGTH  (1024*1024*1024)
 #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024)
 
@@ -853,6 +857,34 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     return result;
 }
 
+#ifndef _WIN32
+struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
+    const int fd_duped = dup(fd);
+    if (fd_duped < 0) {
+        GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno));
+        return nullptr;
+    }
+
+    FILE * file = fdopen(fd_duped, "rb");
+    if (!file) {
+        close(fd_duped);
+        GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno));
+        return nullptr;
+    }
+
+    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    fclose(file);
+    return result;
+}
+#else
+struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
+    GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
+    GGML_UNUSED(fd);
+    GGML_UNUSED(params);
+    return nullptr;
+}
+#endif
+
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == nullptr) {
         return;
diff --git a/include/llama.h b/include/llama.h
index 0bd10294cb..f05e9bd247 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -464,6 +464,10 @@ extern "C" {
                              const char * path_model,
               struct llama_model_params   params);
 
+    // Load a model from a POSIX file descriptor
+    // Not supported on Windows
+    LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params);
+
     // Load a model from multiple splits (support custom naming scheme)
     // The paths must be in the correct order
     LLAMA_API struct llama_model * llama_model_load_from_splits(
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index c03228e9ce..5ea00d3fa7 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -86,6 +86,10 @@ struct llama_file::impl {
         seek(0, SEEK_SET);
     }
 
+    impl(int /*fd_src*/) {
+        throw std::runtime_error("fd-based loading is not supported on Windows");
+    }
+
     size_t tell() const {
         LARGE_INTEGER li;
         li.QuadPart = 0;
@@ -209,6 +213,25 @@ struct llama_file::impl {
         seek(0, SEEK_SET);
     }
 
+    impl(int fd_src) : fname("(fd:" + std::to_string(fd_src) + ")") {
+        init_from_fd(fd_src);
+    }
+
+    void init_from_fd(int fd_src) {
+        const int fd_duped = dup(fd_src);
+        if (fd_duped < 0) {
+            throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno)));
+        }
+        fp = fdopen(fd_duped, "rb");
+        if (!fp) {
+            close(fd_duped);
+            throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno)));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
     size_t tell() const {
         if (fd == -1) {
             long ret = std::ftell(fp);
@@ -373,6 +396,9 @@ struct llama_file::impl {
 
 llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
     pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
+
+llama_file::llama_file(int fd) : pimpl(std::make_unique<impl>(fd)) {}
+
 llama_file::~llama_file() = default;
 
 size_t llama_file::tell() const { return pimpl->tell(); }
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 29ce4d2468..2d1eac91a3 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -15,12 +15,13 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 struct llama_file {
     llama_file(const char * fname, const char * mode, bool use_direct_io = false);
+    llama_file(int fd);
     ~llama_file();
 
     size_t tell() const;
     size_t size() const;
 
-    int file_id() const; // fileno overload
+    int file_id() const;
 
     void seek(size_t offset, int whence) const;
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 623a3455dd..c0e1e754e7 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -510,6 +510,7 @@ llama_model_loader::llama_model_loader(
         void * set_tensor_data_ud,
         const std::string & fname,
         std::vector<std::string> & splits,
+        int fd,
         bool use_mmap,
         bool use_direct_io,
         bool check_tensors,
@@ -657,6 +658,36 @@ llama_model_loader::llama_model_loader(
 
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
         }
+    } else if (fd >= 0) {
+        struct ggml_context * ctx = NULL;
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx,
+        };
+
+        metadata_ptr.reset(gguf_init_from_fd(fd, params));
+        metadata = metadata_ptr.get();
+        if (metadata == nullptr) {
+            throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd));
+        }
+
+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
+        contexts.emplace_back(ctx);
+        files.emplace_back(new llama_file(fd));
+
+        // Save tensors data offset info of the main file.
+        for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+            std::string tensor_name = std::string(cur->name);
+            // make sure there is no duplicated tensor names
+            if (weights_map.find(tensor_name) != weights_map.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+            }
+            n_elements += ggml_nelements(cur);
+            n_bytes    += ggml_nbytes(cur);
+            weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
+        }
     } else {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
@@ -668,7 +699,7 @@ llama_model_loader::llama_model_loader(
     fver = (enum llama_fver) gguf_get_version(metadata);
 
     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-            __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
+            __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver));
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index ed5de729ca..6e5a5a4712 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -125,6 +125,7 @@ struct llama_model_loader {
         void * set_tensor_data_ud,
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
+        int fd,
         bool use_mmap,
         bool use_direct_io,
         bool check_tensors,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 8e8ce23124..d047944dd6 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::vector<std::string> splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index 872e659edc..daf3c3bd8d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -828,7 +828,7 @@ int64_t llama_time_us(void) {
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
-        const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+        const std::string & fname, std::vector<std::string> & splits, int fd, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
+        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io,
             params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
@@ -889,8 +889,12 @@ static struct llama_model * llama_model_load_from_file_impl(
         void * set_tensor_data_ud,
         const std::string & path_model,
         std::vector<std::string> & splits,
+        int fd,
         struct llama_model_params params) {
-    GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
+    if (metadata == nullptr && path_model.empty() && fd < 0) {
+        LLAMA_LOG_ERROR("%s: no model source provided\n", __func__);
+        return nullptr;
+    }
     ggml_time_init();
 
     if (!params.vocab_only && ggml_backend_reg_count() == 0) {
@@ -1011,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl(
                 props.memory_free/1024/1024);
     }
 
-    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
+    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -1037,7 +1041,7 @@ struct llama_model * llama_model_init_from_user(
     std::vector<std::string> splits = {};
     params.use_mmap = false;
     params.use_extra_bufts = false;
-    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
+    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params);
 }
 // deprecated
 struct llama_model * llama_load_model_from_file(
@@ -1050,7 +1054,7 @@ struct llama_model * llama_model_load_from_file(
         const char * path_model,
         struct llama_model_params params) {
     std::vector<std::string> splits = {};
-    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params);
 }
 
 struct llama_model * llama_model_load_from_splits(
@@ -1066,7 +1070,20 @@ struct llama_model * llama_model_load_from_splits(
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
-    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params);
+}
+
+struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) {
+#ifdef _WIN32
+    LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
+    GGML_UNUSED(fd);
+    GGML_UNUSED(params);
+    return nullptr;
+#else
+    std::string path_model;
+    std::vector<std::string> splits = {};
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params);
+#endif
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bb0f0ef0ed..cb31fb2f4a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -240,6 +240,7 @@ llama_build_and_test(test-gguf.cpp)
 llama_build_and_test(test-backend-ops.cpp)
 
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_build_and_test(test-model-load-fd.cpp     LABEL "model")
 llama_build_and_test(test-autorelease.cpp       LABEL "model")
 llama_build_and_test(test-backend-sampler.cpp   LABEL "model")
 
diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp
new file mode 100644
index 0000000000..d5102942d0
--- /dev/null
+++ b/tests/test-model-load-fd.cpp
@@ -0,0 +1,47 @@
+#include "llama.h"
+#include "get-model.h"
+
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef _WIN32
+int main(int /*argc*/, char ** /*argv*/) {
+    fprintf(stderr, "skipping on Windows\n");
+    return EXIT_SUCCESS;
+}
+#else
+#    include <fcntl.h>
+#    include <unistd.h>
+
+int main(int argc, char ** argv) {
+    auto * model_path = get_model_or_exit(argc, argv);
+
+    llama_backend_init();
+
+    const int fd = open(model_path, O_RDONLY);
+    if (fd < 0) {
+        fprintf(stderr, "failed to open %s\n", model_path);
+        return EXIT_FAILURE;
+    }
+
+    auto params = llama_model_default_params();
+    params.use_mmap = true;
+    params.vocab_only = true;
+
+    struct llama_model * model = llama_model_load_from_fd(fd, params);
+    close(fd);
+
+    if (model == nullptr) {
+        fprintf(stderr, "load from fd failed\n");
+        return EXIT_FAILURE;
+    }
+
+    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+    fprintf(stderr, "loaded %d tokens from fd\n", n_vocab);
+
+    llama_model_free(model);
+    llama_backend_free();
+
+    return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+#endif

From a4cfaf07c4be80a02a0a3b354fccf6955edbc14b Mon Sep 17 00:00:00 2001
From: Siddhesh2377 <siddheshsonar2377@gmail.com>
Date: Sat, 14 Mar 2026 00:44:33 +0530
Subject: [PATCH 2/4] llama : address review feedback for fd-based model
 loading

---
 ggml/include/gguf.h          |  2 +-
 ggml/src/gguf.cpp            | 29 ++---------------------------
 src/llama-mmap.cpp           |  4 ++--
 src/llama-model-loader.cpp   | 20 ++++++++++++++++++--
 src/llama.cpp                |  2 +-
 tests/test-model-load-fd.cpp |  2 +-
 6 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h
index bd12997372..9d8e321ba0 100644
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@@ -78,7 +78,7 @@ extern "C" {
 
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    GGML_API struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params);
+    GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
 
     GGML_API void gguf_free(struct gguf_context * ctx);
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index 8eea785404..bf28dabb06 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -15,10 +15,6 @@
 #include <string>
 #include <vector>
 
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
 #define GGUF_MAX_STRING_LENGTH  (1024*1024*1024)
 #define GGUF_MAX_ARRAY_ELEMENTS (1024*1024*1024)
 
@@ -857,33 +853,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     return result;
 }
 
-#ifndef _WIN32
-struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
-    const int fd_duped = dup(fd);
-    if (fd_duped < 0) {
-        GGML_LOG_ERROR("%s: failed to dup fd %d: %s\n", __func__, fd, strerror(errno));
-        return nullptr;
-    }
-
-    FILE * file = fdopen(fd_duped, "rb");
+struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
     if (!file) {
-        close(fd_duped);
-        GGML_LOG_ERROR("%s: failed to fdopen fd %d: %s\n", __func__, fd, strerror(errno));
         return nullptr;
     }
-
-    struct gguf_context * result = gguf_init_from_file_impl(file, params);
-    fclose(file);
-    return result;
+    return gguf_init_from_file_impl(file, params);
 }
-#else
-struct gguf_context * gguf_init_from_fd(int fd, struct gguf_init_params params) {
-    GGML_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
-    GGML_UNUSED(fd);
-    GGML_UNUSED(params);
-    return nullptr;
-}
-#endif
 
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == nullptr) {
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 5ea00d3fa7..706d72f1e0 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -220,12 +220,12 @@ struct llama_file::impl {
     void init_from_fd(int fd_src) {
         const int fd_duped = dup(fd_src);
         if (fd_duped < 0) {
-            throw std::runtime_error(format("llama_file: failed to dup fd %d: %s", fd_src, strerror(errno)));
+            throw std::runtime_error(format("failed to dup fd %d: %s", fd_src, strerror(errno)));
         }
         fp = fdopen(fd_duped, "rb");
         if (!fp) {
             close(fd_duped);
-            throw std::runtime_error(format("llama_file: failed to fdopen fd %d: %s", fd_src, strerror(errno)));
+            throw std::runtime_error(format("failed to fdopen fd %d: %s", fd_src, strerror(errno)));
         }
         seek(0, SEEK_END);
         size = tell();
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index c0e1e754e7..358b505f18 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -13,6 +13,10 @@
 #include <future>
 #include <regex>
 
+#ifndef _WIN32
+#include <unistd.h>
+#endif // _WIN32
+
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -659,13 +663,25 @@ llama_model_loader::llama_model_loader(
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
         }
     } else if (fd >= 0) {
+        const int fd_duped = dup(fd);
+        if (fd_duped < 0) {
+            throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno)));
+        }
+
+        FILE * f = fdopen(fd_duped, "rb");
+        if (!f) {
+            close(fd_duped);
+            throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno)));
+        }
+
         struct ggml_context * ctx = NULL;
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx,
         };
 
-        metadata_ptr.reset(gguf_init_from_fd(fd, params));
+        metadata_ptr.reset(gguf_init_from_file_ptr(f, params));
+        fclose(f);
         metadata = metadata_ptr.get();
         if (metadata == nullptr) {
             throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd));
@@ -674,8 +690,8 @@ llama_model_loader::llama_model_loader(
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-        contexts.emplace_back(ctx);
         files.emplace_back(new llama_file(fd));
+        contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the main file.
         for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
diff --git a/src/llama.cpp b/src/llama.cpp
index daf3c3bd8d..c40d5c9d51 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1083,7 +1083,7 @@ struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params
     std::string path_model;
     std::vector<std::string> splits = {};
     return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params);
-#endif
+#endif // _WIN32
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp
index d5102942d0..dd982ba907 100644
--- a/tests/test-model-load-fd.cpp
+++ b/tests/test-model-load-fd.cpp
@@ -44,4 +44,4 @@ int main(int argc, char ** argv) {
 
     return n_vocab > 0 ? EXIT_SUCCESS : EXIT_FAILURE;
 }
-#endif
+#endif // _WIN32

From 626823b2d9aa7603bdcf01ed6ec073e730f6eb86 Mon Sep 17 00:00:00 2001
From: Siddhesh2377 <siddheshsonar2377@gmail.com>
Date: Sat, 14 Mar 2026 01:20:50 +0530
Subject: [PATCH 3/4] llama : use FILE pointer instead of fd in public API

---
 include/llama.h              |  5 ++---
 src/llama-model-loader.cpp   | 28 ++++++----------------------
 src/llama-model-loader.h     |  2 +-
 src/llama-quant.cpp          |  2 +-
 src/llama.cpp                | 31 ++++++++++++++-----------------
 tests/test-model-load-fd.cpp | 15 +++++++++++----
 6 files changed, 35 insertions(+), 48 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index f05e9bd247..df2ab4ab4b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -464,9 +464,8 @@ extern "C" {
                              const char * path_model,
               struct llama_model_params   params);
 
-    // Load a model from a POSIX file descriptor
-    // Not supported on Windows
-    LLAMA_API struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params);
+    // Load a model from an open FILE pointer
+    LLAMA_API struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params);
 
     // Load a model from multiple splits (support custom naming scheme)
     // The paths must be in the correct order
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 358b505f18..6af0ee1fe5 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -13,10 +13,6 @@
 #include <future>
 #include <regex>
 
-#ifndef _WIN32
-#include <unistd.h>
-#endif // _WIN32
-
 static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;
@@ -514,7 +510,7 @@ llama_model_loader::llama_model_loader(
         void * set_tensor_data_ud,
         const std::string & fname,
         std::vector<std::string> & splits,
-        int fd,
+        FILE * file,
         bool use_mmap,
         bool use_direct_io,
         bool check_tensors,
@@ -662,35 +658,23 @@ llama_model_loader::llama_model_loader(
 
             LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n",  __func__, n_split - 1);
         }
-    } else if (fd >= 0) {
-        const int fd_duped = dup(fd);
-        if (fd_duped < 0) {
-            throw std::runtime_error(format("%s: failed to dup fd %d: %s", __func__, fd, strerror(errno)));
-        }
-
-        FILE * f = fdopen(fd_duped, "rb");
-        if (!f) {
-            close(fd_duped);
-            throw std::runtime_error(format("%s: failed to fdopen fd %d: %s", __func__, fd, strerror(errno)));
-        }
-
+    } else if (file) {
         struct ggml_context * ctx = NULL;
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx,
         };
 
-        metadata_ptr.reset(gguf_init_from_file_ptr(f, params));
-        fclose(f);
+        metadata_ptr.reset(gguf_init_from_file_ptr(file, params));
         metadata = metadata_ptr.get();
         if (metadata == nullptr) {
-            throw std::runtime_error(format("%s: failed to load model from fd %d", __func__, fd));
+            throw std::runtime_error(format("%s: failed to load model from file pointer", __func__));
         }
 
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-        files.emplace_back(new llama_file(fd));
+        files.emplace_back(new llama_file(fileno(file)));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the main file.
@@ -715,7 +699,7 @@ llama_model_loader::llama_model_loader(
     fver = (enum llama_fver) gguf_get_version(metadata);
 
     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
-            __func__, n_kv, n_tensors, fname.empty() ? "(fd)" : fname.c_str(), llama_file_version_name(fver));
+            __func__, n_kv, n_tensors, fname.empty() ? "(file*)" : fname.c_str(), llama_file_version_name(fver));
 
     // determine file type based on the number of tensors for each quantization and print meta data
     // TODO: make optional
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index 6e5a5a4712..7b3d6703c0 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -125,7 +125,7 @@ struct llama_model_loader {
         void * set_tensor_data_ud,
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
-        int fd,
+        FILE * file,
         bool use_mmap,
         bool use_direct_io,
         bool check_tensors,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index d047944dd6..c414656e0a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::vector<std::string> splits = {};
     llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
-        fname_inp, splits, /*fd*/ -1, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+        fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
diff --git a/src/llama.cpp b/src/llama.cpp
index c40d5c9d51..d35fb2cbe6 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -828,7 +828,7 @@ int64_t llama_time_us(void) {
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
-        const std::string & fname, std::vector<std::string> & splits, int fd, llama_model & model, llama_model_params & params) {
+        const std::string & fname, std::vector<std::string> & splits, FILE * file, llama_model & model, llama_model_params & params) {
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
@@ -837,7 +837,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, fd, params.use_mmap, params.use_direct_io,
+        llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, file, params.use_mmap, params.use_direct_io,
             params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
@@ -889,9 +889,9 @@ static struct llama_model * llama_model_load_from_file_impl(
         void * set_tensor_data_ud,
         const std::string & path_model,
         std::vector<std::string> & splits,
-        int fd,
+        FILE * file,
         struct llama_model_params params) {
-    if (metadata == nullptr && path_model.empty() && fd < 0) {
+    if (metadata == nullptr && path_model.empty() && !file) {
         LLAMA_LOG_ERROR("%s: no model source provided\n", __func__);
         return nullptr;
     }
@@ -1015,7 +1015,7 @@ static struct llama_model * llama_model_load_from_file_impl(
                 props.memory_free/1024/1024);
     }
 
-    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, fd, *model, params);
+    const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, file, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
         if (status == -1) {
@@ -1041,7 +1041,7 @@ struct llama_model * llama_model_init_from_user(
     std::vector<std::string> splits = {};
     params.use_mmap = false;
     params.use_extra_bufts = false;
-    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*fd*/ -1, params);
+    return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, /*file*/ nullptr, params);
 }
 // deprecated
 struct llama_model * llama_load_model_from_file(
@@ -1054,7 +1054,7 @@ struct llama_model * llama_model_load_from_file(
         const char * path_model,
         struct llama_model_params params) {
     std::vector<std::string> splits = {};
-    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*fd*/ -1, params);
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, /*file*/ nullptr, params);
 }
 
 struct llama_model * llama_model_load_from_splits(
@@ -1070,20 +1070,17 @@ struct llama_model * llama_model_load_from_splits(
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
-    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*fd*/ -1, params);
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, /*file*/ nullptr, params);
 }
 
-struct llama_model * llama_model_load_from_fd(int fd, struct llama_model_params params) {
-#ifdef _WIN32
-    LLAMA_LOG_ERROR("%s: fd-based loading is not supported on Windows\n", __func__);
-    GGML_UNUSED(fd);
-    GGML_UNUSED(params);
-    return nullptr;
-#else
+struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params) {
+    if (!file) {
+        LLAMA_LOG_ERROR("%s: file is NULL\n", __func__);
+        return nullptr;
+    }
     std::string path_model;
     std::vector<std::string> splits = {};
-    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, fd, params);
-#endif // _WIN32
+    return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, file, params);
 }
 
 void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
diff --git a/tests/test-model-load-fd.cpp b/tests/test-model-load-fd.cpp
index dd982ba907..b7ff237ee5 100644
--- a/tests/test-model-load-fd.cpp
+++ b/tests/test-model-load-fd.cpp
@@ -24,20 +24,27 @@ int main(int argc, char ** argv) {
         return EXIT_FAILURE;
     }
 
+    FILE * f = fdopen(dup(fd), "rb");
+    close(fd);
+    if (!f) {
+        fprintf(stderr, "failed to fdopen\n");
+        return EXIT_FAILURE;
+    }
+
     auto params = llama_model_default_params();
     params.use_mmap = true;
     params.vocab_only = true;
 
-    struct llama_model * model = llama_model_load_from_fd(fd, params);
-    close(fd);
+    struct llama_model * model = llama_model_load_from_file_ptr(f, params);
+    fclose(f);
 
     if (model == nullptr) {
-        fprintf(stderr, "load from fd failed\n");
+        fprintf(stderr, "load from file pointer failed\n");
         return EXIT_FAILURE;
     }
 
     const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
-    fprintf(stderr, "loaded %d tokens from fd\n", n_vocab);
+    fprintf(stderr, "loaded %d tokens via file pointer\n", n_vocab);
 
     llama_model_free(model);
     llama_backend_free();

From 26c04d4b31f265589e81605eca7accae7e7d22bc Mon Sep 17 00:00:00 2001
From: Siddhesh2377 <siddheshsonar2377@gmail.com>
Date: Sat, 14 Mar 2026 21:40:01 +0530
Subject: [PATCH 4/4] llama : use FILE pointer consistently, address review
 feedback

---
 ggml/src/ggml-impl.h       |  1 -
 ggml/src/gguf.cpp          | 15 ++++++---------
 include/llama.h            |  4 +++-
 src/llama-mmap.cpp         | 31 ++++++++++++-------------------
 src/llama-mmap.h           |  2 +-
 src/llama-model-loader.cpp |  2 +-
 tests/test-gguf.cpp        |  4 ++--
 7 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index e3714b38a6..ba0730ead2 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -718,6 +718,5 @@ inline bool ggml_check_edges(const struct ggml_cgraph *                cgraph,
 
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
-GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);
 GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, std::vector<int8_t> & buf, bool only_meta);
 #endif // __cplusplus
diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index bf28dabb06..49afeacae3 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -394,7 +394,11 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
     return true;
 }
 
-struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) {
+struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
+    if (!file) {
+        return nullptr;
+    }
+
     const struct gguf_reader gr(file);
     struct gguf_context * ctx = new gguf_context;
 
@@ -848,18 +852,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         return nullptr;
     }
 
-    struct gguf_context * result = gguf_init_from_file_impl(file, params);
+    struct gguf_context * result = gguf_init_from_file_ptr(file, params);
     fclose(file);
     return result;
 }
 
-struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) {
-    if (!file) {
-        return nullptr;
-    }
-    return gguf_init_from_file_impl(file, params);
-}
-
 void gguf_free(struct gguf_context * ctx) {
     if (ctx == nullptr) {
         return;
diff --git a/include/llama.h b/include/llama.h
index df2ab4ab4b..342666a625 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -465,7 +465,9 @@ extern "C" {
               struct llama_model_params   params);
 
     // Load a model from an open FILE pointer
-    LLAMA_API struct llama_model * llama_model_load_from_file_ptr(FILE * file, struct llama_model_params params);
+    LLAMA_API struct llama_model * llama_model_load_from_file_ptr(
+                                   FILE * file,
+              struct llama_model_params   params);
 
     // Load a model from multiple splits (support custom naming scheme)
     // The paths must be in the correct order
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 706d72f1e0..ccc29c1302 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -86,8 +86,12 @@ struct llama_file::impl {
         seek(0, SEEK_SET);
     }
 
-    impl(int /*fd_src*/) {
-        throw std::runtime_error("fd-based loading is not supported on Windows");
+    impl(FILE * file) : owns_fp(false) {
+        fp = file;
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
     }
 
     size_t tell() const {
@@ -163,7 +167,7 @@ struct llama_file::impl {
     }
 
     ~impl() {
-        if (fp) {
+        if (fp && owns_fp) {
             std::fclose(fp);
         }
     }
@@ -213,20 +217,8 @@ struct llama_file::impl {
         seek(0, SEEK_SET);
     }
 
-    impl(int fd_src) : fname("(fd:" + std::to_string(fd_src) + ")") {
-        init_from_fd(fd_src);
-    }
-
-    void init_from_fd(int fd_src) {
-        const int fd_duped = dup(fd_src);
-        if (fd_duped < 0) {
-            throw std::runtime_error(format("failed to dup fd %d: %s", fd_src, strerror(errno)));
-        }
-        fp = fdopen(fd_duped, "rb");
-        if (!fp) {
-            close(fd_duped);
-            throw std::runtime_error(format("failed to fdopen fd %d: %s", fd_src, strerror(errno)));
-        }
+    impl(FILE * file) : fname("(file*)"), owns_fp(false) {
+        fp = file;
         seek(0, SEEK_END);
         size = tell();
         seek(0, SEEK_SET);
@@ -376,7 +368,7 @@ struct llama_file::impl {
     ~impl() {
         if (fd != -1) {
             close(fd);
-        } else {
+        } else if (owns_fp) {
             std::fclose(fp);
         }
     }
@@ -392,12 +384,13 @@ struct llama_file::impl {
 
     FILE * fp{};
     size_t size{};
+    bool owns_fp = true;
 };
 
 llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
     pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
 
-llama_file::llama_file(int fd) : pimpl(std::make_unique<impl>(fd)) {}
+llama_file::llama_file(FILE * file) : pimpl(std::make_unique<impl>(file)) {}
 
 llama_file::~llama_file() = default;
 
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index 2d1eac91a3..32fab23119 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -15,7 +15,7 @@ using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
 
 struct llama_file {
     llama_file(const char * fname, const char * mode, bool use_direct_io = false);
-    llama_file(int fd);
+    llama_file(FILE * file);
     ~llama_file();
 
     size_t tell() const;
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 6af0ee1fe5..8046df0194 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -674,7 +674,7 @@ llama_model_loader::llama_model_loader(
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
-        files.emplace_back(new llama_file(fileno(file)));
+        files.emplace_back(new llama_file(file));
         contexts.emplace_back(ctx);
 
         // Save tensors data offset info of the main file.
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
index 8ebd16ba82..78ca95dcbd 100644
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@@ -742,7 +742,7 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
             /*ctx      =*/ hft >= offset_has_data ? &ctx : nullptr,
         };
 
-        struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params);
+        struct gguf_context * gguf_ctx = gguf_init_from_file_ptr(file, gguf_params);
 
         if (expect_context_not_null(hft)) {
             printf("%s:   - context_not_null: ", __func__);
@@ -1137,7 +1137,7 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
         /*no_alloc =*/ false,
         /*ctx      =*/ only_meta ? nullptr : &ctx_1,
     };
-    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params);
+    struct gguf_context * gguf_ctx_1 = gguf_init_from_file_ptr(file, gguf_params);
 
     printf("%s: same_version: ", __func__);
     if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) {