From 829c25007343b0cb6f7241f3ebb2f389106ba19a Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Mon, 30 Mar 2026 14:54:46 +0200 Subject: [PATCH 1/5] tests: allow exporting graph ops from HF file without downloading weights --- common/arg.cpp | 8 ++-- tests/CMakeLists.txt | 4 ++ tests/export-graph-ops.cpp | 72 +++++++++++++++++++++++++--- tests/gguf-model-data.cpp | 98 +++++++++++++++++++++++++++++++++++++- tests/gguf-model-data.h | 6 +++ 5 files changed, 177 insertions(+), 11 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 538d2a4b0a..7dc6d0d0a8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -537,9 +537,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } catch (const std::exception & e) { LOG_WRN("HF cache migration failed: %s\n", e.what()); } + // export_graph_ops loads only metadata + const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; // maybe handle remote preset - if (!params.model.hf_repo.empty()) { + if (!params.model.hf_repo.empty() && !skip_model_download) { std::string cli_hf_repo = params.model.hf_repo; bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex); @@ -570,7 +572,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } // handle model and download - { + if (!skip_model_download) { auto res = common_params_handle_model(params.model, params.hf_token, params.offline); if (params.no_mmproj) { params.mmproj = {}; @@ -591,7 +593,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // model is required (except for server) // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) { throw std::invalid_argument("error: --model is 
required\n"); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9582164b58..8355c08070 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -287,3 +287,7 @@ target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) llama_build(export-graph-ops.cpp) target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +if (TARGET gguf-model-data) + target_link_libraries(export-graph-ops PRIVATE gguf-model-data) + target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH) +endif() diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp index 754089d068..82f674787b 100644 --- a/tests/export-graph-ops.cpp +++ b/tests/export-graph-ops.cpp @@ -1,15 +1,26 @@ #include "arg.h" #include "common.h" #include "log.h" -#include "llama.h" +#include "llama-cpp.h" #include "../src/llama-ext.h" #include "ggml.h" +#include "gguf-model-data.h" +#include "gguf.h" +#include "ggml-backend.h" +#include "download.h" #include #include #include #include #include +#include + +// Noop because weights are not needed +static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) { + GGML_UNUSED(tensor); + GGML_UNUSED(userdata); +} struct input_tensor { ggml_type type; @@ -132,9 +143,50 @@ int main(int argc, char ** argv) { params.warmup = false; - auto init_result = common_init_from_params(params); + llama_context * ctx; + common_init_result_ptr init_result; + llama_model_ptr model; - llama_context * ctx = init_result->context(); + if (params.model.hf_repo.empty()) { + init_result = common_init_from_params(params); + + ctx = init_result->context(); + } else { +#ifdef LLAMA_HF_FETCH + auto [hf_repo, hf_quant] = common_download_split_repo_tag(params.model.hf_repo); + if (hf_quant.empty() || hf_quant == "latest") { + hf_quant = "Q4_K_M"; + } + + gguf_context * gguf_ctx = gguf_fetch_gguf_ctx(hf_repo, hf_quant); + if (!gguf_ctx) { + LOG_ERR("failed to fetch GGUF metadata from %s\n", hf_repo.c_str()); + 
return 1; + } + + llama_model_params model_params = llama_model_default_params(); + model_params.devices = params.devices.data(); + + model.reset(llama_model_init_from_user(gguf_ctx, set_tensor_data, nullptr, model_params)); + gguf_free(gguf_ctx); + + if (!model) { + LOG_ERR("failed to create llama_model from %s\n", hf_repo.c_str()); + return 1; + } + + llama_context_params ctx_params = llama_context_default_params(); + ctx = llama_init_from_model(model.get(), ctx_params); + + if (!ctx) { + LOG_ERR("failed to create llama_context\n"); + return 1; + } +#else + LOG_ERR("export-graph-ops compiled without HF fetch support\n"); + return 1; +#endif + } const uint32_t n_seqs = llama_n_seq_max(ctx); const uint32_t n_tokens = std::min(llama_n_ctx(ctx), llama_n_ubatch(ctx)); @@ -143,13 +195,15 @@ int main(int argc, char ** argv) { auto * gf_pp = llama_graph_reserve(ctx, n_tokens, n_seqs, n_tokens); if (!gf_pp) { - throw std::runtime_error("failed to reserve prompt processing graph"); + LOG_ERR("failed to reserve prompt processing graph\n"); + return 1; } extract_graph_ops(gf_pp, "pp", tests); auto * gf_tg = llama_graph_reserve(ctx, n_seqs, n_seqs, n_seqs); if (!gf_tg) { - throw std::runtime_error("failed to reserve token generation graph"); + LOG_ERR("failed to reserve token generation graph\n"); + return 1; } extract_graph_ops(gf_tg, "tg", tests); @@ -158,12 +212,18 @@ int main(int argc, char ** argv) { std::ofstream f(params.out_file); if (!f.is_open()) { - throw std::runtime_error("Unable to open output file"); + LOG_ERR("unable to open output file: %s\n", params.out_file.c_str()); + return 1; } for (const auto& test : tests) { test.serialize(f); } + if (!params.model.hf_repo.empty()) { + // Context is not owned by common_init_result in this case + llama_free(ctx); + } + return 0; } diff --git a/tests/gguf-model-data.cpp b/tests/gguf-model-data.cpp index 3bc82c88da..343f86d918 100644 --- a/tests/gguf-model-data.cpp +++ b/tests/gguf-model-data.cpp @@ -531,14 +531,18 @@ 
static std::optional fetch_and_parse( return std::nullopt; } +static std::string get_cache_file_path(const std::string& cdir, const std::string& repo_part, const std::string& filename) { + std::string fname_part = sanitize_for_path(filename); + return cdir + "/" + repo_part + "--" + fname_part + ".partial"; +} + // Try cache first, then fetch and parse a single GGUF shard. static std::optional fetch_or_cached( const std::string & repo, const std::string & filename, const std::string & cdir, const std::string & repo_part) { - std::string fname_part = sanitize_for_path(filename); - std::string cache_path = cdir + "/" + repo_part + "--" + fname_part + ".partial"; + std::string cache_path = get_cache_file_path(cdir, repo_part, filename); { std::vector cached; @@ -611,3 +615,93 @@ std::optional gguf_fetch_model_meta( return model_opt; } + +gguf_context * gguf_fetch_gguf_ctx( + const std::string & repo, + const std::string & quant, + const std::string & cache_dir) { + std::string cdir = cache_dir.empty() ? 
get_default_cache_dir() : cache_dir; + std::string repo_part = sanitize_for_path(repo); + + std::string split_prefix; + std::string filename = detect_gguf_filename(repo, quant, split_prefix); + + if (filename.empty()) { + return nullptr; + } + + auto model_opt = fetch_or_cached(repo, filename, cdir, repo_part); + if (!model_opt.has_value()) { + fprintf(stderr, "gguf_fetch: failed to fetch %s\n", filename.c_str()); + return nullptr; + } + + auto & model = model_opt.value(); + + const std::string cache_path = get_cache_file_path(cdir, repo_part, filename); + + ggml_context * ggml_ctx; + gguf_init_params params{true, &ggml_ctx}; + gguf_context * ctx = gguf_init_from_file(cache_path.c_str(), params); + + if (ctx == nullptr) { + fprintf(stderr, "gguf_fetch: gguf_init_from_file failed\n"); + ggml_free(ggml_ctx); + return nullptr; + } + + // If the model is split across multiple files we need to fetch the remaining shards metadata + if (model.n_split > 1) { + if (split_prefix.empty()) { + fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split); + gguf_free(ctx); + ggml_free(ggml_ctx); + return nullptr; + } + + fprintf(stderr, "gguf_fetch: split model with %u shards, fetching remaining %u...\n", + model.n_split, model.n_split - 1); + + for (int i = 2; i <= model.n_split; i++) { + char num_buf[6], total_buf[6]; + snprintf(num_buf, sizeof(num_buf), "%05d", i); + snprintf(total_buf, sizeof(total_buf), "%05d", (int)model.n_split); + std::string shard_name = split_prefix + "-" + num_buf + "-of-" + total_buf + ".gguf"; + + auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part); + if (!shard.has_value()) { + fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str()); + gguf_free(ctx); + ggml_free(ggml_ctx); + return nullptr; + } + + // Load tensors from shard and add to main gguf_context + const std::string shard_path = get_cache_file_path(cdir, repo_part, shard_name); + ggml_context * 
shard_ggml_ctx; + gguf_init_params shard_params{true, &shard_ggml_ctx}; + gguf_context * shard_ctx = gguf_init_from_file(shard_path.c_str(), shard_params); + + if (shard_ctx == nullptr) { + fprintf(stderr, "gguf_fetch: shard gguf_init_from_file failed\n"); + ggml_free(shard_ggml_ctx); + gguf_free(ctx); + ggml_free(ggml_ctx); + return nullptr; + } + + for (ggml_tensor * t = ggml_get_first_tensor(shard_ggml_ctx); t; t = ggml_get_next_tensor(shard_ggml_ctx, t)) { + gguf_add_tensor(ctx, t); + } + + gguf_free(shard_ctx); + ggml_free(shard_ggml_ctx); + } + + gguf_set_val_u16(ctx, "split.count", 1); + } + + ggml_free(ggml_ctx); + + return ctx; +} diff --git a/tests/gguf-model-data.h b/tests/gguf-model-data.h index ed433791ad..9c2ff02513 100644 --- a/tests/gguf-model-data.h +++ b/tests/gguf-model-data.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "gguf.h" #include #include @@ -40,3 +41,8 @@ std::optional gguf_fetch_model_meta( const std::string & repo, const std::string & quant = "Q8_0", const std::string & cache_dir = ""); // empty = default + +gguf_context * gguf_fetch_gguf_ctx( + const std::string & repo, + const std::string & quant = "Q8_0", + const std::string & cache_dir = ""); From 9037b78263a571b81a3ba97f504c67ed6bb781e7 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 09:26:33 +0200 Subject: [PATCH 2/5] use unique_ptr for llama_context in HF metadata case --- tests/export-graph-ops.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp index 82f674787b..e37855eee6 100644 --- a/tests/export-graph-ops.cpp +++ b/tests/export-graph-ops.cpp @@ -145,6 +145,7 @@ int main(int argc, char ** argv) { llama_context * ctx; common_init_result_ptr init_result; + llama_context_ptr ctx2; llama_model_ptr model; if (params.model.hf_repo.empty()) { @@ -176,7 +177,8 @@ int main(int argc, char ** argv) { } llama_context_params ctx_params = llama_context_default_params(); - ctx 
= llama_init_from_model(model.get(), ctx_params); + ctx2.reset(llama_init_from_model(model.get(), ctx_params)); + ctx = ctx2.get(); if (!ctx) { LOG_ERR("failed to create llama_context\n"); @@ -220,10 +222,5 @@ int main(int argc, char ** argv) { test.serialize(f); } - if (!params.model.hf_repo.empty()) { - // Context is not owned by common_init_result in this case - llama_free(ctx); - } - return 0; } From b7870ef6b824bb21b74e496a0f22319c2672fefb Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 09:50:59 +0200 Subject: [PATCH 3/5] fix missing non-required tensors falling back to type f32 --- src/llama-model-loader.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 3d549cae5b..9dbc108036 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1215,6 +1215,10 @@ struct ggml_tensor * llama_model_loader::create_tensor( const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); if (tid != -1) { type = gguf_get_tensor_type(metadata, tid); + } else if (flags & TENSOR_NOT_REQUIRED) { + // If the tensor is not found and not required, return nullptr to allow + // the caller to fall back + return nullptr; } // for tensors that are not required some of the dimensions can be invalid: From d6fc8fe0c74069b752c91f338639a7de972f2b9f Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 11:22:36 +0200 Subject: [PATCH 4/5] use unique pointers where possible --- tests/export-graph-ops.cpp | 5 ++--- tests/gguf-model-data.cpp | 32 ++++++++++++-------------------- tests/gguf-model-data.h | 4 ++-- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp index e37855eee6..2d75a27960 100644 --- a/tests/export-graph-ops.cpp +++ b/tests/export-graph-ops.cpp @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { hf_quant = "Q4_K_M"; } - gguf_context * gguf_ctx = gguf_fetch_gguf_ctx(hf_repo, hf_quant); + 
gguf_context_ptr gguf_ctx = gguf_fetch_gguf_ctx(hf_repo, hf_quant); if (!gguf_ctx) { LOG_ERR("failed to fetch GGUF metadata from %s\n", hf_repo.c_str()); return 1; @@ -168,8 +168,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.devices = params.devices.data(); - model.reset(llama_model_init_from_user(gguf_ctx, set_tensor_data, nullptr, model_params)); - gguf_free(gguf_ctx); + model.reset(llama_model_init_from_user(gguf_ctx.get(), set_tensor_data, nullptr, model_params)); if (!model) { LOG_ERR("failed to create llama_model from %s\n", hf_repo.c_str()); diff --git a/tests/gguf-model-data.cpp b/tests/gguf-model-data.cpp index 343f86d918..adfd6bec68 100644 --- a/tests/gguf-model-data.cpp +++ b/tests/gguf-model-data.cpp @@ -4,6 +4,7 @@ #include "gguf-model-data.h" #include "common.h" +#include "ggml-cpp.h" #include "gguf.h" #include @@ -616,7 +617,7 @@ std::optional gguf_fetch_model_meta( return model_opt; } -gguf_context * gguf_fetch_gguf_ctx( +gguf_context_ptr gguf_fetch_gguf_ctx( const std::string & repo, const std::string & quant, const std::string & cache_dir) { @@ -640,13 +641,14 @@ gguf_context * gguf_fetch_gguf_ctx( const std::string cache_path = get_cache_file_path(cdir, repo_part, filename); - ggml_context * ggml_ctx; + ggml_context_ptr ggml_ctx_ptr; + ggml_context * ggml_ctx{}; gguf_init_params params{true, &ggml_ctx}; - gguf_context * ctx = gguf_init_from_file(cache_path.c_str(), params); + gguf_context_ptr ctx{gguf_init_from_file(cache_path.c_str(), params)}; + ggml_ctx_ptr.reset(ggml_ctx); if (ctx == nullptr) { fprintf(stderr, "gguf_fetch: gguf_init_from_file failed\n"); - ggml_free(ggml_ctx); return nullptr; } @@ -654,8 +656,6 @@ gguf_context * gguf_fetch_gguf_ctx( if (model.n_split > 1) { if (split_prefix.empty()) { fprintf(stderr, "gguf_fetch: model reports %u splits but filename has no split pattern\n", model.n_split); - gguf_free(ctx); - ggml_free(ggml_ctx); return nullptr; } @@ 
-671,37 +671,29 @@ gguf_context * gguf_fetch_gguf_ctx( auto shard = fetch_or_cached(repo, shard_name, cdir, repo_part); if (!shard.has_value()) { fprintf(stderr, "gguf_fetch: failed to fetch shard %d: %s\n", i, shard_name.c_str()); - gguf_free(ctx); - ggml_free(ggml_ctx); return nullptr; } // Load tensors from shard and add to main gguf_context const std::string shard_path = get_cache_file_path(cdir, repo_part, shard_name); - ggml_context * shard_ggml_ctx; + ggml_context_ptr shard_ggml_ctx_ptr; + ggml_context * shard_ggml_ctx{}; gguf_init_params shard_params{true, &shard_ggml_ctx}; - gguf_context * shard_ctx = gguf_init_from_file(shard_path.c_str(), shard_params); + gguf_context_ptr shard_ctx{gguf_init_from_file(shard_path.c_str(), shard_params)}; + shard_ggml_ctx_ptr.reset(shard_ggml_ctx); if (shard_ctx == nullptr) { fprintf(stderr, "gguf_fetch: shard gguf_init_from_file failed\n"); - ggml_free(shard_ggml_ctx); - gguf_free(ctx); - ggml_free(ggml_ctx); return nullptr; } for (ggml_tensor * t = ggml_get_first_tensor(shard_ggml_ctx); t; t = ggml_get_next_tensor(shard_ggml_ctx, t)) { - gguf_add_tensor(ctx, t); + gguf_add_tensor(ctx.get(), t); } - - gguf_free(shard_ctx); - ggml_free(shard_ggml_ctx); } - gguf_set_val_u16(ctx, "split.count", 1); + gguf_set_val_u16(ctx.get(), "split.count", 1); } - ggml_free(ggml_ctx); - return ctx; } diff --git a/tests/gguf-model-data.h b/tests/gguf-model-data.h index 9c2ff02513..61ce24bb05 100644 --- a/tests/gguf-model-data.h +++ b/tests/gguf-model-data.h @@ -1,6 +1,6 @@ #pragma once -#include "ggml.h" +#include "ggml-cpp.h" #include "gguf.h" #include @@ -42,7 +42,7 @@ std::optional gguf_fetch_model_meta( const std::string & quant = "Q8_0", const std::string & cache_dir = ""); // empty = default -gguf_context * gguf_fetch_gguf_ctx( +gguf_context_ptr gguf_fetch_gguf_ctx( const std::string & repo, const std::string & quant = "Q8_0", const std::string & cache_dir = ""); From 646f0a7d78e399dcf8884db22d0e54df3b39c7b2 Mon Sep 17 00:00:00 2001 
From: Ruben Ortlam Date: Tue, 31 Mar 2026 14:39:43 +0200 Subject: [PATCH 5/5] use no_alloc instead of fixing f32 fallback --- common/common.cpp | 1 + common/common.h | 1 + src/llama-model-loader.cpp | 4 ---- tests/export-graph-ops.cpp | 1 + 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a9bd494191..a99862db5a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1434,6 +1434,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.progress_callback = params.load_progress_callback; mparams.progress_callback_user_data = params.load_progress_callback_user_data; + mparams.no_alloc = params.no_alloc; return mparams; } diff --git a/common/common.h b/common/common.h index 17dc3fb232..31a337daa6 100644 --- a/common/common.h +++ b/common/common.h @@ -679,6 +679,7 @@ struct common_params { // return false from callback to abort model loading or true to continue llama_progress_callback load_progress_callback = NULL; void * load_progress_callback_user_data = NULL; + bool no_alloc = false; // Don't allocate model buffers }; // call once at the start of a program if it uses libcommon diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 9dbc108036..3d549cae5b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1215,10 +1215,6 @@ struct ggml_tensor * llama_model_loader::create_tensor( const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); if (tid != -1) { type = gguf_get_tensor_type(metadata, tid); - } else if (flags & TENSOR_NOT_REQUIRED) { - // If the tensor is not found and not required, return nullptr to allow - // the caller to fall back - return nullptr; } // for tensors that are not required some of the dimensions can be invalid: diff --git a/tests/export-graph-ops.cpp b/tests/export-graph-ops.cpp index 2d75a27960..f4f82b8664 100644 --- a/tests/export-graph-ops.cpp +++ b/tests/export-graph-ops.cpp @@ -167,6 +167,7 @@ 
int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.devices = params.devices.data(); + model_params.no_alloc = true; model.reset(llama_model_init_from_user(gguf_ctx.get(), set_tensor_data, nullptr, model_params));