parent bac8bed248
commit 76681e3c73
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.swa_full = true;
}
).set_env("LLAMA_ARG_SWA_FULL"));
add_opt(common_arg(
{"--graph-reuse", "-gr"},
string_format("reuse previous compute graphs when possible (default: %s)"
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14482)", params.graph_reuse ? "true" : "false"),
[](common_params & params) {
params.graph_reuse = true;
}
).set_env("LLAMA_ARG_GRAPH_REUSE"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
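The hunk above adds the new toggle to the common argument parser: graph reuse stays off by default (graph_reuse = false in common_params, see below) and can be switched on with --graph-reuse / -gr or, per the set_env call, with the LLAMA_ARG_GRAPH_REUSE environment variable.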
@@ -1157,6 +1157,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;
cparams.graph_reuse = params.graph_reuse;

cparams.type_k = params.cache_type_k;
cparams.type_v = params.cache_type_v;
@@ -330,6 +330,7 @@ struct common_params {
bool no_perf = false; // disable performance metrics
bool ctx_shift = true; // context shift on inifinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool graph_reuse = false; // reuse previous compute graphs when possible

bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool use_mmap = true; // use mmap for faster loads
@@ -374,6 +374,8 @@ extern "C" {
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573

bool graph_reuse; // reuse previous compute graphs when possible
};

// model quantization parameters
@@ -1429,6 +1431,7 @@ extern "C" {

int32_t n_p_eval;
int32_t n_eval;
int32_t n_reused;
};

struct llama_perf_sampler_data {
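For context, a minimal sketch of how the new public knobs could be exercised through the C API once this commit lands; llama_init_from_model, llama_context_default_params and llama_perf_context already exist upstream, while graph_reuse and n_reused are the fields introduced here:

    #include "llama.h"
    #include <cstdio>

    // Create a context with compute-graph reuse enabled (new field in llama_context_params).
    static llama_context * ctx_with_graph_reuse(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.graph_reuse = true;
        return llama_init_from_model(model, cparams);
    }

    // After some decode calls, report how often the previous graph was reused
    // (new n_reused field in llama_perf_context_data).
    static void report_reuse(const llama_context * ctx) {
        const llama_perf_context_data data = llama_perf_context(ctx);
        std::printf("graphs reused: %d of %d eval calls\n", data.n_reused, data.n_eval);
    }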
@@ -34,6 +34,31 @@ struct llama_ubatch {
llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
int8_t * output; // [n_tokens] | i | -

bool is_same(const llama_ubatch & other) const {
bool res =
equal_seqs == other.equal_seqs &&
n_tokens == other.n_tokens &&
n_seq_tokens == other.n_seq_tokens &&
n_seqs == other.n_seqs &&
n_seqs_unq == other.n_seqs_unq &&
(
(!token && !other.token) ||
(!embd && !other.embd)
);

if (!res) {
return false;
}

// TODO: this won't work because seq_id_unq ptr can point to an old balloc that has
// been freed by this point. find a way to fix this
//for (uint32_t s = 0; s < n_seqs_unq; ++s) {
// res &= seq_id_unq[s] == other.seq_id_unq[s];
//}

return res;
}
};

// a helper for sanitizing, fulfilling and splitting a batch
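llama_ubatch::is_same() compares only the fields that determine the shape of the batch (token and sequence counts and, roughly, which of token/embd is populated); it deliberately does not dereference the data pointers, which may point into a batch allocator that has since been rebuilt. A reduced stand-alone illustration of that idea (hypothetical names, not the llama.cpp types):

    #include <cstdint>

    struct batch_shape {
        uint32_t n_tokens   = 0;
        uint32_t n_seqs     = 0;
        bool     has_tokens = false; // text tokens present
        bool     has_embd   = false; // raw embeddings present

        // Equal shapes are enough to reuse a graph; the actual token/embedding
        // values are fed in later through set_inputs().
        bool is_same(const batch_shape & other) const {
            return n_tokens   == other.n_tokens   &&
                   n_seqs     == other.n_seqs     &&
                   has_tokens == other.has_tokens &&
                   has_embd   == other.has_embd;
        }
    };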
@@ -101,7 +101,8 @@ llama_context::llama_context(
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

cparams.op_offload = params.op_offload;
cparams.op_offload = params.op_offload;
cparams.graph_reuse = params.graph_reuse;

const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -227,8 +228,8 @@ llama_context::llama_context(
LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

// buffer used to store the computation graph and the tensor meta data
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
gf_res_prev.reset(new llm_graph_result(max_nodes));
gf_res_reserve.reset(new llm_graph_result(max_nodes));

// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -388,10 +389,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
return sched.get();
}

ggml_context * llama_context::get_ctx_compute() const {
return ctx_compute.get();
}

uint32_t llama_context::n_ctx() const {
return cparams.n_ctx;
}
@@ -678,38 +675,52 @@ bool llama_context::apply_adapter_cvec(
return cvec.apply(model, data, len, n_embd, il_start, il_end);
}

llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
if (mctx && !mctx->apply()) {
LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}

auto * gf = graph_init();
if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
auto * res = gf_res_prev.get();
auto * gf = res->get_gf();

auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}
// the new graph parameters
// in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
const auto gparams = graph_params(res, ubatch, mctx, gtype);

// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
const bool can_reuse = cparams.graph_reuse && res->update(gparams);
if (can_reuse) {
LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
n_reused++;
} else {
res->reset();

if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

//const auto t_start_us = ggml_time_us();

gf = model.build_graph(gparams);

//LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);

if (!gf) {
LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
ret = GGML_STATUS_FAILED;
return nullptr;
}

if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
ret = GGML_STATUS_ALLOC_FAILED;
return nullptr;
}
}

res->set_inputs(&ubatch);

const auto status = graph_compute(gf, ubatch.n_tokens > 1);
const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
if (status != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
ret = status;
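The hunk above is the core of the change: process_ubatch now derives llm_graph_params for the new ubatch and asks the previously built result whether it can be updated in place; only when that fails does it reset, rebuild and re-allocate. A self-contained toy version of that control flow (illustrative types only, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical parameter set that fully determines the graph topology.
    struct graph_params {
        uint32_t n_tokens  = 0;
        uint32_t n_outputs = 0;
        bool operator==(const graph_params & o) const {
            return n_tokens == o.n_tokens && n_outputs == o.n_outputs;
        }
    };

    struct graph_result {
        graph_params params;
        bool built = false;

        // Analogous to res->update(gparams): reusable only if the stored
        // parameters describe the same topology as the new ones.
        bool update(const graph_params & p) { return built && params == p; }

        void rebuild(const graph_params & p) { params = p; built = true; /* build + alloc here */ }
    };

    int main() {
        graph_result res;
        int n_reused = 0;
        const bool graph_reuse = true;

        const graph_params steps[] = {{512, 1}, {1, 1}, {1, 1}, {1, 1}}; // prompt, then single-token decodes

        for (const auto & p : steps) {
            const bool can_reuse = graph_reuse && res.update(p);
            if (can_reuse) {
                n_reused++;       // same topology: only the input data is refreshed
            } else {
                res.rebuild(p);   // topology changed: build and allocate a new graph
            }
        }
        std::printf("graphs reused: %d\n", n_reused); // prints 2 for the steps above
    }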
@@ -767,9 +778,6 @@ int llama_context::encode(const llama_batch & batch_inp) {

n_outputs = n_tokens;

ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

const auto causal_attn_org = cparams.causal_attn;

// always use non-causal attention for encoder graphs
@@ -778,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
cparams.causal_attn = false;

ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);

cparams.causal_attn = causal_attn_org;

@@ -846,7 +854,9 @@ int llama_context::encode(const llama_batch & batch_inp) {

// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}

// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -1005,11 +1015,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
n_outputs = n_outputs_new;
}

ggml_backend_sched_reset(sched.get());
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

ggml_status status;
const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

if (!res) {
// the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1192,7 +1199,9 @@ int llama_context::decode(const llama_batch & batch_inp) {

// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
// overlap with device computation.
ggml_backend_sched_reset(sched.get());
if (!cparams.graph_reuse) {
ggml_backend_sched_reset(sched.get());
}

return 0;
}
@@ -1275,20 +1284,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
// graph
//

int32_t llama_context::graph_max_nodes() const {
return std::max<int32_t>(65536, 5*model.n_tensors());
}

ggml_cgraph * llama_context::graph_init() {
ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};

ctx_compute.reset(ggml_init(params));

return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
uint32_t llama_context::graph_max_nodes() const {
return std::max<uint32_t>(65536u, 5u*model.n_tensors());
}

ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@@ -1301,6 +1298,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
}

gf_res_prev->reset();
ggml_backend_sched_reset(sched.get());

// store the n_outputs as it is, and restore it afterwards
// TODO: not sure if needed, might simplify in the future by removing this
const auto save_n_outputs = this->n_outputs;
@@ -1310,18 +1310,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
auto * res = gf_res_reserve.get();

const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);

res->reset();

auto * gf = model.build_graph(gparams);

this->n_outputs = save_n_outputs;

if (!res) {
LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
return nullptr;
}

ggml_backend_sched_reset(sched.get());

// initialize scheduler with the specified graph
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
@@ -1331,28 +1329,27 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
return gf;
}

llm_graph_result_ptr llama_context::graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx) {
return model.build_graph(
{
/*.ctx =*/ ctx,
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
}, gf, gtype);
llm_graph_params llama_context::graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const {
return {
/*.arch =*/ model.arch,
/*.hparams =*/ model.hparams,
/*.cparams =*/ cparams,
/*.ubatch =*/ ubatch,
/*.gtype =*/ gtype,
/*.sched =*/ sched.get(),
/*.backend_cpu =*/ backend_cpu,
/*.cvec =*/ &cvec,
/*.loras =*/ &loras,
/*.mctx =*/ mctx,
/*.cross =*/ &cross,
/*.n_outputs =*/ n_outputs,
/*.cb =*/ graph_get_cb(),
/*.res =*/ res,
};
}

ggml_status llama_context::graph_compute(
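Note that with graph_reuse enabled the trailing ggml_backend_sched_reset() in encode()/decode() is skipped, and the per-ubatch reset plus eval-callback setup moves into the rebuild branch of process_ubatch, presumably so that the scheduler keeps the previous allocation alive for the next, potentially identical, graph.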
@@ -1930,6 +1927,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
data.t_eval_ms = 1e-3 * t_eval_us;
data.n_p_eval = std::max(1, n_p_eval);
data.n_eval = std::max(1, n_eval);
data.n_reused = std::max(0, n_reused);

return data;
}
@@ -1938,6 +1936,7 @@ void llama_context::perf_reset() {
t_start_us = ggml_time_us();
t_eval_us = n_eval = 0;
t_p_eval_us = n_p_eval = 0;
n_reused = 0;
}

//
@@ -2064,8 +2063,13 @@ void llama_context::opt_epoch_iter(
break;
}

auto * gf = graph_init();
auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
auto * res = gf_res_prev.get();

const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);

res->reset();

auto * gf = model.build_graph(gparams);

struct ggml_context * ctx_compute_opt;
{
@@ -2187,6 +2191,7 @@ llama_context_params llama_context_default_params() {
/*.no_perf =*/ true,
/*.op_offload =*/ true,
/*.swa_full =*/ true,
/*.graph_reuse =*/ false,
};

return result;
@@ -2807,6 +2812,7 @@ void llama_perf_context_print(const llama_context * ctx) {
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
}

void llama_perf_context_reset(llama_context * ctx) {
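Besides the counter itself, llama_perf_context_print now emits a "graphs reused = N" line, so any tool that already prints context perf data will show how often reuse actually kicked in.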
@@ -35,8 +35,6 @@ struct llama_context {

ggml_backend_sched_t get_sched() const;

ggml_context * get_ctx_compute() const;

uint32_t n_ctx() const;
uint32_t n_ctx_per_seq() const;
uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
// if memory_context is provided, it will be applied first to the context's memory
// ret contains the status of the graph computation
// returns nullptr only if ret != GGML_STATUS_SUCCESS
llm_graph_result_ptr process_ubatch(
llm_graph_result_i * process_ubatch(
const llama_ubatch & ubatch,
llm_graph_type gtype,
llama_memory_context_i * mctx,
@@ -188,10 +186,7 @@ private:
//

public:
int32_t graph_max_nodes() const;

// zero-out inputs and create the ctx_compute for the compute graph
ggml_cgraph * graph_init();
uint32_t graph_max_nodes() const;

// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +195,11 @@ public:
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);

private:
llm_graph_result_ptr graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_context_i * mctx);
llm_graph_params graph_params(
llm_graph_result_i * res,
const llama_ubatch & ubatch,
const llama_memory_context_i * mctx,
llm_graph_type gtype) const;

llm_graph_cb graph_get_cb() const;

@@ -258,8 +252,6 @@ private:
ggml_backend_t backend_cpu = nullptr;
std::vector<ggml_backend_ptr> backends;

ggml_context_ptr ctx_compute;

// training
ggml_opt_context_t opt_ctx = nullptr;

@@ -275,8 +267,8 @@ private:
std::vector<ggml_backend_t> backend_ptrs;
std::vector<ggml_backend_buffer_type_t> backend_buft;

// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
llm_graph_result_ptr gf_res_prev;
llm_graph_result_ptr gf_res_reserve;

// host buffer for the model output (logits and embeddings)
ggml_backend_buffer_ptr buf_output;
@@ -294,4 +286,6 @@ private:

mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
mutable int32_t n_eval = 0; // number of eval calls

mutable int32_t n_reused = 0; // number of times the previous graph was reused
};
@@ -3,3 +3,32 @@
size_t llama_max_parallel_sequences(void) {
return LLAMA_MAX_SEQ;
}

bool llama_cparams::is_same(const llama_cparams & other) const {
return
n_ctx == other.n_ctx &&
n_batch == other.n_batch &&
n_ubatch == other.n_ubatch &&
n_seq_max == other.n_seq_max &&
n_threads == other.n_threads &&
n_threads_batch == other.n_threads_batch &&
rope_freq_base == other.rope_freq_base &&
rope_freq_scale == other.rope_freq_scale &&
n_ctx_orig_yarn == other.n_ctx_orig_yarn &&
yarn_ext_factor == other.yarn_ext_factor &&
yarn_attn_factor == other.yarn_attn_factor &&
yarn_beta_fast == other.yarn_beta_fast &&
yarn_beta_slow == other.yarn_beta_slow &&
defrag_thold == other.defrag_thold &&
embeddings == other.embeddings &&
causal_attn == other.causal_attn &&
offload_kqv == other.offload_kqv &&
flash_attn == other.flash_attn &&
no_perf == other.no_perf &&
warmup == other.warmup &&
op_offload == other.op_offload &&
graph_reuse == other.graph_reuse &&
pooling_type == other.pooling_type &&
cb_eval == other.cb_eval &&
cb_eval_user_data == other.cb_eval_user_data;
}
@@ -33,9 +33,12 @@ struct llama_cparams {
bool no_perf;
bool warmup;
bool op_offload;
bool graph_reuse;

enum llama_pooling_type pooling_type;

ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data;

bool is_same(const llama_cparams & other) const;
};
@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_embd::update(const llm_graph_params & params) {
bool res = true;

res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);

return res;
}

void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_pos::update(const llm_graph_params & params) {
bool res = true;

res &= pos->ne[0] == params.ubatch.n_tokens;

return res;
}

void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
if (ubatch->pos && attn_scale) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
}
}

bool llm_graph_input_out_ids::update(const llm_graph_params & params) {
bool res = true;

res &= n_outputs == params.n_outputs;

return res;
}

void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = ubatch->n_tokens;
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;

res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_supports_set_rows(); // TODO: tmp

return res;
}

void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified_iswa::update(const llm_graph_params & params) {
const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);

this->mctx = mctx;

bool res = true;

res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs->ne[0] == params.ubatch.n_tokens;

res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens;

res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp

return res;
}

void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(cross_kq_mask);

@@ -395,7 +462,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
n_ctx_orig (cparams.n_ctx_orig_yarn),
pooling_type (cparams.pooling_type),
rope_type (hparams.rope_type),
ctx0 (params.ctx),
sched (params.sched),
backend_cpu (params.backend_cpu),
cvec (params.cvec),
@@ -403,7 +469,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
mctx (params.mctx),
cross (params.cross),
cb_func (params.cb),
res (std::make_unique<llm_graph_result>()) {
res (static_cast<llm_graph_result *>(params.res)),
ctx0 (res->get_ctx()) {
}

void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1,6 +1,7 @@
#pragma once

#include "llama-arch.h"
#include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"

@@ -14,7 +15,6 @@ struct ggml_cgraph;
struct ggml_context;
struct ggml_tensor;

struct llama_ubatch;
struct llama_cparams;

struct llama_memory_context_i;
@@ -69,6 +69,8 @@ struct llama_cross {
std::vector<std::set<llama_seq_id>> seq_ids_enc;
};

struct llm_graph_params;

//
// llm_graph_input
//
@@ -78,11 +80,19 @@ public:
virtual ~llm_graph_input_i() = default;

virtual void set_input(const llama_ubatch * ubatch) = 0;

// return true if the resulting input tensors using the provided graph parameters would be
// the same as the previous input tensors that we have currently stored in the object
virtual bool update(const llm_graph_params & params) {
// returning false here by default will prevent from reusing the graph if the check
// for the input type has not been implemented yet
GGML_UNUSED(params);
return false;
}
};

using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;


class llm_graph_input_embd : public llm_graph_input_i {
public:
llm_graph_input_embd() = default;
@@ -90,6 +100,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
};
@@ -101,6 +113,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * pos = nullptr; // I32 [n_batch]

const uint32_t n_pos_per_embd = 1;
@@ -154,17 +168,19 @@ public:
llm_graph_input_out_ids(
const llama_hparams & hparams,
const llama_cparams & cparams,
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
virtual ~llm_graph_input_out_ids() = default;

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * out_ids; // I32 [n_outputs]

const llama_hparams & hparams;
const llama_cparams & cparams;

const int32_t n_outputs;
const uint32_t n_outputs;
};

class llm_graph_input_mean : public llm_graph_input_i {
@@ -249,6 +265,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }

@@ -280,6 +298,8 @@ public:

void set_input(const llama_ubatch * ubatch) override;

bool update(const llm_graph_params & params) override;

ggml_tensor * get_k_idxs() const { return self_k_idxs; }
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
@@ -373,29 +393,110 @@ public:
// along with the input tensors, the object also provides commonly used outputs tensors, such as logits, embeddings, etc.
// these are used by the llama_context to extact the relevant data, based on the compute parameters

// TODO: this interface seems redundant - remove it
class llm_graph_result_i {
public:
virtual ~llm_graph_result_i() = default;

virtual ggml_tensor * get_tokens() = 0;
virtual ggml_tensor * get_logits() = 0;
virtual ggml_tensor * get_embd() = 0;
virtual ggml_tensor * get_embd_pooled() = 0;
virtual ggml_tensor * get_tokens() const = 0;
virtual ggml_tensor * get_logits() const = 0;
virtual ggml_tensor * get_embd() const = 0;
virtual ggml_tensor * get_embd_pooled() const = 0;

virtual ggml_cgraph * get_gf() = 0;
virtual ggml_context * get_ctx() = 0;

virtual void reset() = 0;

virtual void set_inputs(const llama_ubatch * ubatch) = 0;

virtual bool update(const llm_graph_params & params) = 0;
};

using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;

// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

struct llm_graph_params {
llm_arch arch = LLM_ARCH_UNKNOWN;

llama_hparams hparams;
llama_cparams cparams;

llama_ubatch ubatch; // note: intentionally make a copy

llm_graph_type gtype;

ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;

uint32_t n_outputs;

llm_graph_cb cb;

// TODO: temporary
llm_graph_result_i * res;

bool is_same(const llm_graph_params & other) const {
return
hparams.is_same(other.hparams) &&
cparams.is_same(other.cparams) &&
ubatch .is_same(other.ubatch) &&
arch == other.arch &&
gtype == other.gtype &&
cvec == other.cvec &&
loras == other.loras &&
cross == other.cross &&
n_outputs == other.n_outputs;
}
};

class llm_graph_result : public llm_graph_result_i {
public:
llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
reset();
}

virtual ~llm_graph_result() = default;

ggml_tensor * get_tokens() override { return t_tokens; }
ggml_tensor * get_logits() override { return t_logits; }
ggml_tensor * get_embd() override { return t_embd; }
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
ggml_tensor * get_tokens() const override { return t_tokens; }
ggml_tensor * get_logits() const override { return t_logits; }
ggml_tensor * get_embd() const override { return t_embd; }
ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }

ggml_cgraph * get_gf() override { return gf; }
ggml_context * get_ctx() override { return ctx_compute.get(); }

void set_max_nodes(int64_t max_nodes) {
this->max_nodes = max_nodes;
}

void reset() override {
t_tokens = nullptr;
t_logits = nullptr;
t_embd = nullptr;
t_embd_pooled = nullptr;

inputs.clear();

buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

ggml_init_params params = {
/*.mem_size =*/ buf_compute_meta.size(),
/*.mem_buffer =*/ buf_compute_meta.data(),
/*.no_alloc =*/ true,
};

ctx_compute.reset(ggml_init(params));

gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
}

void set_inputs(const llama_ubatch * ubatch) override {
for (auto & input : inputs) {
@@ -403,6 +504,25 @@ public:
}
}

// try to update the existing graph result using the new graph parameters
// this can only be done if we determine that the resulting graph using the new graph parameters
// would be identical to the existing graph. in that case, we simply have to update the memory
// contexts of the input tensors of the graph and we can reuse it for another computation
// return true if the graph was updated and can be reused
bool update(const llm_graph_params & params) override {
if (!this->params.is_same(params)) {
return false;
}

bool res = true;

for (auto & input : inputs) {
res &= input->update(params);
}

return res;
}

llm_graph_input_i * add_input(llm_graph_input_ptr input) {
inputs.emplace_back(std::move(input));
return inputs.back().get();
@@ -415,37 +535,26 @@ public:
ggml_tensor * t_embd_pooled = nullptr;

std::vector<llm_graph_input_ptr> inputs;

ggml_context_ptr ctx_compute;

// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;

ggml_cgraph * gf;

int64_t max_nodes;

// keep a copy of the previous graph parameters
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
// note: these are updated after constructing the new graph
llm_graph_params params;
};

//
// llm_graph_context
//

// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

struct llm_graph_params {
ggml_context * ctx;

const llm_arch arch;

const llama_hparams & hparams;
const llama_cparams & cparams;
const llama_ubatch & ubatch;

ggml_backend_sched_t sched;
ggml_backend_t backend_cpu;

const llama_adapter_cvec * cvec;
const llama_adapter_loras * loras;
const llama_memory_context_i * mctx;
const llama_cross * cross;

uint32_t n_outputs;

const llm_graph_cb & cb;
};

// used in build_rs to properly order writes and avoid unnecessary copies
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
@@ -485,8 +594,6 @@ struct llm_graph_context {
const enum llama_pooling_type pooling_type;
const enum llama_rope_type rope_type;

ggml_context * ctx0 = nullptr;

ggml_backend_sched_t sched;

ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
@@ -498,7 +605,9 @@ struct llm_graph_context {

const llm_graph_cb & cb_func;

std::unique_ptr<llm_graph_result> res;
llm_graph_result * res;

ggml_context * ctx0 = nullptr;

llm_graph_context(const llm_graph_params & params);
virtual ~llm_graph_context() = default;
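The reuse decision is delegated to the inputs: llm_graph_result::update() first checks that the stored llm_graph_params are equivalent, then asks every registered input whether its tensors would come out the same, and the base-class update() returning false makes unported input types veto reuse. A stripped-down stand-alone sketch of that pattern (hypothetical names, not the llama.cpp classes):

    #include <memory>
    #include <vector>

    struct params_t { int n_tokens; };

    struct input_i {
        virtual ~input_i() = default;
        virtual bool update(const params_t &) { return false; } // conservative default: no reuse
    };

    struct input_pos : input_i {
        int n_tokens = 0;
        bool update(const params_t & p) override { return n_tokens == p.n_tokens; }
    };

    struct result_t {
        std::vector<std::unique_ptr<input_i>> inputs;

        bool update(const params_t & p) {
            bool ok = true;
            for (auto & inp : inputs) {
                ok &= inp->update(p); // a single false disables reuse of the whole graph
            }
            return ok;
        }
    };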
@@ -102,3 +102,12 @@ bool llama_hparams::is_swa(uint32_t il) const {

GGML_ABORT("fatal error");
}

bool llama_hparams::is_same(const llama_hparams & other) const {
return
n_ctx_train == other.n_ctx_train &&
n_embd == other.n_embd &&
n_layer == other.n_layer &&
n_expert == other.n_expert &&
n_expert_used == other.n_expert_used;
}
@@ -202,6 +202,8 @@ struct llama_hparams {
uint32_t n_pos_per_embd() const;

bool is_swa(uint32_t il) const;

bool is_same(const llama_hparams & other) const;
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -68,6 +68,8 @@ llama_kv_cache_unified::llama_kv_cache_unified(

cells.resize(kv_size);

gf_res.reset(new llm_graph_result(32768)); // note: the max nodes will be updated later

for (uint32_t il = 0; il < n_layer_cache; il++) {
if (filter && !filter(il)) {
LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
@@ -158,7 +160,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;

const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;

if (!supports_set_rows) {
LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
@@ -480,14 +482,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
ggml_backend_sched_reset(sched);

auto * gf = lctx->graph_init();
auto * res = gf_res.get();

auto res = build_graph_shift(lctx->get_cparams(), lctx->get_ctx_compute(), gf);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for K-shift\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();

auto * gf = build_graph_shift(res, lctx);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for K-shift\n", __func__);
return updated;
@@ -529,14 +529,12 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d

ggml_backend_sched_reset(sched);

auto * gf = lctx->graph_init();
auto * res = gf_res.get();

auto res = build_graph_defrag(lctx->get_cparams(), lctx->get_ctx_compute(), gf, dinfo);
if (!res) {
LLAMA_LOG_ERROR("%s: failed to build graph for defrag\n", __func__);
return updated;
}
res->set_max_nodes(lctx->graph_max_nodes());
res->reset();

auto * gf = build_graph_defrag(res, lctx, dinfo);
if (!ggml_backend_sched_alloc_graph(sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
return updated;
@@ -780,6 +778,10 @@ uint32_t llama_kv_cache_unified::get_n_kv() const {
return std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad)));
}

bool llama_kv_cache_unified::get_supports_set_rows() const {
return supports_set_rows;
}

ggml_tensor * llama_kv_cache_unified::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const {
const int32_t ikv = map_layer_ids.at(il);

@@ -1142,11 +1144,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
}
}

llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();

const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v;
@@ -1156,6 +1156,8 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, cells.size());
ggml_set_input(inp->k_shift);

const auto & cparams = lctx->get_cparams();

for (const auto & layer : layers) {
const uint32_t il = layer.il;

@@ -1181,18 +1183,20 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(

res->add_input(std::move(inp));

return res;
return gf;
}

llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
const defrag_info & dinfo) const {
auto res = std::make_unique<llm_graph_result>();
ggml_cgraph * llama_kv_cache_unified::build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const {
auto * ctx = res->get_ctx();
auto * gf = res->get_gf();

const auto & ids = dinfo.ids;

const auto & cparams = lctx->get_cparams();

#if 0
// CPU defrag
//
@@ -1329,7 +1333,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
#endif

return res;
return gf;
}

llama_kv_cache_unified::defrag_info llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) const {
@@ -1940,6 +1944,10 @@ uint32_t llama_kv_cache_unified_context::get_n_kv() const {
return n_kv;
}

bool llama_kv_cache_unified_context::get_supports_set_rows() const {
return kv->get_supports_set_rows();
}

ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
return kv->get_k(ctx, il, n_kv);
}
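With llama_context::graph_init() gone, the unified KV cache now owns a persistent llm_graph_result (gf_res) and builds its K-shift and defrag graphs into it, pulling cparams from the passed-in llama_context instead of taking them as arguments; the new get_supports_set_rows() accessors appear to exist so the attention inputs can refuse graph reuse while the temporary LLAMA_SET_ROWS fallback path is active.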
@@ -121,6 +121,9 @@ public:

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
@@ -193,13 +196,15 @@ private:

// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
int supports_set_rows = false;
bool supports_set_rows = false;

const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;

std::unique_ptr<llm_graph_result> gf_res;

llama_kv_cells_unified cells;

std::vector<kv_layer> layers;
@@ -226,15 +231,13 @@ private:
float freq_base,
float freq_scale) const;

llm_graph_result_ptr build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
ggml_cgraph * build_graph_shift(
llm_graph_result * res,
llama_context * lctx) const;

llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
ggml_cgraph * build_graph_defrag(
llm_graph_result * res,
llama_context * lctx,
const defrag_info & dinfo) const;

void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
@@ -288,6 +291,9 @@ public:

uint32_t get_n_kv() const;

// TODO: temporary
bool get_supports_set_rows() const;

// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
@@ -14751,10 +14751,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
return res;
}

llm_graph_result_ptr llama_model::build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const {
ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
// TODO: temporary - will refactor this to keep the "gf" instance in the llm_graph_context and avoid passing it everywhere
auto * gf = params.res->get_gf();

std::unique_ptr<llm_graph_context> llm;

switch (arch) {
@@ -14961,7 +14961,7 @@ llm_graph_result_ptr llama_model::build_graph(
} break;
case LLM_ARCH_T5:
{
switch (type) {
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
break;
@@ -15047,7 +15047,10 @@ llm_graph_result_ptr llama_model::build_graph(
// add on pooling layer
llm->build_pooling(gf, cls, cls_b, cls_out, cls_out_b);

return std::move(llm->res);
// TODO: updating the graph parameters here is a little bit obscure - figure out something better
llm->res->params = params;

return llm->res->get_gf();
}

//
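llama_model::build_graph() now takes only the llm_graph_params, returns the raw ggml_cgraph and leaves the result object, including a copy of the parameters that were used to build it, inside params.res; those stored parameters are what later update() calls compare against when deciding whether the graph can be reused.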
@@ -436,10 +436,7 @@ struct llama_model {
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

// TODO: move this to new llm_arch_model_i interface
llm_graph_result_ptr build_graph(
const llm_graph_params & params,
ggml_cgraph * gf,
llm_graph_type type) const;
ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
struct impl;