From 0f7daa9d1bce23b962d6c648dc4d7f71d338c8c6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 19:56:10 +0200 Subject: [PATCH] graph : move non-context related logic to llm_build_context ggml-ci --- src/llama-context.cpp | 620 +++++++++++------------------------------- src/llama-context.h | 118 ++++---- src/llama-graph.cpp | 12 +- src/llama-graph.h | 67 ++--- src/llama-model.cpp | 425 +++++++++++++++++++++++------ src/llama-model.h | 1 - 6 files changed, 579 insertions(+), 664 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8587f480fd..7ba86a2a7f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,26 +71,7 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_pos : public llama_graph_input_i { -public: - llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} - virtual ~llama_graph_input_pos() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * pos = nullptr; // I32 [n_batch] - - const int64_t n_pos_per_token = 1; -}; - -void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { - if (ubatch->pos && pos) { - const int64_t n_tokens = ubatch->n_tokens; - - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); - } -} - +// I32 [n_batch, n_batch] class llama_graph_input_pos_bucket : public llama_graph_input_i { public: llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -98,19 +79,17 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; }; void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { @@ -122,192 +101,6 @@ void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { } } -class llama_graph_input_out_ids : public llama_graph_input_i { -public: - llama_graph_input_out_ids( - const llama_hparams & hparams, - const llama_cparams & cparams, - int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} - virtual ~llama_graph_input_out_ids() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * out_ids; // I32 [n_outputs] - - const llama_hparams & hparams; - const llama_cparams & cparams; - - const int32_t n_outputs; -}; - -void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); - - if (!out_ids) { - LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch->n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); - int32_t * data = (int32_t *) out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch->output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch->output[i]) { - data[n_outputs++] 
= i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } -} - -class llama_graph_input_mean : public llama_graph_input_i { -public: - llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_mean() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * mean; // F32 [n_batch, n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(mean); - GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); - - float * data = (float *) mean->data; - memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch->n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } -} - -class llama_graph_input_cls : public llama_graph_input_i { -public: - llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} - virtual ~llama_graph_input_cls() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cls; // I32 [n_batch] - - const llama_cparams & cparams; -}; - -void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch->n_tokens; - const int64_t n_seq_tokens = ubatch->n_seq_tokens; - const int64_t n_seqs = ubatch->n_seqs; - - GGML_ASSERT(cls); - GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); - - uint32_t * data = (uint32_t *) cls->data; - memset(cls->data, 0, n_tokens * ggml_element_size(cls)); - - std::vector last_pos(n_tokens, -1); - 
std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch->seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } -} - class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -1359,14 +1152,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { return 0; } -// -// input -// - -int64_t llama_context_base::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; -} - // // output // @@ -1535,6 +1320,10 @@ enum ggml_status llama_context_base::graph_compute( // graph build API // +int32_t llama_context_base::get_n_outputs() const { + return n_outputs; +} + void llama_context_base::build_cb( ggml_tensor * cur, const char * name, @@ -1650,6 +1439,117 @@ ggml_tensor * llama_context_base::build_rope_factors(int il) const { return model.layers[il].rope_short; } +llama_graph_input_ptr llama_context_base::build_inp_embd( + ggml_context * ctx0, + ggml_tensor * tok_embd, + const llama_ubatch & ubatch) const { + const auto & hparams = model.hparams; + + const int64_t n_embd = hparams.n_embd; + + auto inp = std::make_shared(); + + auto & cur = inp->cur; + + if (ubatch.token) { + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + //cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + + cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); + + // apply lora for embedding tokens if needed + for (const auto & lora : loras) { + struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); + if (lw == nullptr) { + continue; + } + + const float adapter_scale = lora.second; + const float scale = lw->get_scale(lora.first->alpha, adapter_scale); + + struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( + ctx0, lw->b, // non-transposed lora_b + ggml_get_rows(ctx0, lw->a, inp->tokens) + ), scale); + + cur = ggml_add(ctx0, cur, inpL_delta); + } + } else { + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); + cur = inp->embd; + ggml_set_input(inp->embd); + } + + // For Granite architecture + if (hparams.f_embedding_scale != 0.0f) { + cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale); + } + + //cb(cur, "inp_embd", -1); + + return inp; +} + +llama_graph_input_ptr llama_context_base::build_inp_pos_bucket( + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = std::make_shared(model.hparams); + + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); + ggml_set_input(inp->cur); + + return inp; +} + +llama_graph_input_attn_ptr llama_context_base::build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = std::make_shared(model.hparams, cparams); + + // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch + GGML_UNUSED(causal); + GGML_UNUSED(swa); + + inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + //cb(inp_kq_mask, 
"KQ_mask", -1); + ggml_set_input(inp->kq_mask); + + inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; + + return inp; +} + +ggml_tensor * llama_context_base::build_attn( + llama_graph_input_attn_i * inp, + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + float kq_scale, + int il) const { + GGML_UNUSED(il); + + const auto & kq_mask = inp->get_kq_mask(); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + //cb(k, "v", il); + + ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); + + return cur; +} + ggml_tensor * llama_context_base::build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, @@ -1699,181 +1599,6 @@ ggml_tensor * llama_context_base::build_rope_shift( return tmp; } -ggml_tensor * llama_context_base::build_inp_embd( - llama_graph_result * res, - ggml_context * ctx0, - ggml_tensor * tok_embd, - const llama_ubatch & ubatch) const { - const auto & hparams = model.hparams; - - const int64_t n_embd = hparams.n_embd; - - auto inp = std::make_shared(); - - struct ggml_tensor * inpL; - - if (ubatch.token) { - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); - //cb(inp->tokens, "inp_tokens", -1); - ggml_set_input(inp->tokens); - - inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); - - // apply lora for embedding tokens if needed - for (const auto & lora : loras) { - struct llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd); - if (lw == nullptr) { - continue; - } - - const float adapter_scale = lora.second; - const float scale = lw->get_scale(lora.first->alpha, adapter_scale); - - struct ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat( - ctx0, lw->b, // non-transposed lora_b - ggml_get_rows(ctx0, lw->a, inp->tokens) - ), scale); - - inpL = ggml_add(ctx0, inpL, inpL_delta); - } - } else { - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens); - inpL = inp->embd; - ggml_set_input(inp->embd); - } - - // For Granite architecture - if (hparams.f_embedding_scale != 0.0f) { - inpL = ggml_scale(ctx0, inpL, hparams.f_embedding_scale); - } - - res->add_input(std::move(inp)); - - //cb(inpL, "inp_embd", -1); - - return inpL; -} - -ggml_tensor * llama_context_base::build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(n_pos_per_token()); - - inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); - ggml_set_input(inp->pos); - - res->add_input(inp); - - return inp->pos; -} - -ggml_tensor * llama_context_base::build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(model.hparams); - - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens); - ggml_set_input(inp->pos_bucket); - - res->add_input(inp); - - return inp->pos_bucket; -} - -ggml_tensor * llama_context_base::build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const { - auto inp = std::make_shared(model.hparams, cparams, n_outputs); - - inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); - ggml_set_input(inp->out_ids); - - res->add_input(inp); - - return inp->out_ids; -} - -ggml_tensor * 
llama_context_base::build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); - ggml_set_input(inp->mean); - - res->add_input(inp); - - return inp->mean; -} - -ggml_tensor * llama_context_base::build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const { - auto inp = std::make_shared(cparams); - - inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->cls); - - res->add_input(inp); - - return inp->cls; -} - -llama_graph_input_attn_ptr llama_context_base::build_attn_inp( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens, - bool causal, - bool swa) const { - auto inp = std::make_shared(model.hparams, cparams); - - // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch - GGML_UNUSED(causal); - GGML_UNUSED(swa); - - inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); - //cb(inp_kq_mask, "KQ_mask", -1); - ggml_set_input(inp->kq_mask); - - inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask; - - res->add_input(inp); - - return inp; -} - -ggml_tensor * llama_context_base::build_attn( - llama_graph_input_attn_i * inp, - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_b, - float kq_scale, - int il) const { - GGML_UNUSED(il); - - const auto & kq_mask = inp->get_kq_mask(); - - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); - - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - //cb(k, "k", il); - - ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); - //cb(k, "v", il); - - ggml_tensor * cur = build_attn_mha(ctx0, gf, q, k, v, kq_b, kq_mask, false, kq_scale); - - return cur; -} - ggml_tensor * llama_context_base::build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, @@ -2485,6 +2210,7 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_i // llama_context_kv_self // +// I32 [n_kv, n_batch] class llama_graph_input_pos_bucket_kv : public llama_graph_input_i { public: llama_graph_input_pos_bucket_kv( @@ -2494,20 +2220,18 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] - const llama_hparams & hparams; const llama_kv_cache_unified * kv_self; }; void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { - if (pos_bucket) { + if (cur) { const int64_t n_tokens = ubatch->n_tokens; - GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing - int32_t * data = (int32_t *) pos_bucket->data; + int32_t * data = (int32_t *) cur->data; const int64_t n_kv = kv_self->n; @@ -3311,24 +3035,20 @@ ggml_cgraph * llama_context_kv_self::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - llama_graph_result * res, +llama_graph_input_ptr llama_context_kv_self::build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const { auto inp = std::make_shared(model.hparams, kv_self.get()); const auto n_kv = kv_self->n; - inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); - 
ggml_set_input(inp->pos_bucket); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens); + ggml_set_input(inp->cur); - res->inputs.push_back(inp); - - return inp->pos_bucket; + return inp; } llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -3359,8 +3079,6 @@ llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp( inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa; } - res->add_input(inp); - return inp; } @@ -3833,6 +3551,7 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_se // llama_context_recurrent // +// I32 [kv_size] class llama_graph_input_s_copy : public llama_graph_input_i { public: llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3840,8 +3559,6 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_copy; // I32 [kv_size] - llama_kv_cache_recurrent * kv_self; }; @@ -3850,9 +3567,9 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer)); - int32_t * data = (int32_t *) s_copy->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + int32_t * data = (int32_t *) cur->data; // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n for (uint32_t i = 0; i < n_kv; ++i) { @@ -3878,6 +3595,7 @@ void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) { } } +// F32 [1, n_kv] class llama_graph_input_s_mask : public llama_graph_input_i { public: llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} @@ -3885,8 +3603,6 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * s_mask; // F32 [1, n_kv] - llama_kv_cache_recurrent * kv_self; }; @@ -3895,9 +3611,9 @@ void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { const int64_t n_kv = kv_self->n; - if (s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); - float * data = (float *) s_mask->data; + if (cur) { + GGML_ASSERT(ggml_backend_buffer_is_host(cur->buffer)); + float * data = (float *) cur->data; // clear unused states for (int i = 0; i < n_kv; ++i) { @@ -4302,36 +4018,30 @@ ggml_cgraph * llama_context_recurrent::graph_init() { return llama_context_base::graph_init(); } -ggml_tensor * llama_context_recurrent::build_inp_s_copy( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_copy( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); - //cb(inp.s_copy, "inp_s_copy", -1); - ggml_set_input(inp->s_copy); + inp->cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv); + //cb(inp.cur, "inp_s_copy", -1); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->s_copy; + return inp; } -ggml_tensor * llama_context_recurrent::build_inp_s_mask( - llama_graph_result * res, +llama_graph_input_ptr llama_context_recurrent::build_inp_s_mask( ggml_context * ctx0) const { auto inp = std::make_shared(kv_self.get()); const auto n_kv = kv_self->n; - inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); - //cb(inp->s_mask, "inp_s_mask", -1); - ggml_set_input(inp->s_mask); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv); + 
//cb(inp->cur, "inp_s_mask", -1); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->s_mask; + return inp; } ggml_tensor * llama_context_recurrent::build_copy_mask_state( @@ -4904,6 +4614,7 @@ int llama_context_enc::encode(llama_batch & inp_batch) { // llama_context_dec // +// F32 [n_embd, n_outputs_enc] class llama_graph_input_cross_embd : public llama_graph_input_i { public: llama_graph_input_cross_embd( @@ -4912,26 +4623,24 @@ public: void set_input(const llama_ubatch * ubatch) override; - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - const llama_cross * cross; }; void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) { GGML_UNUSED(ubatch); - if (cross_embd && cross->t_embd) { - assert(cross_embd->type == GGML_TYPE_F32); + if (cur && cross->t_embd) { + assert(cur->type == GGML_TYPE_F32); - ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd)); + ggml_backend_tensor_set(cur, cross->v_embd, 0, ggml_nbytes(cur)); } } class llama_graph_input_attn_dec : public llama_graph_input_attn_i { public: llama_graph_input_attn_dec( - llama_graph_input_attn_i * inp_kv_self, - const llama_cross * cross) : inp_kv_self(inp_kv_self), cross(cross) {} + llama_graph_input_attn_ptr inp_kv_self, + const llama_cross * cross) : inp_kv_self(std::move(inp_kv_self)), cross(cross) {} void set_input(const llama_ubatch * ubatch) override; @@ -4942,11 +4651,14 @@ public: ggml_tensor * cross_kq_mask = nullptr; // F32 [n_outputs_enc, n_batch] ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch] - llama_graph_input_attn_i * inp_kv_self = nullptr; + llama_graph_input_attn_ptr inp_kv_self = nullptr; + const llama_cross * cross = nullptr; }; void llama_graph_input_attn_dec::set_input(const llama_ubatch * ubatch) { + inp_kv_self->set_input(ubatch); + if (cross_kq_mask) { const int64_t n_enc = cross_kq_mask->ne[0]; const int64_t n_tokens = ubatch->n_tokens; @@ -4990,17 +4702,16 @@ ggml_cgraph * llama_context_dec::graph_init() { return llama_context_kv_self::graph_init(); } -ggml_tensor * llama_context_dec::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_context_dec::build_inp_cross_embd( ggml_context * ctx0) const { auto inp = std::make_shared(cross); // if we have the output embeddings from the encoder, use them directly // TODO: needs more work to be correct, for now just use the tensor shape //if (cross->t_embd) { - // inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd); + // inp->cur = ggml_view_tensor(ctx0, cross->t_embd); - // return inp->cross_embd; + // return inp->cur; //} const auto & hparams = model.hparams; @@ -5008,23 +4719,20 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd( const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd; const auto n_enc = cross->t_embd ? 
cross->t_embd->ne[1] : hparams.n_ctx_train; - inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); - ggml_set_input(inp->cross_embd); + inp->cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); + ggml_set_input(inp->cur); - res->add_input(inp); - - return inp->cross_embd; + return inp; } llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, bool swa) const { - auto inp_kv_self = llama_context_kv_self::build_attn_inp(res, ctx0, n_tokens, causal, swa); + auto inp_kv_self = llama_context_kv_self::build_attn_inp(ctx0, n_tokens, causal, swa); - auto inp = std::make_shared(inp_kv_self.get(), cross); + auto inp = std::make_shared(std::move(inp_kv_self), cross); const int32_t n_enc = cross->t_embd ? cross->t_embd->ne[1] : model.hparams.n_ctx_train; @@ -5033,8 +4741,6 @@ llama_graph_input_attn_ptr llama_context_dec::build_attn_inp( inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask; - res->add_input(inp); - return inp; } diff --git a/src/llama-context.h b/src/llama-context.h index 21015e8796..a5159bc5b3 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -242,12 +242,6 @@ public: int decode(llama_batch & inp_batch) override; protected: - // - // input - // - - virtual int64_t n_pos_per_token() const; // vision - // // output // @@ -287,6 +281,8 @@ public: // graph build // + int32_t get_n_outputs() const override; + void build_cb( ggml_tensor * cur, const char * name, @@ -314,45 +310,16 @@ public: ggml_tensor * build_rope_factors(int il) const override; - ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const override; - - ggml_tensor * build_inp_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const override; - ggml_tensor * build_inp_pos( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const override; - - ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const override; - - ggml_tensor * build_inp_cls( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -370,7 +337,15 @@ public: int il) const override; protected: - virtual ggml_tensor * build_attn_mha( + // note: optionally set the backend to be the same as the bbuf's backend + ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) const; + + ggml_tensor * build_attn_mha( ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * q, @@ -458,28 +433,9 @@ protected: llama_loras loras; llama_sbatch sbatch; - ggml_threadpool_t threadpool = nullptr; - ggml_threadpool_t threadpool_batch = nullptr; - - ggml_abort_callback abort_callback = nullptr; - void * abort_callback_data = nullptr; - - ggml_backend_t backend_cpu = nullptr; - std::vector backends; - - std::vector> 
set_n_threads_fns; - ggml_backend_sched_ptr sched; - // buffer types used for the compute buffer of each backend - std::vector backend_ptrs; - std::vector backend_buft; - - // memory buffers used to evaluate the model - std::vector buf_compute_meta; - - // host buffer for the model output (logits and embeddings) - ggml_backend_buffer_ptr buf_output; + // TODO: these below likely need some rework in the future, together with the batch-refactoring // TODO: remove bool logits_all = false; @@ -502,6 +458,30 @@ protected: std::vector output_ids; // map batch token positions to ids of the logits and embd buffers +private: + // base functionality - should not leak into derived classes + + ggml_threadpool_t threadpool = nullptr; + ggml_threadpool_t threadpool_batch = nullptr; + + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; + + ggml_backend_t backend_cpu = nullptr; + std::vector backends; + + std::vector> set_n_threads_fns; + + // buffer types used for the compute buffer of each backend + std::vector backend_ptrs; + std::vector backend_buft; + + // memory buffers used to evaluate the model + std::vector buf_compute_meta; + + // host buffer for the model output (logits and embeddings) + ggml_backend_buffer_ptr buf_output; + bool has_evaluated_once = false; }; @@ -539,13 +519,11 @@ public: // graph build // - ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, + llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -624,12 +602,10 @@ public: // graph build // - ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const override; - ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( @@ -694,6 +670,10 @@ private: std::unique_ptr kv_self; }; +// +// enc-dec +// + // TODO: tmp - need something better to pass the data from the encoder to the decoder struct llama_cross { // the output embeddings from the encoder as a ggml tensor @@ -714,7 +694,7 @@ public: int encode(llama_batch & inp_batch) override; - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_dec : public llama_context_kv_self { @@ -730,12 +710,10 @@ protected: ggml_cgraph * graph_init() override; - ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -753,7 +731,7 @@ protected: int il) const override; public: - llama_cross * cross = nullptr; + llama_cross * cross = nullptr; // TODO: hacky, rework }; class llama_context_enc_dec : public llama_context { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 79b26d1734..89e311a915 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -67,20 +67,16 @@ ggml_tensor * llama_graph_i::build_attn_cross( return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_embd( - llama_graph_result * res, +llama_graph_input_ptr llama_graph_i::build_inp_cross_embd( ggml_context * ctx0) const { - GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor 
* llama_graph_i::build_inp_s_copy (
-        llama_graph_result * res,
+llama_graph_input_ptr llama_graph_i::build_inp_s_copy (
         ggml_context * ctx0) const {
-    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -88,10 +84,8 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
     return nullptr; // NOLINT
 }
 
-ggml_tensor * llama_graph_i::build_inp_s_mask(
-        llama_graph_result * res,
+llama_graph_input_ptr llama_graph_i::build_inp_s_mask(
         ggml_context * ctx0) const {
-    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 7ae99becc7..343d4a0772 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -29,6 +29,9 @@ public:
     virtual ~llama_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
+
+    // by default, we produce a single input tensor, but some children could produce more
+    ggml_tensor * cur = nullptr;
 };
 
 using llama_graph_input_ptr = std::shared_ptr<llama_graph_input_i>;
 
@@ -76,7 +79,7 @@ public:
         }
     }
 
-    void add_input(llama_graph_input_ptr && input) {
+    void add_input(llama_graph_input_ptr input) {
         inputs.emplace_back(std::move(input));
     }
 
@@ -92,19 +95,23 @@ public:
 // llama_graph
 //
 
+// note: keep all methods const
 // TODO: can become more granular in the future
-// TODO: move all methods that do not require things from llama_context to llm_build_context
 class llama_graph_i {
 public:
     llama_graph_i(llama_graph_type type);
     virtual ~llama_graph_i() = default;
 
-    llama_graph_type get_type() const { return type; }
+    llama_graph_type get_type() const {
+        return type;
+    }
 
-protected:
+private:
     llama_graph_type type;
 
 public:
+    virtual int32_t get_n_outputs() const = 0;
+
     // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
virtual void build_cb( ggml_tensor * cur, @@ -131,50 +138,27 @@ public: ggml_tensor * cur, // struct ggml_tensor * b ggml_tensor * ids) const = 0; + // rope factors based on the current context size virtual ggml_tensor * build_rope_factors(int il) const = 0; - // note: optionally set the backend to be the same as the bbuf's backend - virtual ggml_tensor * build_rope_shift( - ggml_context * ctx0, - ggml_tensor * cur, - ggml_tensor * shift, - ggml_tensor * factors, - ggml_backend_buffer * bbuf) const = 0; - // graph build API (context-specific) - virtual ggml_tensor * build_inp_embd( - llama_graph_result * res, + // input embeddings with optional lora + virtual llama_graph_input_ptr build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, const llama_ubatch & ubatch) const = 0; - virtual ggml_tensor * build_inp_pos( - llama_graph_result * res, + // enc-dec pos + virtual llama_graph_input_ptr build_inp_pos_bucket( ggml_context * ctx0, int32_t n_tokens) const = 0; - virtual ggml_tensor * build_inp_pos_bucket( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_out_ids( - llama_graph_result * res, - ggml_context * ctx0) const = 0; - - virtual ggml_tensor * build_inp_mean( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; - - virtual ggml_tensor * build_inp_cls( - llama_graph_result * res, - ggml_context * ctx0, - int32_t n_tokens) const = 0; + // + // attention API + // virtual llama_graph_input_attn_ptr build_attn_inp( - llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens, bool causal, @@ -202,16 +186,17 @@ public: float kq_scale, int il) const; - virtual ggml_tensor * build_inp_cross_embd( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_cross_embd( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_copy( - llama_graph_result * res, + // + // recurrent API + // + + virtual llama_graph_input_ptr build_inp_s_copy( ggml_context * ctx0) const; - virtual ggml_tensor * build_inp_s_mask( - llama_graph_result * res, + virtual llama_graph_input_ptr build_inp_s_mask( ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7fae82c6ec..60a8cc0f8b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3813,6 +3813,212 @@ enum llm_norm_type { LLM_NORM_GROUP, }; +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_out_ids : public llama_graph_input_i { +public: + llama_graph_input_out_ids( + const llama_hparams & hparams, + const llama_cparams & cparams, + int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + virtual ~llama_graph_input_out_ids() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * out_ids; // I32 [n_outputs] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const int32_t n_outputs; +}; + +void 
llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); + + if (!out_ids) { + LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); + int32_t * data = (int32_t *) out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch->output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(n_outputs == n_outputs); + } else if (n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(n_outputs == 0); + } + } + } +} + +class llama_graph_input_mean : public llama_graph_input_i { +public: + llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_mean() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * mean; // F32 [n_batch, n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(mean); + GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer)); + + float * data = (float *) mean->data; + memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean)); + + std::vector sum(n_tokens, 0); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); + + sum[seq_id] += ubatch->n_seq_tokens; + } + + std::vector div(n_tokens, 0.0f); + for (int i = 0; i < n_tokens; ++i) { + const uint64_t s = sum[i]; + if (s > 0) { + div[i] = 1.0f/float(s); + } + } + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + for (int i = 0; i < n_seq_tokens; ++i) { + data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; + } + } + } +} + +class llama_graph_input_cls : public llama_graph_input_i { +public: + llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} + virtual ~llama_graph_input_cls() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cls; // I32 [n_batch] + + const llama_cparams & cparams; +}; + +void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) { + if (cparams.embeddings && ( + cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || + cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < 
n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos == 0) { + data[seq_id] = s*n_seq_tokens + i; + } + } + } + } + + if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { + const int64_t n_tokens = ubatch->n_tokens; + const int64_t n_seq_tokens = ubatch->n_seq_tokens; + const int64_t n_seqs = ubatch->n_seqs; + + GGML_ASSERT(cls); + GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer)); + + uint32_t * data = (uint32_t *) cls->data; + memset(cls->data, 0, n_tokens * ggml_element_size(cls)); + + std::vector last_pos(n_tokens, -1); + std::vector last_row(n_tokens, -1); + + for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + struct llm_build_context { const llama_model & model; const llama_hparams & hparams; @@ -3895,55 +4101,75 @@ struct llm_build_context { res (std::make_unique()) { } + int64_t n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; + } + // TODO: tmp void cb(struct ggml_tensor * cur, const char * name, int il) { lgf->build_cb(cur, name, ubatch, il); } - // TODO: tmp struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) { - struct ggml_tensor * inpL = lgf->build_inp_embd(res.get(), ctx0, tok_embd, ubatch); - cb(inpL, "inp_embd", -1); + auto inp = lgf->build_inp_embd(ctx0, tok_embd, ubatch); - return inpL; + cb(inp->cur, "inp_embd", -1); + + res->add_input(inp); + + return inp->cur; } - // TODO: tmp - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); - cb(cur, "inp_pos", -1); + struct ggml_tensor * build_inp_pos() const { + auto inp = std::make_shared(n_pos_per_token()); - return cur; + inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + ggml_set_input(inp->pos); + + res->add_input(inp); + + return inp->pos; } - // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); - cb(cur, "inp_out_ids", -1); + const auto n_outputs = lgf->get_n_outputs(); - return cur; + auto inp = std::make_shared(hparams, cparams, n_outputs); + + inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + ggml_set_input(inp->out_ids); + + res->add_input(inp); + + return inp->out_ids; } - // TODO: tmp struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); - cb(cur, "inp_mean", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + ggml_set_input(inp->mean); + + res->add_input(inp); + + return inp->mean; } - // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); - cb(cur, "inp_cls", -1); + auto inp = std::make_shared(cparams); - return cur; + inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->cls); 
+ + res->add_input(inp); + + return inp->cls; } // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, - struct ggml_tensor * cur) { + struct ggml_tensor * cur) const { return lgf->build_lora_mm(ctx0, w, cur); } @@ -3951,24 +4177,42 @@ struct llm_build_context { struct ggml_tensor * build_lora_mm_id( struct ggml_tensor * w, // struct ggml_tensor * as struct ggml_tensor * cur, // struct ggml_tensor * b - struct ggml_tensor * ids) { + struct ggml_tensor * ids) const { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } - // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); - cb(cur, "pos_bucket", -1); + auto inp = lgf->build_inp_pos_bucket(ctx0, n_tokens); + cb(inp->cur, "pos_bucket", -1); - return cur; + res->add_input(inp); + + return inp->cur; } - // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); - cb(cur, "embd_enc", -1); + auto inp = lgf->build_inp_cross_embd(ctx0); + cb(inp->cur, "embd_enc", -1); - return cur; + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_copy() const { + auto inp = lgf->build_inp_s_copy(ctx0); + + res->add_input(inp); + + return inp->cur; + } + + struct ggml_tensor * build_inp_s_mask() const { + auto inp = lgf->build_inp_s_mask(ctx0); + + res->add_input(inp); + + return inp->cur; } struct ggml_tensor * build_norm( @@ -4250,6 +4494,18 @@ struct llm_build_context { return moe_out; } + llama_graph_input_attn_ptr build_attn_inp( + ggml_context * ctx0, + int32_t n_tokens, + bool causal, + bool swa) const { + auto inp = lgf->build_attn_inp(ctx0, n_tokens, causal, swa); + + res->add_input(inp); + + return inp; + } + struct ggml_tensor * build_attn( llama_graph_input_attn_i * inp, ggml_cgraph * gf, @@ -4490,7 +4746,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4651,7 +4907,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4807,7 +5063,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4923,7 +5179,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5028,7 +5284,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5151,7 +5407,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5303,7 +5559,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5425,7 +5681,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -5526,7 +5782,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -5640,7 +5896,7 @@ struct llm_build_context { inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); cb(inpL, "inp_norm", -1); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); // iterate layers for (int il = 0; il < n_layer; ++il) { @@ -5785,7 +6041,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); inpL = build_norm(inpL, model.tok_norm, @@ -5888,7 +6144,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); if (model.pos_embd) { // inp_pos - contains the positions @@ -6030,11 +6286,9 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { - - // norm cur = 
build_norm(inpL, model.layers[il].attn_norm, @@ -6181,7 +6435,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6295,7 +6549,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6408,7 +6662,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -6526,7 +6780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -6673,7 +6927,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6795,8 +7049,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6940,7 +7193,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -7046,7 +7299,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -7152,7 +7405,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -7263,7 +7516,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { 
struct ggml_tensor * inpSA = inpL; @@ -7382,7 +7635,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7510,7 +7763,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -7711,7 +7964,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { // norm @@ -7819,7 +8072,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); for (int il = 0; il < n_layer; ++il) { // norm @@ -7949,7 +8202,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8062,8 +8315,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); for (int il = 0; il < n_layer; ++il) { // norm @@ -8124,7 +8377,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { @@ -8272,7 +8525,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, true); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, true); // sliding window switch pattern const int32_t sliding_window_pattern = 4; @@ -8407,7 +8660,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8527,7 +8780,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8651,7 +8904,7 @@ struct llm_build_context { 
// inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -8772,7 +9025,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8900,7 +9153,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -9044,7 +9297,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9174,7 +9427,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9337,7 +9590,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9555,7 +9808,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9706,7 +9959,7 @@ struct llm_build_context { struct ggml_tensor * pos_bucket_enc = build_pos_bucket(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, false, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, false, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9809,7 +10062,7 @@ struct llm_build_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -9972,7 +10225,7 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -10066,7 +10319,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), 
ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10196,7 +10449,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10317,7 +10570,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -10435,8 +10688,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10527,8 +10780,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); + struct ggml_tensor * state_copy = build_inp_s_copy(); + struct ggml_tensor * state_mask = build_inp_s_mask(); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10622,7 +10875,7 @@ struct llm_build_context { // inp_pos - contains the positions struct ggml_tensor * inp_pos = build_inp_pos(); - auto inp_attn = lgf->build_attn_inp(res.get(), ctx0, n_tokens, true, false); + auto inp_attn = build_attn_inp(ctx0, n_tokens, true, false); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; diff --git a/src/llama-model.h b/src/llama-model.h index 2d64c0d242..45abce7d53 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -365,7 +365,6 @@ struct llama_model { const struct ggml_tensor * get_tensor(const char * name) const; - // TODO: add encode/decode graphs llama_graph_result_ptr build_graph( ggml_context * ctx, ggml_cgraph * gf,
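
Editor note (not part of the patch): the refactor above changes the build_inp_* helpers from returning raw ggml_tensor pointers into returning llama_graph_input_ptr objects that expose their created tensor through the shared cur member and are registered on the llama_graph_result via add_input(), so the context can fill them later with set_input(). The self-contained C++ sketch below illustrates that ownership/registration pattern in the abstract; every name in it (graph_input_i, graph_result, build_context, input_pos, the deque standing in for a ggml_context allocation) is a hypothetical stand-in, not llama.cpp API.

#include <deque>
#include <memory>
#include <vector>

struct tensor {};  // stand-in for ggml_tensor
struct ubatch {};  // stand-in for llama_ubatch

// polymorphic graph input: produces one tensor by default and knows how to fill it later
struct graph_input_i {
    virtual ~graph_input_i() = default;
    virtual void set_input(const ubatch * ub) = 0;

    tensor * cur = nullptr;
};

using graph_input_ptr = std::shared_ptr<graph_input_i>;

// the result object keeps the inputs alive until the compute-time set_inputs() pass
struct graph_result {
    void add_input(graph_input_ptr input) { inputs.emplace_back(std::move(input)); }

    void set_inputs(const ubatch * ub) {
        for (auto & inp : inputs) {
            inp->set_input(ub);
        }
    }

    std::vector<graph_input_ptr> inputs;
};

// an input that depends only on the batch, so it can live with the model's build code
struct input_pos : graph_input_i {
    void set_input(const ubatch * ub) override { (void) ub; /* copy positions into cur */ }
};

// the builder constructs the input, registers it on the result and hands the tensor to the graph
struct build_context {
    graph_result * res;
    std::deque<tensor> pool;  // stand-in for tensors allocated in a ggml_context

    tensor * build_inp_pos() {
        auto inp = std::make_shared<input_pos>();
        pool.emplace_back();
        inp->cur = &pool.back();
        res->add_input(inp);
        return inp->cur;
    }
};

int main() {
    graph_result  res;
    build_context ctx{&res, {}};

    tensor * pos = ctx.build_inp_pos();  // graph construction uses the tensor...
    (void) pos;

    ubatch ub;
    res.set_inputs(&ub);                 // ...and the context fills all registered inputs later

    return 0;
}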