Merge 57a8def44e into 4951250235

2026-04-01 07:48:12 +02:00 · 2026-04-01 07:48:12 +02:00 · cd9691014c
parent 4951250235 57a8def44e
commit cd9691014c
26 changed files with 4204 additions and 8 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -831,6 +831,8 @@ class ModelBase:
                            gguf.MODEL_TENSOR.SSM_CONV1D_Q,
                            gguf.MODEL_TENSOR.SSM_CONV1D_K,
                            gguf.MODEL_TENSOR.SSM_CONV1D_V,
+                            # DSA indexer weights should be F32
+                            gguf.MODEL_TENSOR.INDEXER_PROJ,
                        )
                    )
                    or new_name[-7:] not in (".weight", ".lora_a", ".lora_b")
@ -8737,6 +8739,147 @@ class DeepseekV2Model(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register(
+    "DeepseekV32ForCausalLM",
+)
+class DeepseekV32Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK32
+
+    # TODO @ngxson : remove this when we support MTP for deepseek models
+    skip_mtp = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
+        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        assert tokenizer.add_bos_token, "Change value of add_bos_token to true in tokenizer_config.json file."
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+
+        # note: deepseek32 using MLA converts into MQA (ie: GQA with 1 group)
+        self.hparams["num_key_value_heads"] = 1
+
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        # first_k_dense_replace: number of leading layers using dense FFN instead of MoE
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+
+        # note: deepseek32 using MLA converts into MQA with larger heads, then decompresses to MHA
+        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+
+        # MoE parameters (required by C++ code for DEEPSEEK32 arch)
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
+
+        # NextN/MTP prediction layers
+        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+        # DSA indexer parameters
+        self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
+        self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
+        self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("language_model."):
+            name = name.replace("language_model.", "")
+
+        # rename e_score_correction_bias tensors
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers
+        if self.skip_mtp:
+            block_count = self.hparams["num_hidden_layers"]
+            match = re.match(r"model.layers.(\d+)", name)
+            if match and int(match.group(1)) >= block_count:
+                return
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, bid)
+                return
+            else:
+                return
+
+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            yield from super().modify_tensors(k_b, name_kb, bid)
+            yield from super().modify_tensors(v_b, name_vb, bid)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@ModelBase.register(
    "Mistral3ForConditionalGeneration",
    "Ministral3ForCausalLM",
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -559,6 +559,8 @@ extern "C" {
        GGML_OP_RWKV_WKV7,
        GGML_OP_SOLVE_TRI,
        GGML_OP_GATED_DELTA_NET,
+        GGML_OP_HADAMARD,
+        GGML_OP_SCATTER,

        GGML_OP_UNARY,

@ -2481,6 +2483,23 @@ extern "C" {
            struct ggml_tensor  * beta,
            struct ggml_tensor  * state);

+    GGML_API struct ggml_tensor * ggml_hadamard(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n);
+
+    GGML_API struct ggml_tensor * ggml_scatter(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * ids,
+        float                 c);
+
+    GGML_API struct ggml_tensor * ggml_scatter_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * ids,
+        float                 c);
+
    // custom operators

    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -2031,6 +2031,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_gated_delta_net(params, tensor);
            } break;
+        case GGML_OP_HADAMARD:
+            {
+                ggml_compute_forward_hadamard(params, tensor);
+            } break;
+        case GGML_OP_SCATTER:
+            {
+                ggml_compute_forward_scatter(params, tensor);
+            } break;
        case GGML_OP_MAP_CUSTOM1:
            {
                ggml_compute_forward_map_custom1(params, tensor);
@ -2353,6 +2361,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_RWKV_WKV6:
        case GGML_OP_GATED_LINEAR_ATTN:
        case GGML_OP_RWKV_WKV7:
+        case GGML_OP_HADAMARD:
+        case GGML_OP_SCATTER:
            {
                n_tasks = n_threads;
            } break;
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -2232,8 +2232,42 @@ static void ggml_compute_forward_fill_f32(const ggml_compute_params * params, gg
    }
 }

+static void ggml_compute_forward_fill_f16(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
+
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne);
+    GGML_TENSOR_LOCALS(size_t,  nb, dst, nb);
+
+    const auto [ir0, ir1] = get_thread_range(params, dst);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne2*ne1);
+        const int64_t i02 = (ir - i03*ne2*ne1)/ne1;
+        const int64_t i01 = (ir - i03*ne2*ne1 - i02*ne1);
+
+        ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1);
+
+        ggml_vec_set_f16(ne0, dst_ptr, c);
+    }
+}
+
 void ggml_compute_forward_fill(const ggml_compute_params * params, ggml_tensor * dst) {
-    ggml_compute_forward_fill_f32(params, dst);
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_fill_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_fill_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("unsupported type for ggml_compute_forward_fill: %s", ggml_type_name(src0->type));
+            }
+    }
 }

 // ggml_compute_tri
@ -11217,3 +11251,227 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
            }
    }
 }
+
+// ggml_compute_forward_hadamard
+
+// Based on a source code from: https://github.com/ikawrakow/ik_llama.cpp
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+
+template <typename T>
+void fast_ht(int n, T * values) {
+    constexpr float ksqrt2 = 0.707106781f;
+    float scale = 1;
+    for (int h = 1; h < n; h <<= 1) {
+        for (int i = 0; i < n; i += 2*h) {
+            for (int j = i; j < i + h; ++j) {
+                T x = values[j], y = values[j + h];
+                values[j+0] = x + y;
+                values[j+h] = x - y;
+            }
+        }
+        scale *= ksqrt2;
+    }
+    for (int i = 0; i < n; ++i) values[i] *= scale;
+}
+
+static void ggml_compute_forward_hadamard_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    int nh = dst->op_params[0];
+    GGML_ASSERT(nh > 1 && ((nh & (nh - 1)) == 0)); // power of 2
+    GGML_ASSERT(dst->ne[0] % nh == 0);
+
+    int nc = dst->ne[0]/nh;
+    int nr = ggml_nrows(dst) * nc;
+
+    int npt = (nr + nth - 1)/nth;
+    int first = npt*ith;
+    int last  = std::min(first + npt, nr);
+
+    for (int ir = first; ir < last; ++ir) {
+        int i3 = ir / (dst->ne[1] * dst->ne[2] * nc);
+        int i2 = (ir - i3*dst->ne[1] * dst->ne[2] * nc)/(dst->ne[1] * nc);
+        int i1 = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc)/nc;
+        int ic = (ir - i3*dst->ne[1] * dst->ne[2] * nc - i2*dst->ne[1]*nc - i1*nc);
+
+        auto x = (const float *)((const char *)src0->data + i3*src0->nb[3] + i2*src0->nb[2] + i1*src0->nb[1]) + ic*nh;
+        auto y = (      float *)((      char *)dst->data + i3*dst->nb[3] + i2*dst->nb[2] + i1*dst->nb[1]) + ic*nh;
+        memcpy(y, x, nh*sizeof(float));
+        fast_ht(nh, y);
+    }
+}
+
+void ggml_compute_forward_hadamard(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_hadamard_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_scatter
+
+static void ggml_compute_forward_scatter_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const float c = ggml_get_op_params_f32(dst, 0);
+    const bool inplace = ggml_get_op_params_i32(dst, 1);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        const float * src0_ptr = (float *) ((char *) src0->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        const int32_t * ids_ptr = (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+        float * dst_ptr = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+
+        // copy whole row from src0
+        if (!inplace) {
+            ggml_vec_cpy_f32(ne00, dst_ptr, src0_ptr);
+        }
+
+        // set dst elements indicated by indices in src1 to c
+        for (int j = 0; j < ne10; ++j) {
+            int id = ids_ptr[j];
+            GGML_ASSERT(id >= 0 && id < ne00);
+            dst_ptr[id] = c;
+        }
+    }
+}
+
+static void ggml_compute_forward_scatter_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const ggml_fp16_t c = GGML_CPU_FP32_TO_FP16(ggml_get_op_params_f32(dst, 0));
+    const bool inplace = ggml_get_op_params_i32(dst, 1);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr  = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 indices
+        const int i3 = ir/(ne2*ne1);
+        const int i2 = (ir - i3*ne2*ne1)/ne1;
+        const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+
+        const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+        const int32_t * ids_ptr = (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
+
+        // copy whole row from src0
+        if (!inplace) {
+            // ggml_vec_cpy_f16(ne00, dst_ptr, src0_ptr)
+            for (int i = 0; i < ne00; ++i) {
+                dst_ptr[i]  = src0_ptr[i];
+            }
+        }
+
+        // set dst elements indicated by indices in src1 to c
+        for (int j = 0; j < ne10; ++j) {
+            int id = ids_ptr[j];
+            GGML_ASSERT(id >= 0 && id < ne00);
+            dst_ptr[id] = c;
+        }
+    }
+}
+
+void ggml_compute_forward_scatter(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_scatter_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_scatter_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("unsupported type for ggml_compute_forward_scatter: %s", ggml_type_name(src0->type));
+            }
+    }
+}
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@ -103,6 +103,8 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
 void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hadamard(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_scatter(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -61,6 +61,8 @@
 #include "ggml-cuda/tri.cuh"
 #include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/fill.cuh"
+#include "ggml-cuda/hadamard.cuh"
+#include "ggml-cuda/scatter.cuh"
 #include "ggml.h"

 #include <algorithm>
@ -2815,6 +2817,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_FILL:
            ggml_cuda_op_fill(ctx, dst);
            break;
+        case GGML_OP_HADAMARD:
+            ggml_cuda_op_hadamard(ctx, dst);
+            break;
+        case GGML_OP_SCATTER:
+            ggml_cuda_op_scatter(ctx, dst);
+            break;
        default:
            return false;
    }
@ -5057,8 +5065,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_TRI:
        case GGML_OP_DIAG:
        case GGML_OP_SOLVE_TRI:
+        case GGML_OP_SCATTER:
            return true;
-
+        case GGML_OP_HADAMARD: {
+            int nh = op->op_params[0];
+            return (nh == 64 || nh == 128 || nh == 256) && op->ne[0] % nh == 0 && op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
+        }
        default:
            return false;
    }
--- a/ggml/src/ggml-cuda/hadamard.cu
+++ b/ggml/src/ggml-cuda/hadamard.cu
@ -0,0 +1,73 @@
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+
+#include "hadamard.cuh"
+
+template <int nh>
+static __global__ void hadamard_f32(const char * src, char * dst, int ne0,
+        size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3) {
+
+    constexpr float ksqrt2 = 0.707106781f;
+
+    int nc  = ne0/nh;
+    int ii1 = blockIdx.x;
+    int i1  = ii1 / nc;
+    int ic  = ii1 % nc;
+    int i2  = blockIdx.y;
+    int i3  = blockIdx.z;
+
+    int tid = threadIdx.x;
+
+    const float * x = (const float *)((const char *)src + i1*nb01 + i2*nb02 + i3*nb03) + ic*nh;
+          float * y = (      float *)((const char *)dst + i1*nb1  + i2*nb2  + i3*nb3)  + ic*nh;
+
+    __shared__ float ys[nh];
+
+    ys[2*tid+0] = x[2*tid+0] + x[2*tid+1];
+    ys[2*tid+1] = x[2*tid+0] - x[2*tid+1];
+
+    float scale = ksqrt2;
+
+#pragma unroll
+    for (int h = 2; h < nh; h <<= 1) {
+        __syncthreads();
+        int ii = tid/h, jj = tid%h;
+        int j = 2*h*ii+jj;
+        float u = ys[j], v = ys[j+h];
+        ys[j+0] = u + v;
+        ys[j+h] = u - v;
+        scale *= ksqrt2;
+    }
+
+    __syncthreads();
+    y[2*tid+0] = ys[2*tid+0] * scale;
+    y[2*tid+1] = ys[2*tid+1] * scale;
+}
+
+static void hadamard_f32_cuda(int nh, const char * x, char * y, int ne0, int ne1, int ne2, int ne3,
+        size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3, cudaStream_t stream) {
+    int nc = ne0/nh;
+    int nrows = nc*ne1;
+    dim3 num_blocks = dim3(nrows, ne2, ne3);
+    switch (nh) {
+        case  64: hadamard_f32< 64><<<num_blocks,  32, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        case 128: hadamard_f32<128><<<num_blocks,  64, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        case 256: hadamard_f32<256><<<num_blocks, 128, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        default: GGML_ABORT("Unsupported Hadamard block size");
+    }
+}
+
+void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+
+    int nh = dst->op_params[0];
+    GGML_ASSERT(dst->ne[0] % nh == 0);
+    GGML_ASSERT(nh > 1 && ((nh & (nh - 1)) == 0)); // power of 2
+
+    hadamard_f32_cuda(nh, (const char *)src->data, (char *)dst->data, src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[1], src->nb[2], src->nb[3], dst->nb[1], dst->nb[2], dst->nb[3], ctx.stream());
+}
--- a/ggml/src/ggml-cuda/hadamard.cuh
+++ b/ggml/src/ggml-cuda/hadamard.cuh
@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/scatter.cu
+++ b/ggml/src/ggml-cuda/scatter.cu
@ -0,0 +1,88 @@
+#include "scatter.cuh"
+#include "convert.cuh"
+
+template <typename T>
+static __global__ void scatter_kernel(
+        const int32_t * src0, T * dst, const T c,
+        int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
+        size_t nb1, size_t nb2, size_t nb3,
+        size_t nb01, size_t nb02, size_t nb03
+    ) {
+
+    const int64_t total_blocks = ne01 * ne02 * ne03;
+
+    for (int64_t block_idx = blockIdx.x; block_idx < total_blocks; block_idx += gridDim.x) {
+
+        const int64_t i1 = block_idx % ne01;
+        const int64_t i2 = (block_idx / ne01) % ne02;
+        const int64_t i3 = block_idx / (ne01 * ne02);
+
+        T * dst_row = (T *)((char *)dst + i1*nb1 + i2*nb2 + i3*nb3);
+        const int * src0_row = (const int *)((const char *)src0 + i1*nb01 + i2*nb02 + i3*nb03);
+
+        for (int64_t i0 = threadIdx.x; i0 < ne00; i0 += blockDim.x) {
+            const int32_t id = src0_row[i0];
+            dst_row[id] = c;
+        }
+    }
+}
+
+void ggml_cuda_op_scatter(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+
+    GGML_ASSERT(dst->type  == src0->type);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(nb10 == sizeof(int32_t));
+
+    GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(dst));
+
+    float c = ggml_get_op_params_f32(dst, 0);
+    bool inplace = ggml_get_op_params_i32(dst, 1);
+
+    // step 1 - copy whole src0 to dst
+    if (!inplace) {
+        cudaStream_t main_stream = ctx.stream();
+        char * dst_ddc = (char *) dst->data;
+        char * src0_ddc = (char *) src0->data;
+
+        CUDA_CHECK(cudaMemcpyAsync(dst_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
+    }
+
+    // step 2 - set elements in dst indicated by ids to c
+    const int32_t * src1_d = (const int32_t *) src1->data;
+    void * dst_d = dst->data;
+
+    int threads = std::min((int) ne10, 512); // ids
+
+    int64_t total_blocks = ne11 * ne12 * ne13;
+    int blocks = (int) std::min((int64_t) 65535, total_blocks);
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            scatter_kernel<<<blocks, threads, 0, ctx.stream()>>>(
+                src1_d, (float *) dst_d, c,
+                ne10, ne11, ne12, ne13,
+                nb1, nb2, nb3,
+                nb11, nb12, nb13
+            );
+            break;
+        case GGML_TYPE_F16:
+            scatter_kernel<<<blocks, threads, 0, ctx.stream()>>>(
+                src1_d, (half *) dst_d, ggml_cuda_cast<half>(c),
+                ne10, ne11, ne12, ne13,
+                nb1, nb2, nb3,
+                nb11, nb12, nb13
+            );
+            break;
+        default:
+            GGML_ABORT("unsupported type");
+    }
+}
--- a/ggml/src/ggml-cuda/scatter.cuh
+++ b/ggml/src/ggml-cuda/scatter.cuh
@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_scatter(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -1040,6 +1040,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "RWKV_WKV7",
    "SOLVE_TRI",
    "GATED_DELTA_NET",
+    "HADAMARD",
+    "SCATTER",

    "UNARY",

@ -1057,7 +1059,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "GLU",
 };

-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -1150,6 +1152,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "rwkv_wkv7(r, w, k, v, a, b, s)",
    "A X = B, A triangular, solve X",
    "gated_delta_net(q, k, v, g, beta, s)",
+    "hadamard(x)",
+    "scatter(x,ids,c)",

    "unary(x)",

@ -1167,7 +1171,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "glu(x)",
 };

-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 98, "GGML_OP_COUNT != 98");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@ -5189,7 +5193,7 @@ static struct ggml_tensor * ggml_fill_impl(
    struct ggml_tensor  * a,
    float                 c,
    bool                  inplace) {
-    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
    GGML_ASSERT(ggml_is_contiguous(a));

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@ -6194,6 +6198,69 @@ struct ggml_tensor * ggml_gated_delta_net(
    return result;
 }

+// ggml_hadamard
+
+struct ggml_tensor * ggml_hadamard(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n) {
+
+    GGML_ASSERT(a->type == GGML_TYPE_F32); // will not bother implementing for other data types
+    GGML_ASSERT(n > 1);                    // no point in Hadamard transforms with less than 2 elements
+    GGML_ASSERT(a->ne[0] % n == 0);
+    GGML_ASSERT(n > 0 && ((n & (n - 1)) == 0)); // must be a power of 2
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
+
+    result->op   = GGML_OP_HADAMARD;
+    result->src[0] = a;
+
+    result->op_params[0] = n;
+
+    return result;
+}
+
+// ggml_scatter
+
+static struct ggml_tensor * ggml_scatter_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * ids,
+        float                 c,
+        bool                  inplace) {
+
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[1] == ids->ne[1]);
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_f32(result, 0, c);
+    ggml_set_op_params_i32(result, 1, inplace ? 1 : 0);
+
+    result->op   = GGML_OP_SCATTER;
+    result->src[0] = a;
+    result->src[1] = ids;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_scatter(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * ids,
+        float                 c) {
+    return ggml_scatter_impl(ctx, a, ids, c, false);
+}
+
+struct ggml_tensor * ggml_scatter_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * ids,
+        float                 c) {
+    return ggml_scatter_impl(ctx, a, ids, c, true);
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_hash_set ggml_hash_set_new(size_t size) {
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -440,6 +440,7 @@ class MODEL_ARCH(IntEnum):
    DEEPSEEK         = auto()
    DEEPSEEK2        = auto()
    DEEPSEEK2OCR     = auto()
+    DEEPSEEK32       = auto()
    CHATGLM          = auto()
    GLM4             = auto()
    GLM4_MOE         = auto()
@ -903,6 +904,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.DEEPSEEK:         "deepseek",
    MODEL_ARCH.DEEPSEEK2:        "deepseek2",
    MODEL_ARCH.DEEPSEEK2OCR:     "deepseek2-ocr",
+    MODEL_ARCH.DEEPSEEK32:       "deepseek32",
    MODEL_ARCH.CHATGLM:          "chatglm",
    MODEL_ARCH.GLM4:             "glm4",
    MODEL_ARCH.GLM4_MOE:         "glm4moe",
@ -2720,6 +2722,46 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_UP_SHEXP,
        MODEL_TENSOR.FFN_EXP_PROBS_B,
    ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.INDEXER_K_NORM,
+        MODEL_TENSOR.INDEXER_PROJ,
+        MODEL_TENSOR.INDEXER_ATTN_K,
+        MODEL_TENSOR.INDEXER_ATTN_Q_B,
+        # NextN/MTP tensors - preserved but unused
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_EMBED_TOKENS,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
+    ],
    MODEL_ARCH.ERNIE4_5_MOE: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@ -3833,6 +3875,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
+    MODEL_ARCH.DEEPSEEK32: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
    MODEL_ARCH.CHATGLM: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -22,6 +22,8 @@ add_library(llama
            llama-io.cpp
            llama-kv-cache.cpp
            llama-kv-cache-iswa.cpp
+            llama-ik-cache.cpp
+            llama-kv-cache-dsa.cpp
            llama-memory.cpp
            llama-memory-hybrid.cpp
            llama-memory-hybrid-iswa.cpp
@ -57,6 +59,7 @@ add_library(llama
            models/deci.cpp
            models/deepseek.cpp
            models/deepseek2.cpp
+            models/deepseek32.cpp
            models/delta-net-base.cpp
            models/dots1.cpp
            models/dream.cpp
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@ -74,6 +74,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_DEEPSEEK,         "deepseek"         },
    { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
    { LLM_ARCH_DEEPSEEK2OCR,     "deepseek2-ocr"    },
+    { LLM_ARCH_DEEPSEEK32,       "deepseek32"       },
    { LLM_ARCH_CHATGLM,          "chatglm"          },
    { LLM_ARCH_GLM4,             "glm4"             },
    { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
@ -1608,6 +1609,44 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                LLM_TENSOR_FFN_UP_SHEXP,
                LLM_TENSOR_FFN_EXP_PROBS_B,
            };
+        case LLM_ARCH_DEEPSEEK32:
+            return {
+                LLM_TENSOR_TOKEN_EMBD,
+                LLM_TENSOR_OUTPUT_NORM,
+                LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_ATTN_NORM,
+                LLM_TENSOR_ATTN_Q_A_NORM,
+                LLM_TENSOR_ATTN_KV_A_NORM,
+                LLM_TENSOR_ATTN_Q_A,
+                LLM_TENSOR_ATTN_Q_B,
+                LLM_TENSOR_ATTN_KV_A_MQA,
+                LLM_TENSOR_ATTN_KV_B,
+                LLM_TENSOR_ATTN_K_B,
+                LLM_TENSOR_ATTN_V_B,
+                LLM_TENSOR_ATTN_OUT,
+                LLM_TENSOR_FFN_NORM,
+                LLM_TENSOR_FFN_GATE,
+                LLM_TENSOR_FFN_UP,
+                LLM_TENSOR_FFN_DOWN,
+                LLM_TENSOR_FFN_GATE_INP,
+                LLM_TENSOR_FFN_GATE_EXPS,
+                LLM_TENSOR_FFN_DOWN_EXPS,
+                LLM_TENSOR_FFN_UP_EXPS,
+                LLM_TENSOR_FFN_GATE_SHEXP,
+                LLM_TENSOR_FFN_DOWN_SHEXP,
+                LLM_TENSOR_FFN_UP_SHEXP,
+                LLM_TENSOR_FFN_EXP_PROBS_B,
+                LLM_TENSOR_INDEXER_K_NORM,
+                LLM_TENSOR_INDEXER_PROJ,
+                LLM_TENSOR_INDEXER_ATTN_K,
+                LLM_TENSOR_INDEXER_ATTN_Q_B,
+                LLM_TENSOR_NEXTN_EH_PROJ,
+                LLM_TENSOR_NEXTN_EMBED_TOKENS,
+                LLM_TENSOR_NEXTN_ENORM,
+                LLM_TENSOR_NEXTN_HNORM,
+                LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
+                LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+            };
        case LLM_ARCH_PLM:
            return {
                LLM_TENSOR_TOKEN_EMBD,
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@ -78,6 +78,7 @@ enum llm_arch {
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_DEEPSEEK2OCR,
+    LLM_ARCH_DEEPSEEK32,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_GLM4,
    LLM_ARCH_GLM4_MOE,
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -6,6 +6,7 @@

 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@ -31,6 +32,18 @@ static ggml_tensor * build_kq_mask(
    return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
 }

+static ggml_tensor * build_kq_mask(
+        ggml_context * ctx,
+        const llama_ik_cache_context * mctx,
+        const llama_ubatch & ubatch,
+        const llama_cparams & cparams) {
+    const auto n_kv     = mctx->get_n_kv();
+    const auto n_tokens = ubatch.n_tokens;
+    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+    return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+}
+
 static bool can_reuse_kq_mask(
        ggml_tensor * kq_mask,
        const llama_kv_cache_context * mctx,
@ -50,6 +63,25 @@ static bool can_reuse_kq_mask(
    return res;
 }

+static bool can_reuse_kq_mask(
+        ggml_tensor * kq_mask,
+        const llama_ik_cache_context * mctx,
+        const llama_ubatch & ubatch,
+        const llama_cparams & cparams) {
+    const auto n_kv     = mctx->get_n_kv();
+    const auto n_tokens = ubatch.n_tokens;
+    const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+
+    bool res = true;
+
+    res &= (kq_mask->ne[0] == n_kv);
+    res &= (kq_mask->ne[1] == n_tokens/n_stream);
+    res &= (kq_mask->ne[2] == 1);
+    res &= (kq_mask->ne[3] == n_stream);
+
+    return res;
+}
+
 // impl

 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
@ -2159,6 +2191,112 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
 }

+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_k * inp,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * sinks,
+        ggml_tensor * v_mla,
+        ggml_tensor * top_k,
+            float     kq_scale,
+            int       il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, v_cur);
+    ggml_build_forward_expand(gf, k_cur);
+
+    const auto * mctx_cur = inp->mctx;
+
+    // store to KV cache
+    {
+        const auto & k_idxs = inp->get_k_idxs();
+
+        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, k_idxs, il));
+    }
+
+    const auto & kq_mask = inp->get_kq_mask();
+
+    // prepare new kq mask - starts filled with -INFINITY
+    ggml_tensor * kq_mask_all = ggml_fill(ctx0, kq_mask, -INFINITY);
+
+    // modify it by unmasking tokens that are in top_k indices
+    ggml_tensor * kq_mask_top_k = ggml_scatter(ctx0, kq_mask_all, top_k, 0);
+
+    // combine with the original kq mask
+    kq_mask_top_k = ggml_add(ctx0, kq_mask_top_k, kq_mask);
+
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
+    ggml_tensor * v = ggml_view_4d(ctx0, k, v_cur->ne[0], k->ne[1], k->ne[2], k->ne[3], k->nb[1], k->nb[2], k->nb[3], 0);
+
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_top_k, sinks, v_mla, kq_scale, il);
+    cb(cur, "kqv_out", il);
+
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
+            // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
+
+    return cur;
+}
+
+
+static std::unique_ptr<llm_graph_input_attn_ik> build_attn_inp_ik_impl(
+           ggml_context * ctx0,
+     const llama_ubatch & ubatch,
+    const llama_hparams & hparams,
+    const llama_cparams & cparams,
+    const llama_ik_cache_context * mctx_cur) {
+
+    auto inp = std::make_unique<llm_graph_input_attn_ik>(hparams, cparams, mctx_cur);
+
+    {
+        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
+
+        inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
+
+        inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
+        ggml_set_input(inp->self_kq_mask);
+
+        inp->self_kq_mask_cnv = inp->self_kq_mask;
+    }
+
+    return inp;
+}
+
+void llm_graph_input_attn_ik::set_input(const llama_ubatch * ubatch) {
+    mctx->set_input_k_idxs(self_k_idxs, ubatch);
+
+    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+}
+
+bool llm_graph_input_attn_ik::can_reuse(const llm_graph_params & params) {
+    const auto * mctx = static_cast<const llama_ik_cache_context *>(params.mctx);
+
+    this->mctx = mctx;
+
+    bool res = true;
+
+    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+
+    res &= can_reuse_kq_mask(self_kq_mask, mctx, params.ubatch, params.cparams);
+
+    return res;
+}
+
 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_iswa * inp,
        ggml_tensor * wo,
@ -2281,6 +2419,17 @@ ggml_tensor * llm_graph_context::build_attn(
    return cur;
 }

+std::pair<llm_graph_input_attn_k *, llm_graph_input_attn_ik *> llm_graph_context::build_attn_inp_k_dsa() const {
+    const auto * mctx_cur = static_cast<const llama_kv_cache_dsa_context *>(mctx);
+
+    auto inp_k = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_base());
+    auto inp_ik = build_attn_inp_ik_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_ik());
+
+    return std::make_pair(
+        (llm_graph_input_attn_k *) res->add_input(std::move(inp_k)),
+        (llm_graph_input_attn_ik *) res->add_input(std::move(inp_ik)));
+}
+
 // TODO: maybe separate the inner implementation into a separate function
 //       like with the non-sliding window equivalent
 //       once sliding-window hybrid caches are a thing.
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -21,6 +21,7 @@ struct llama_cparams;
 struct llama_memory_context_i;

 class llama_kv_cache_context;
+class llama_ik_cache_context;
 class llama_kv_cache_iswa_context;
 class llama_memory_recurrent_context;
 class llama_memory_hybrid_context;
@ -350,6 +351,39 @@ public:
    const llama_kv_cache_context * mctx;
 };

+// V-less input for the indexer KV cache
+class llm_graph_input_attn_ik : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_ik(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_ik_cache_context * mctx) :
+        hparams(hparams),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_attn_ik() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    bool can_reuse(const llm_graph_params & params) override;
+
+    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
+
+    const llama_hparams hparams;
+    const llama_cparams cparams;
+
+    const llama_ik_cache_context * mctx;
+};
+
+
 class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
 public:
    llm_graph_input_attn_kv_iswa(
@ -921,6 +955,20 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    ggml_tensor * build_attn(
+            llm_graph_input_attn_k * inp,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * sinks, // [n_head_q]
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            ggml_tensor * top_k, // [n_indexer_top_k, n_tokens]
+                  float   kq_scale,
+                    int   il) const;
+
    llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;

    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
@ -952,6 +1000,8 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    std::pair<llm_graph_input_attn_k *, llm_graph_input_attn_ik *> build_attn_inp_k_dsa() const;
+
    //
    // recurrent
    //
--- a/src/llama-ik-cache.cpp
+++ b/src/llama-ik-cache.cpp
--- a/src/llama-ik-cache.h
+++ b/src/llama-ik-cache.h
@ -0,0 +1,306 @@
+#pragma once
+
+#include "llama-kv-cache.h"
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cells.h"
+#include "llama-memory.h"
+
+#include <unordered_map>
+#include <vector>
+
+struct llama_cparams;
+struct llama_hparams;
+struct llama_model;
+struct llama_context;
+
+//
+// llama_ik_cache
+//
+
+class llama_ik_cache : public llama_memory_i {
+public:
+    using stream_copy_info = llama_kv_cache::stream_copy_info;
+    using slot_info = llama_kv_cache::slot_info;
+    using slot_info_vec_t = std::vector<slot_info>;
+
+    llama_ik_cache(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                         bool   unified,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_pad,
+                     uint32_t   n_swa,
+               llama_swa_type   swa_type,
+        const layer_filter_cb & filter,
+        const  layer_reuse_cb & reuse);
+
+    ~llama_ik_cache() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_context_ptr init_full() override;
+
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+    //
+    // llama_ik_cache specific API
+    //
+
+    uint32_t get_size()     const;
+    uint32_t get_n_stream() const;
+
+    bool get_has_shift() const;
+
+    //
+    // graph_build API
+    //
+
+    uint32_t get_n_kv(const slot_info & sinfo) const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
+
+    //
+    // preparation API
+    //
+
+    // find places for the provided ubatches in the cache, returns the slot infos
+    // return empty vector on failure
+    slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
+
+    bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
+
+    // find a slot of kv cells that can hold the ubatch
+    // if cont == true, then the slot must be continuous
+    // return empty slot_info on failure
+    slot_info find_slot(const llama_ubatch & ubatch, bool cont) const;
+
+    // emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
+    void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
+
+    //
+    // input API
+    //
+
+    ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+
+    void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
+
+    void set_input_k_shift(ggml_tensor * dst) const;
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+
+private:
+    const llama_model & model;
+    const llama_hparams & hparams;
+
+    struct kv_layer {
+        // layer index in the model
+        // note: can be different from the layer index in the KV cache
+        uint32_t il;
+
+        ggml_tensor * k;
+
+        std::vector<ggml_tensor *> k_stream;
+    };
+
+    bool v_trans = true;  // the value tensor is transposed
+
+    const uint32_t n_seq_max = 1;
+    const uint32_t n_stream  = 1;
+
+    // required padding
+    const uint32_t n_pad = 1;
+
+    // SWA
+    const uint32_t n_swa = 0;
+
+    // env: LLAMA_KV_CACHE_DEBUG
+    int debug = 0;
+
+    // this is the SWA type of the cache - not to be confused with the model SWA type
+    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    std::vector<uint32_t> v_heads;
+
+    std::vector<llama_kv_cells> v_cells;
+
+    // maps from a sequence id to a stream id
+    std::vector<uint32_t> seq_to_stream;
+
+    // pending stream copies that will be applied during the next update
+    stream_copy_info sc_info;
+
+    std::vector<kv_layer> layers;
+
+    // model layer id -> KV cache layer id
+    std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_tensor * cur,
+                    ggml_tensor * shift,
+                    ggml_tensor * factors,
+                          float   freq_base,
+                          float   freq_scale,
+                       uint32_t   il) const;
+
+    ggml_cgraph * build_graph_shift(
+               llm_graph_result * res,
+                  llama_context * lctx) const;
+
+    struct cell_ranges_t {
+        uint32_t strm;
+
+        std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
+    };
+
+    void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count,       slot_info & sinfo, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, const slot_info & sinfo);
+};
+
+class llama_ik_cache_context : public llama_memory_context_i {
+public:
+    // some shorthands
+    using slot_info_vec_t  = llama_kv_cache::slot_info_vec_t;
+    using stream_copy_info = llama_kv_cache::stream_copy_info;
+
+    // used for errors
+    llama_ik_cache_context(llama_memory_status status);
+
+    // used to create a full-cache context
+    llama_ik_cache_context(
+            llama_ik_cache * kv);
+
+    // used to create an update context
+    llama_ik_cache_context(
+            llama_ik_cache * kv,
+            llama_context * lctx,
+            bool do_shift,
+            stream_copy_info sc_info);
+
+    // used to create a batch processing context from a batch
+    llama_ik_cache_context(
+            llama_ik_cache * kv,
+            slot_info_vec_t sinfos,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_ik_cache_context();
+
+    //
+    // llama_memory_context_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_ik_cache_context specific API
+    //
+
+    uint32_t get_n_kv() const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    //   - k_cur  [n_embd_head_k, n_head_k, n_tokens]
+    //   - k_idxs [n_tokens]
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
+
+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    //   helps understand the implementation logic of cpy_k
+    ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+
+    void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+    void set_input_k_shift   (ggml_tensor * dst) const;
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+
+private:
+    llama_memory_status status;
+
+    llama_ik_cache * kv;
+    llama_context * lctx;
+
+    //
+    // update context
+    //
+
+    bool do_shift = false;
+
+    stream_copy_info sc_info;
+
+    //
+    // batch processing context
+    //
+
+    // the index of the cur ubatch to process
+    size_t i_cur = 0;
+
+    slot_info_vec_t sinfos;
+
+    std::vector<llama_ubatch> ubatches;
+
+    //
+    // data needed for building the compute graph for the current ubatch:
+    //
+
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // as the cache gets filled, the benefit from this heuristic disappears
+    int32_t n_kv;
+};
--- a/src/llama-kv-cache-dsa.cpp
+++ b/src/llama-kv-cache-dsa.cpp
@ -0,0 +1,251 @@
+#include "llama-kv-cache-dsa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_dsa
+//
+
+llama_kv_cache_dsa::llama_kv_cache_dsa(
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                     bool   unified,
+                 uint32_t   kv_size,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_pad,
+                 uint32_t   n_swa,
+           llama_swa_type   swa_type,
+    const layer_filter_cb & filter,
+    const  layer_reuse_cb & reuse) :
+    n_stream(unified ? 1 : n_seq_max) {
+
+    LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_base = std::make_unique<llama_kv_cache>(
+            model, type_k, type_v,
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+
+    LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size);
+
+    kv_ik = std::make_unique<llama_ik_cache>(
+            model, type_k, type_v,
+            v_trans, offload, unified, kv_size, n_seq_max, n_pad,
+            n_swa, swa_type, filter, reuse);
+}
+
+void llama_kv_cache_dsa::clear(bool data) {
+    kv_base->clear(data);
+    kv_ik ->clear(data);
+}
+
+bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    bool res = true;
+
+    res = res & kv_base->seq_rm(seq_id, p0, p1);
+    res = res & kv_ik ->seq_rm(seq_id, p0, p1);
+
+    return res;
+}
+
+void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    kv_ik ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) {
+    kv_base->seq_keep(seq_id);
+    kv_ik ->seq_keep(seq_id);
+}
+
+void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    kv_base->seq_add(seq_id, p0, p1, shift);
+    kv_ik ->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    kv_base->seq_div(seq_id, p0, p1, d);
+    kv_ik ->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const {
+    return kv_base->seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const {
+    return kv_base->seq_pos_max(seq_id);
+}
+
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_dsa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_ik->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) {
+    GGML_UNUSED(embd_all);
+
+    do {
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos_base = kv_base->prepare(ubatches);
+        if (sinfos_base.empty()) {
+            break;
+        }
+
+        auto sinfos_ik = kv_ik->prepare(ubatches);
+        if (sinfos_ik.empty()) {
+            break;
+        }
+
+        assert(sinfos_base.size() == sinfos_ik.size());
+
+        return std::make_unique<llama_kv_cache_dsa_context>(
+                this, std::move(sinfos_base), std::move(sinfos_ik), std::move(ubatches));
+    } while (false);
+
+    return std::make_unique<llama_kv_cache_dsa_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_full() {
+    return std::make_unique<llama_kv_cache_dsa_context>(this);
+}
+
+llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_dsa_context>(this, lctx, optimize);
+}
+
+bool llama_kv_cache_dsa::get_can_shift() const {
+    return kv_base->get_can_shift() &&
+           kv_ik->get_can_shift() &&
+           kv_base->get_size() == kv_ik->get_size();
+}
+
+void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    kv_base->state_write(io, seq_id, flags);
+    kv_ik->state_write(io, seq_id, flags);
+}
+
+void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    kv_base->state_read(io, seq_id, flags);
+    kv_ik->state_read(io, seq_id, flags);
+}
+
+llama_kv_cache * llama_kv_cache_dsa::get_base() const {
+    return kv_base.get();
+}
+
+llama_ik_cache * llama_kv_cache_dsa::get_ik() const {
+    return kv_ik.get();
+}
+
+//
+// llama_kv_cache_dsa_context
+//
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv) :
+    ctx_base(kv->get_base()->init_full()),
+    ctx_ik(kv->get_ik()->init_full()),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv,
+        llama_context * lctx,
+        bool optimize) :
+    ctx_base(kv->get_base()->init_update(lctx, optimize)),
+    ctx_ik(kv->get_ik()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
+}
+
+llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(
+        llama_kv_cache_dsa * kv,
+        slot_info_vec_t sinfos_base,
+        slot_info_vec_t sinfos_ik,
+        std::vector<llama_ubatch> ubatches) :
+    ubatches(std::move(ubatches)),
+    // note: here we copy the ubatches. not sure if this is ideal
+    ctx_base(new llama_kv_cache_context(kv->get_base(), std::move(sinfos_base), this->ubatches)),
+    ctx_ik(new llama_ik_cache_context(kv->get_ik(), std::move(sinfos_ik), this->ubatches)),
+    status(llama_memory_status_combine(ctx_base->get_status(), ctx_ik->get_status())) {
+}
+
+llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default;
+
+bool llama_kv_cache_dsa_context::next() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    ctx_base->next();
+    ctx_ik ->next();
+
+    if (++i_next >= ubatches.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_dsa_context::apply() {
+    assert(!llama_memory_status_is_fail(status));
+
+    bool res = true;
+
+    res = res & ctx_base->apply();
+    res = res & ctx_ik ->apply();
+
+    return res;
+}
+
+llama_memory_status llama_kv_cache_dsa_context::get_status() const {
+    return status;
+}
+
+const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return ubatches[i_next];
+}
+
+const llama_kv_cache_context * llama_kv_cache_dsa_context::get_base() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_context *>(ctx_base.get());
+}
+
+const llama_ik_cache_context * llama_kv_cache_dsa_context::get_ik()  const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_ik_cache_context *>(ctx_ik.get());
+}
--- a/src/llama-kv-cache-dsa.h
+++ b/src/llama-kv-cache-dsa.h
@ -0,0 +1,137 @@
+#pragma once
+
+#include "llama-kv-cache.h"
+#include "llama-ik-cache.h"
+
+#include <vector>
+
+//
+// llama_kv_cache_dsa
+//
+
+// utilizes two KV cache instances: llama_kv_cache and llama_ik_cache
+//   the first instance is for caching key tensors of the model,
+//   the second instance is for caching lightning indexer key tensors
+
+class llama_kv_cache_dsa : public llama_memory_i {
+public:
+    llama_kv_cache_dsa(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                         bool   unified,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_pad,
+                     uint32_t   n_swa,
+               llama_swa_type   swa_type,
+        const layer_filter_cb & filter,
+        const  layer_reuse_cb & reuse);
+
+    ~llama_kv_cache_dsa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_context_ptr init_full() override;
+
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
+
+    //
+    // llama_kv_cache_dsa specific API
+    //
+
+    llama_kv_cache * get_base() const;
+    llama_ik_cache * get_ik () const;
+
+private:
+    const uint32_t n_stream  = 1;
+
+    std::unique_ptr<llama_kv_cache> kv_base;
+    std::unique_ptr<llama_ik_cache> kv_ik;
+};
+
+class llama_kv_cache_dsa_context : public llama_memory_context_i {
+public:
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
+
+    // used for errors
+    llama_kv_cache_dsa_context(llama_memory_status status);
+
+    // used to create a full-cache context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv);
+
+    // used to create an update context
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            llama_context * lctx,
+            bool optimize);
+
+    // used to create a batch processing context from a batch
+    llama_kv_cache_dsa_context(
+            llama_kv_cache_dsa * kv,
+            slot_info_vec_t sinfos_base,
+            slot_info_vec_t sinfos_ik,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_dsa_context();
+
+    //
+    // llama_memory_context_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_dsa_context specific API
+    //
+
+    const llama_kv_cache_context * get_base() const;
+    const llama_ik_cache_context * get_ik()  const;
+
+private:
+    //llama_kv_cache_dsa * kv;
+
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+
+    std::vector<llama_ubatch> ubatches;
+
+    const llama_memory_context_ptr ctx_base;
+    const llama_memory_context_ptr ctx_ik;
+
+    const llama_memory_status status;
+};
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -8,6 +8,7 @@

 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
+#include "llama-kv-cache-dsa.h"
 #include "llama-memory-hybrid.h"
 #include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
@ -144,6 +145,7 @@ const char * llm_type_name(llm_type type) {
        case LLM_TYPE_310B_A15B:     return "310B.A15B";
        case LLM_TYPE_355B_A32B:     return "355B.A32B";
        case LLM_TYPE_397B_A17B:     return "397B.A17B";
+        case LLM_TYPE_685B_A37B:     return "685B.A37B";
        case LLM_TYPE_744B_A40B:     return "744B.A40B";
        case LLM_TYPE_E2B:           return "E2B";
        case LLM_TYPE_E4B:           return "E4B";
@ -1641,6 +1643,56 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_DEEPSEEK32:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,     hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,    hparams.f_norm_rms_eps);
+                hparams.f_norm_eps = 1e-6;  // eps for layer norm
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+                // MoE parameters
+                ml.get_key(LLM_KV_EXPERT_COUNT,                hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT,           hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,   hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+
+                // deepseek MLA parameters
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,      hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,     hparams.n_lora_kv);
+                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k_mla_impl, false);
+                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
+
+                // DSA parameters
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,      hparams.indexer_top_k);
+
+                // Expert gating function
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }
+
+                // NextN/MTP parameters
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,        hparams.nextn_predict_layers, false);
+                GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+                switch (hparams.n_layer) {
+                    case 61: type = LLM_TYPE_685B_A37B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+
        case LLM_ARCH_PLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5032,6 +5084,108 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        }
                    }
                } break;
+            case LLM_ARCH_DEEPSEEK32:
+                {
+                    const bool is_mla = hparams.is_mla();
+                    if (!is_mla) {
+                        throw std::runtime_error("DEEPSEEK32 architecture requires MLA");
+                    }
+
+                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+                    const int64_t q_lora_rank  = hparams.n_lora_q;
+                    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // try to load output.weight, if not found, use token_embd (tied embeddings)
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
+                            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+                        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
+
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
+
+                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
+                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                        // DSA indexer
+                        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+                        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+                        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+                        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+                        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, flags);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {        n_ff_exp * n_expert_shared, n_embd}, flags);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+
+                            // Optional tensors
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+                        }
+                    }
+                } break;
            case LLM_ARCH_PLM:
                {
                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
@ -7975,7 +8129,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
    }

-    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
+    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR || arch == LLM_ARCH_DEEPSEEK32 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead    = %d\n",     __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv             = %d\n",     __func__, hparams.n_lora_kv);
@ -8162,6 +8316,23 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
            {
                res = nullptr;
            } break;
+        case LLM_ARCH_DEEPSEEK32:
+            {
+                res = new llama_kv_cache_dsa(
+                        *this,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.kv_unified,
+                        cparams.n_ctx_seq,
+                        cparams.n_seq_max,
+                        1,
+                        hparams.n_swa,
+                        hparams.swa_type,
+                        nullptr,
+                        nullptr);
+            } break;
        // Models that need standard caching should rely on recurrent/hybrid
        // checks
        default:
@ -8558,6 +8729,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
+        case LLM_ARCH_DEEPSEEK32:
+            {
+                llm = std::make_unique<llm_build_deepseek32>(*this, params);
+            } break;
        case LLM_ARCH_CHATGLM:
            {
                llm = std::make_unique<llm_build_chatglm>(*this, params);
@ -8954,6 +9129,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_DEEPSEEK2OCR:
+        case LLM_ARCH_DEEPSEEK32:
        case LLM_ARCH_PLM:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
--- a/src/llama-model.h
+++ b/src/llama-model.h
@ -135,6 +135,7 @@ enum llm_type {
    LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_397B_A17B, // Qwen3.5
+    LLM_TYPE_685B_A37B, // DeepSeek V3.2
    LLM_TYPE_744B_A40B, // GLM-5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
--- a/src/models/deepseek32.cpp
+++ b/src/models/deepseek32.cpp
@ -0,0 +1,351 @@
+#include "models.h"
+
+#include "llama-kv-cache.h"
+#include "llama-ik-cache.h"
+
+llm_build_deepseek32::llm_build_deepseek32(const llama_model & model, const llm_graph_params & params) :
+    llm_graph_context(params) {
+    const bool is_mla = hparams.is_mla();
+    GGML_ASSERT(is_mla);
+
+    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
+    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
+    GGML_UNUSED(n_embd_head_v);
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+    const int64_t n_indexer_head = hparams.indexer_n_head;
+    const int64_t n_embd_indexer_head = hparams.indexer_head_size;
+    const int64_t n_embd_indexer_head_rope = hparams.n_rot();
+    const int64_t n_embd_indexer_head_nope = n_embd_indexer_head - n_embd_indexer_head_rope;
+    const uint32_t n_indexer_top_k = hparams.indexer_top_k;
+
+    const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // {n_embd, n_tokens}
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    std::pair<llm_graph_input_attn_k*, llm_graph_input_attn_ik*> inp_attn_dsa = build_attn_inp_k_dsa();
+    auto * inp_attn_k = inp_attn_dsa.first;
+    auto * inp_attn_ik = inp_attn_dsa.second;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers;
+    for (int il = 0; il < effective_n_layers; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            ggml_tensor * qr = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+            cb(qr, "qr", il);
+
+            qr = build_norm(qr, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(qr, "qr", il);
+
+            ggml_tensor * top_k = nullptr;
+
+            // lightning indexer
+            {
+                ggml_tensor * indexer_q = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_q_b, qr);
+                cb(indexer_q, "indexer_q", il);
+
+                // split into {n_embd_indexer_head_rope, n_indexer_head, n_tokens}
+                ggml_tensor * indexer_q_pe =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_rope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head), 
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head, 0); 
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // and {n_embd_indexer_head_nope, n_indexer_head, n_tokens}
+                ggml_tensor * indexer_q_nope =
+                    ggml_view_3d(ctx0, indexer_q, n_embd_indexer_head_nope, n_indexer_head, n_tokens,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head), 
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head) * n_indexer_head,
+                                 ggml_row_size(indexer_q->type, n_embd_indexer_head_nope));
+                cb(indexer_q_nope, "indexer_q_nope", il);
+
+                indexer_q_pe = ggml_rope_ext(ctx0, indexer_q_pe, inp_pos, nullptr, n_rot, 
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_q_pe, "indexer_q_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, n_head, n_tokens}
+                indexer_q = ggml_concat(ctx0, indexer_q_pe, indexer_q_nope, 0);
+                cb(indexer_q, "indexer_q", il);
+
+                ggml_tensor * indexer_k = ggml_mul_mat(ctx0, model.layers[il].indexer_attn_k, cur);
+                cb(indexer_k, "indexer_k", il);
+
+                indexer_k = build_norm(indexer_k, model.layers[il].indexer_k_norm, model.layers[il].indexer_k_norm_b, LLM_NORM, il);
+                cb(indexer_k, "indexer_k", il);
+
+                // split into {n_embd_indexer_head_rope, 1, n_tokens}
+                ggml_tensor * indexer_k_pe =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_rope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head), 
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1, 0); 
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // and {n_embd_indexer_head_nope, 1, n_tokens}
+                ggml_tensor * indexer_k_nope =
+                    ggml_view_3d(ctx0, indexer_k, n_embd_indexer_head_nope, 1, n_tokens,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head), 
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head) * 1,
+                                 ggml_row_size(indexer_k->type, n_embd_indexer_head_nope));
+                cb(indexer_k_nope, "indexer_k_nope", il);
+
+                indexer_k_pe = ggml_rope_ext(ctx0, indexer_k_pe, inp_pos, nullptr, n_rot, 
+                                     LLAMA_ROPE_TYPE_NEOX, n_ctx_orig, freq_base, freq_scale,
+                                     ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(indexer_k_pe, "indexer_k_pe", il);
+
+                // {n_embd_indexer_head_rope + n_embd_indexer_head_nope, 1, n_tokens}
+                indexer_k = ggml_concat(ctx0, indexer_k_pe, indexer_k_nope, 0);
+                cb(indexer_k, "indexer_k", il);
+
+                // perform Hadamard transform on indexer q and k
+                indexer_q = ggml_hadamard(ctx0, indexer_q, n_embd_indexer_head);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_hadamard(ctx0, indexer_k, n_embd_indexer_head);
+                cb(indexer_k, "indexer_k", il);
+
+                // store indexer keys to KV cache
+                const auto * mctx_cur = inp_attn_ik->mctx;
+                const auto & k_idxs = inp_attn_ik->get_k_idxs();
+                ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, indexer_k, k_idxs, il));
+
+                // prepare indexer weights
+                ggml_tensor * indexer_weights = ggml_mul_mat(ctx0, model.layers[il].indexer_proj, cur);
+                cb(indexer_weights, "indexer_weights", il);
+
+                indexer_weights = ggml_scale(ctx0, indexer_weights, 1.0f / sqrtf(float(n_indexer_head)));
+                cb(indexer_weights, "indexer_weights", il);
+
+                // get cached indexer keys
+                indexer_k = mctx_cur->get_k(ctx0, il);
+
+                // split the batch into streams if needed
+                const auto n_stream = indexer_k->ne[3];
+                indexer_q = ggml_view_4d(ctx0, indexer_q, indexer_q->ne[0], indexer_q->ne[1], indexer_q->ne[2]/n_stream, n_stream, indexer_q->nb[1], indexer_q->nb[2], indexer_q->nb[3]/n_stream, 0);
+                indexer_weights = ggml_view_4d(ctx0, indexer_weights, indexer_weights->ne[0], indexer_weights->ne[1]/n_stream, indexer_weights->ne[2], n_stream, indexer_weights->nb[1], indexer_weights->nb[2]/n_stream, indexer_weights->nb[3]/n_stream, 0);
+
+                // calculate indexer kq
+                indexer_q = ggml_permute(ctx0, indexer_q, 0, 2, 1, 3);
+                cb(indexer_q, "indexer_q", il);
+                indexer_k = ggml_permute(ctx0, indexer_k, 0, 2, 1, 3);
+                cb(indexer_k, "indexer_k", il);
+
+                ggml_tensor * indexer_kq = ggml_mul_mat(ctx0, indexer_k, indexer_q);
+                cb(indexer_kq, "indexer_kq", il);
+
+                // ReLU requires contiguous tensors
+                indexer_kq = ggml_cont(ctx0, ggml_permute(ctx0, indexer_kq, 2, 1, 0, 3));
+                cb(indexer_kq, "indexer_kq", il);
+
+                // apply ReLU
+                ggml_tensor * indexer_score = ggml_relu(ctx0, indexer_kq);
+                cb(indexer_score, "indexer_score", il);
+
+                // multiply scores by indexer weights
+                indexer_score = ggml_mul(ctx0, indexer_score, indexer_weights);
+                cb(indexer_score, "indexer_score", il);
+
+                // sum by q n_indexer_head dimension
+                indexer_score = ggml_sum_rows(ctx0, indexer_score);
+                cb(indexer_score, "indexer_score", il);
+
+                indexer_score = ggml_permute(ctx0, indexer_score, 2, 1, 0, 3);
+                cb(indexer_score, "indexer_score", il);
+
+                indexer_score = ggml_cont(ctx0, indexer_score);
+                cb(indexer_score, "indexer_score", il);
+
+                // TODO maybe pre-scale indexer weights, so we won't have to do it here
+                indexer_score = ggml_scale(ctx0, indexer_score, 1.0f / sqrtf(float(n_embd_indexer_head)));
+                cb(indexer_score, "indexer_score", il);
+
+                // mask indexer scores
+                ggml_tensor * indexer_kq_mask = inp_attn_ik->get_kq_mask();
+                indexer_score = ggml_add(ctx0, indexer_score, indexer_kq_mask);
+                cb(indexer_score, "indexer_score", il);
+
+                // get indices of top k indexer scores
+                uint32_t n_top_k = indexer_score->ne[0] < n_indexer_top_k ? indexer_score->ne[0] : n_indexer_top_k;
+                top_k = ggml_cont(ctx0, ggml_top_k(ctx0, indexer_score, n_top_k));
+                cb(top_k, "top_k", il);
+            }
+
+            ggml_tensor * q = ggml_mul_mat(ctx0, model.layers[il].wq_b, qr);
+            cb(q, "q", il);
+
+            // split into {n_embd_head_qk_nope, n_head, n_tokens}
+            ggml_tensor * q_nope =
+                ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                             ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+            cb(q_nope, "q_nope", il);
+
+            // and {n_embd_head_qk_rope, n_head, n_tokens}
+            ggml_tensor * q_pe = ggml_view_3d(
+                ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+            cb(q_pe, "q_pe", il);
+
+            ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+            cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+            // split into {kv_lora_rank, n_tokens}
+            ggml_tensor * kv_cmpr =
+                ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+                             ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // and {n_embd_head_qk_rope, 1, n_tokens}
+            ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+            cb(k_pe, "k_pe", il);
+
+            q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(q_pe, "q_pe", il);
+
+            k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                 ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(k_pe, "k_pe", il);
+
+            kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+            cb(kv_cmpr, "kv_cmpr", il);
+
+            // MLA attention
+            {
+                // {n_embd_head_qk_nope, n_tokens, n_head}
+                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+                cb(q_nope, "q_nope_perm", il);
+
+                // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+                ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+                cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+                // {kv_lora_rank, n_head, n_tokens}
+                q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+                cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+                // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+                // note: rope must go first for in-place context shifting in build_rope_shift()
+                ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
+                cb(Qcur, "Qcur", il);
+
+                kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+                cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+                // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+                ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
+                cb(Kcur, "Kcur", il);
+
+                // {kv_lora_rank, 1, n_tokens}
+                ggml_tensor * Vcur = kv_cmpr;
+                cb(Vcur, "Vcur", il);
+
+                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+                cur = build_attn(inp_attn_k,
+                        model.layers[il].wo, NULL,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, top_k, kq_scale, il);
+            }
+        }
+        if (il == effective_n_layers - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        if ((uint32_t) il < hparams.n_layer_dense_lead) {
+            cur = build_ffn(cur,
+                model.layers[il].ffn_up, NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            ggml_tensor * moe_out = build_moe_ffn(cur,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                model.layers[il].ffn_gate_exps,
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, hparams.expert_weights_norm,
+                hparams.expert_weights_scale,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il,
+                nullptr,
+                model.layers[il].ffn_gate_up_exps);
+            cb(moe_out, "ffn_moe_out", il);
+
+            // FFN shared expert
+            {
+                ggml_tensor * ffn_shexp =
+                    build_ffn(cur,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(ffn_shexp, "ffn_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                cb(cur, "ffn_out", il);
+            }
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
--- a/src/models/models.h
+++ b/src/models/models.h
@ -186,12 +186,16 @@ struct llm_build_deci : public llm_graph_context {
    llm_build_deci(const llama_model & model, const llm_graph_params & params);
 };

+struct llm_build_deepseek : public llm_graph_context {
+    llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_deepseek2 : public llm_graph_context {
    llm_build_deepseek2(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_deepseek : public llm_graph_context {
-    llm_build_deepseek(const llama_model & model, const llm_graph_params & params);
+struct llm_build_deepseek32 : public llm_graph_context {
+    llm_build_deepseek32(const llama_model & model, const llm_graph_params & params);
 };

 struct llm_build_dots1 : public llm_graph_context {
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -6655,6 +6655,97 @@ struct test_diag : public test_case {
    }
 };

+// GGML_OP_HADAMARD
+struct test_hadamard : public test_case {
+    const ggml_type type_a;
+    const std::array<int64_t, 4> ne_a;
+    int nh;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type_a, ne_a, nh);
+    }
+
+    test_hadamard(ggml_type type_a = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
+            int nh = 128)
+        : type_a(type_a), ne_a(ne_a), nh(nh) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type_a, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_hadamard(ctx, a, nh);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -1.0f, 1.0f);
+        }
+    }
+};
+
+// GGML_OP_SCATTER
+struct test_scatter : public test_case {
+    const ggml_type type_a;
+    const ggml_type type_ids;
+    const std::array<int64_t, 4> ne_a;
+    const std::array<int64_t, 4> ne_ids;
+    float c;
+    bool inplace;
+
+    std::string vars() override {
+        return VARS_TO_STR6(type_a, type_ids, ne_a, ne_ids, c, inplace);
+    }
+
+    test_scatter(ggml_type type_a = GGML_TYPE_F32,
+            ggml_type type_ids = GGML_TYPE_I32,
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
+            std::array<int64_t, 4> ne_ids = {3, 10, 10, 10},
+            float c = 2.0f,
+            bool inplace = false)
+        : type_a(type_a), type_ids(type_ids), ne_a(ne_a), ne_ids(ne_ids), c(c), inplace(inplace) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type_a, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * ids = ggml_new_tensor(ctx, type_ids, 4, ne_ids.data());
+        ggml_set_param(ids);
+        ggml_set_name(ids, "ids");
+
+        ggml_tensor * out;
+        if (inplace) {
+            out = ggml_scatter_inplace(ctx, a, ids, c);
+        } else {
+            out = ggml_scatter(ctx, a, ids, c);
+        }
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                // ids
+                const int num_pos_ids = ggml_nelements(t);
+                std::vector<int32_t> data(num_pos_ids);
+                for (int i = 0; i < num_pos_ids; i++) {
+                    data[i] = rand() % ne_a[0];
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 // Deserializable generic test case
 struct input_tensor {
    ggml_type type;
@ -8717,6 +8808,19 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_falcon(2));
 #endif

+    // hadamard
+    test_cases.emplace_back(new test_hadamard());
+
+    // scatter
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, false));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, false));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 1, 1, 1}, {3, 1, 1, 1}, 0.0f, false));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {10, 10, 10, 10}, {3, 10, 10, 10}, 0.0f, false));
+
    return test_cases;
 }
 #ifdef _MSC_VER
@ -8992,6 +9096,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 128, 1024, 1)); // 4h PP-1024
    test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 32, 128, 64, 1, 1, false, true)); // KDA PP-64

+    // hadamard
+    test_cases.emplace_back(new test_hadamard());
+
+    // scatter
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F32, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, false));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, true));
+    test_cases.emplace_back(new test_scatter(GGML_TYPE_F16, GGML_TYPE_I32, {65536, 1, 1, 1}, {2048, 1, 1, 1}, 0.0f, false));
+
    return test_cases;
 }