diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 48da68fe7e..888f00c2e8 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -539,6 +539,7 @@ extern "C" {
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
+        GGML_OP_KDA_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
@@ -2337,6 +2338,28 @@ extern "C" {
             struct ggml_tensor  * C,
             struct ggml_tensor  * ids);
 
+    // KDA (Kimi Delta Attention) scan
+    // Delta attention recurrence:
+    //   h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t]))
+    //   o[t] = q[t]^T @ h[t]
+    // Parameters:
+    //   h:    hidden state {head_dim, head_dim, n_head, n_seqs+}
+    //   q:    query        {head_dim, n_head, n_seq_tokens, n_seqs}
+    //   k:    key          {head_dim, n_head, n_seq_tokens, n_seqs}
+    //   v:    value        {head_dim, n_head, n_seq_tokens, n_seqs}
+    //   g:    gate         {head_dim, n_head, n_seq_tokens, n_seqs}
+    //   beta: mixing       {n_head, n_seq_tokens, n_seqs}
+    //   ids:  seq indices  {n_seqs}
+    GGML_API struct ggml_tensor * ggml_kda_scan(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * h,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * g,
+            struct ggml_tensor  * beta,
+            struct ggml_tensor  * ids);
+
     // partition into non-overlapping windows with padding if needed
     // example:
     // a:   768   64  64  1
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index cfe541fc41..8774ef2dfd 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -498,6 +498,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
             "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
             "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
+            "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi
         ),
 
         MODEL_TENSOR.FFN_GATE_CHEXP: (