diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 888f00c2e8..48da68fe7e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -539,7 +539,6 @@ extern "C" {
         GGML_OP_FLASH_ATTN_BACK,
         GGML_OP_SSM_CONV,
         GGML_OP_SSM_SCAN,
-        GGML_OP_KDA_SCAN,
         GGML_OP_WIN_PART,
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
@@ -2338,28 +2337,6 @@ extern "C" {
             struct ggml_tensor * C,
             struct ggml_tensor * ids);
 
-    // KDA (Kimi Delta Attention) scan
-    // Delta attention recurrence:
-    //   h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t]))
-    //   o[t] = q[t]^T @ h[t]
-    // Parameters:
-    //   h:    hidden state {head_dim, head_dim, n_head, n_seqs+}
-    //   q:    query  {head_dim, n_head, n_seq_tokens, n_seqs}
-    //   k:    key    {head_dim, n_head, n_seq_tokens, n_seqs}
-    //   v:    value  {head_dim, n_head, n_seq_tokens, n_seqs}
-    //   g:    gate   {head_dim, n_head, n_seq_tokens, n_seqs}
-    //   beta: mixing {n_head, n_seq_tokens, n_seqs}
-    //   ids:  seq indices {n_seqs}
-    GGML_API struct ggml_tensor * ggml_kda_scan(
-            struct ggml_context * ctx,
-            struct ggml_tensor * h,
-            struct ggml_tensor * q,
-            struct ggml_tensor * k,
-            struct ggml_tensor * v,
-            struct ggml_tensor * g,
-            struct ggml_tensor * beta,
-            struct ggml_tensor * ids);
-
     // partition into non-overlapping windows with padding if needed
     // example:
     // a: 768 64 64 1
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 4cc15b0981..3247af8bb0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2320,7 +2320,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_FLASH_ATTN_BACK:
         case GGML_OP_SSM_CONV:
         case GGML_OP_SSM_SCAN:
-        case GGML_OP_KDA_SCAN:
         case GGML_OP_RWKV_WKV6:
         case GGML_OP_GATED_LINEAR_ATTN:
         case GGML_OP_RWKV_WKV7:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index bbd7810dd2..eb2e273110 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4503,11 +4503,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 return op->src[0]->ne[0] == 16 && op->src[0]->ne[1] == 1 && op->src[0]->ne[2] % 128 == 0 && op->src[4]->ne[1] == 1;
             }
         }
-        case GGML_OP_KDA_SCAN: {
-            // KDA scan kernel supports head_dim 64 or 128
-            const int64_t head_dim = op->src[0]->ne[0];
-            return head_dim == 64 || head_dim == 128;
-        }
         case GGML_OP_SSM_CONV: {
             // assumes d_inner % threads == 0
             return op->src[0]->ne[1] % 128 == 0;
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index a167d6a574..173ec6b98f 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -999,7 +999,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "FLASH_ATTN_BACK",
     "SSM_CONV",
     "SSM_SCAN",
-    "KDA_SCAN",
     "WIN_PART",
     "WIN_UNPART",
     "GET_REL_POS",
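
For reference, the deleted header comment documents the delta-attention recurrence that the removed op computed: h[t] = exp(g[t]) * h[t-1] + k[t]^T * (beta[t] * (v[t] - h[t-1] @ k[t])), o[t] = q[t]^T @ h[t]. The sketch below is only a naive single-head, single-sequence reading of that formula, not the removed CPU/CUDA kernel; the helper name kda_scan_ref, the row-major [key_dim x value_dim] state layout, and the choice to broadcast the decay exp(g[t]) along the key dimension are assumptions made here for illustration.

#include <math.h>
#include <stddef.h>

// Illustrative sketch only: one head, one sequence, float32.
// h is a d x d state matrix stored row-major, rows = key dim, cols = value dim
// (layout assumed; the removed op does not pin this down in the header comment).
static void kda_scan_ref(
        float       * h,     // [d*d] recurrent state, updated in place
        float       * o,     // [T*d] output
        const float * q,     // [T*d] query
        const float * k,     // [T*d] key
        const float * v,     // [T*d] value
        const float * g,     // [T*d] log-space decay gate
        const float * beta,  // [T]   mixing coefficient
        size_t d, size_t T) {
    for (size_t t = 0; t < T; ++t) {
        const float * qt = q + t*d;
        const float * kt = k + t*d;
        const float * vt = v + t*d;
        const float * gt = g + t*d;

        for (size_t j = 0; j < d; ++j) {
            // read-out of the previous state along k: (h[t-1] @ k[t])_j
            float r = 0.0f;
            for (size_t i = 0; i < d; ++i) {
                r += h[i*d + j]*kt[i];
            }
            // delta-rule correction, scaled by beta[t]
            const float u = beta[t]*(vt[j] - r);

            float oj = 0.0f;
            for (size_t i = 0; i < d; ++i) {
                // decay the old state and add the rank-1 update k[t]^T * u
                h[i*d + j] = expf(gt[i])*h[i*d + j] + kt[i]*u;
                // o[t] = q[t]^T @ h[t], using the freshly updated state
                oj += qt[i]*h[i*d + j];
            }
            o[t*d + j] = oj;
        }
    }
}

The removed ggml_kda_scan additionally batched this recurrence over n_head and n_seqs and selected per-sequence states via the ids tensor (see the deleted parameter shapes above); the sketch omits that bookkeeping.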