Move KIMI_LINEAR to llm_arch_is_hybrid to enable KV cache
parent f67a42d572
commit f85e5c73b9
@@ -2863,7 +2863,6 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
-        case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state
             return true;
         default:
             return false;
@@ -2880,9 +2879,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
         case LLM_ARCH_QWEN3NEXT:
-        // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache
-        // TODO: Enable hybrid when MLA KV caching is implemented
-        // case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_KIMI_LINEAR:
            return true;
        default:
            return false;
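
After this change, KIMI_LINEAR answers true to llm_arch_is_hybrid and false to llm_arch_is_recurrent, which lets the runtime set up an attention KV cache alongside the recurrent state used by the KDA layers. Below is a minimal sketch of how such a classification could drive cache selection; llm_arch, is_hybrid, pick_memory, and memory_kind here are simplified, hypothetical stand-ins, not llama.cpp's actual types or cache-selection API.

#include <cstdio>

// Hypothetical arch tags standing in for the llm_arch enumerators above.
enum class llm_arch { KIMI_LINEAR, RWKV7, LLAMA };

// Minimal stubs mirroring the two predicates this commit edits.
static bool is_recurrent(llm_arch arch) { return arch == llm_arch::RWKV7; }
static bool is_hybrid   (llm_arch arch) { return arch == llm_arch::KIMI_LINEAR; }

// Illustrative memory classification; not llama.cpp's real API.
enum class memory_kind { kv_cache, recurrent, hybrid };

// Hybrid must be tested before recurrent: a hybrid arch needs both an
// attention KV cache and a recurrent-state buffer, so classifying it as
// recurrent-only (the situation before this commit) skips KV cache setup.
static memory_kind pick_memory(llm_arch arch) {
    if (is_hybrid(arch))    return memory_kind::hybrid;
    if (is_recurrent(arch)) return memory_kind::recurrent;
    return memory_kind::kv_cache;
}

int main() {
    // Prints 1: KIMI_LINEAR now takes the hybrid path, so a KV cache is allocated.
    std::printf("KIMI_LINEAR -> hybrid? %d\n",
                pick_memory(llm_arch::KIMI_LINEAR) == memory_kind::hybrid);
    return 0;
}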