From f85e5c73b91370d605f4835c2f6112fb729006cd Mon Sep 17 00:00:00 2001
From: Yee Man Chan
Date: Fri, 2 Jan 2026 21:20:34 +0800
Subject: [PATCH] Move KIMI_LINEAR to llm_arch_is_hybrid to enable KV cache

---
 src/llama-arch.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6aabdb7f7d..cf5ea1177f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -2863,7 +2863,6 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
         case LLM_ARCH_RWKV6QWEN2:
         case LLM_ARCH_RWKV7:
         case LLM_ARCH_ARWKV7:
-        case LLM_ARCH_KIMI_LINEAR: // KDA layers use delta attention with recurrent state
             return true;
         default:
             return false;
@@ -2880,9 +2879,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_LFM2MOE:
         case LLM_ARCH_NEMOTRON_H:
         case LLM_ARCH_QWEN3NEXT:
-        // Kimi: Currently using recurrent-only mode since MLA doesn't use KV cache
-        // TODO: Enable hybrid when MLA KV caching is implemented
-        // case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_KIMI_LINEAR:
             return true;
         default:
             return false;