From 3ba12fed0a50af94bd9cfdea6f0b59e5aba8ed4a Mon Sep 17 00:00:00 2001
From: Erik Scholz <Green-Sky@users.noreply.github.com>
Date: Wed, 8 Apr 2026 15:08:57 +0200
Subject: [PATCH] kv-cache : extend cache quantization checks (#21586)

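Run the K and V cache quantization block-size checks whenever flash attention is not disabled (i.e. explicitly enabled or auto), instead of only when it is set to auto.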
---
 src/llama-context.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a808e3e454..cf29bad8ea 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2942,7 +2942,7 @@ llama_context * llama_init_from_model(
         params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
     }
 
-    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
+    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
         const uint32_t blck_size = ggml_blck_size(params.type_k);
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
             if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
@@ -2953,7 +2953,7 @@ llama_context * llama_init_from_model(
         }
     }
 
-    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
+    if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
         const uint32_t blck_size = ggml_blck_size(params.type_v);
         for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
             if (model->hparams.n_embd_head_v(il) % blck_size != 0) {