From 631cbfcc7a21869e2f3f6b78ed88e9863cc5a862 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Jan 2026 09:15:27 +0200 Subject: [PATCH] cuda : fix "V is K view" check for non-unified KV cache (#19145) --- common/arg.cpp | 5 +++-- ggml/src/ggml-cuda/fattn-common.cuh | 2 +- ggml/src/ggml-cuda/fattn.cu | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 2f68bdc1c0..cd3a1b6397 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1295,9 +1295,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-kvu", "--kv-unified"}, + {"-no-kvu", "--no-kv-unified"}, "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)", - [](common_params & params) { - params.kv_unified = true; + [](common_params & params, bool value) { + params.kv_unified = value; } ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED})); add_opt(common_arg( diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 3d7daccfdf..b6a7460da8 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -789,7 +789,7 @@ void launch_fattn( const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; - const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src); + const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs)); const ggml_tensor * mask = dst->src[3]; const ggml_tensor * sinks = dst->src[4]; diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index fe18ff6c7d..195904ee20 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -310,7 +310,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } } - const bool V_is_K_view = V->view_src && V->view_offs == 0 && (V->view_src == K || V->view_src == K->view_src); + const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs)); const int cc = ggml_cuda_info().devices[device].cc;