From 8ef5f900db54408e659a3da242b431b3f8586245 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 7 Dec 2025 12:52:25 +0200
Subject: [PATCH] cont : fixes

---
 common/arg.cpp                                 |  2 +-
 ggml/src/ggml-cuda/cumsum.cu                   |  4 ++-
 src/llama-graph.cpp                            | 36 ++++++++++++++++---
 src/llama-graph.h                              |  1 +
 src/llama-model.cpp                            |  2 ++
 src/llama-sampling.cpp                         | 18 ++++------
 .../server/tests/unit/test_chat_completion.py  |  2 +-
 tools/server/tests/unit/test_completion.py     | 10 +++---
 8 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 7699c048b7..888c52eaa1 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1286,7 +1286,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.top_k = value;
             params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
         }
-    ).set_sparam());
+    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
     add_opt(common_arg(
         {"--top-p"}, "N",
         string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu
index 7aeee449ef..27e185bc58 100644
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -149,6 +149,7 @@ static __global__ void cumsum_kernel(
     }
 }
 
+#ifdef GGML_CUDA_USE_CUB
 template <typename T>
 static void cumsum_cub(ggml_cuda_pool & pool,
     const T * src,
@@ -171,6 +172,7 @@ static void cumsum_cub(ggml_cuda_pool & pool,
     // Perform the inclusive scan
     cub::DeviceScan::InclusiveSum((void *) tmp_alloc.get(), tmp_size, src, dst, ne, stream);
 }
+#endif // GGML_CUDA_USE_CUB
 
 template <typename T>
 static void cumsum_cuda(
@@ -188,7 +190,7 @@ static void cumsum_cuda(
 
     if (is_contiguous) {
         use_cub = true;
-        int64_t nrows = ne01 * ne02 * ne03;
+        int64_t nrows = ne01 * ne02 * ne03; // TODO: Compare with DeviceSegmentedScan::InclusiveSegmentedSum for nrows > 1 once InclusiveSegmentedSum is released
 
         // Heuristics were determined as part of https://github.com/ggml-org/llama.cpp/pull/17004
         if (((nrows == 1) && (ne00 > 1024)) || (ne00 / nrows > 4096)) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 2e37748f1a..00e7f906e0 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -537,6 +537,38 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_result::set_outputs() {
+    if (t_logits != nullptr) {
+        ggml_set_output(t_logits);
+    }
+    if (t_embd != nullptr) {
+        ggml_set_output(t_embd);
+    }
+    if (t_embd_pooled != nullptr) {
+        ggml_set_output(t_embd_pooled);
+    }
+    for (auto & [seq_id, t] : t_sampled) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_sampled_probs) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_sampled_logits) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+    for (auto & [seq_id, t] : t_candidates) {
+        if (t != nullptr) {
+            ggml_set_output(t);
+        }
+    }
+}
+
 bool llm_graph_result::can_reuse(const llm_graph_params & params) {
     if (!this->params.allow_reuse(params)) {
         if (debug > 1) {
@@ -2100,25 +2132,21 @@ void llm_graph_context::build_sampling() const {
         sampler->iface->backend_apply(sampler, ctx0, gf, &data);
 
         if (data.sampled != nullptr) {
-            ggml_set_output(data.sampled);
             res->t_sampled[seq_id] = data.sampled;
             ggml_build_forward_expand(gf, data.sampled);
         }
 
         if (data.probs != nullptr) {
-            ggml_set_output(data.probs);
             res->t_sampled_probs[seq_id] = data.probs;
             ggml_build_forward_expand(gf, data.probs);
         }
 
         if (data.logits != nullptr) {
-            ggml_set_output(data.logits);
             res->t_sampled_logits[seq_id] = data.logits;
             ggml_build_forward_expand(gf, data.logits);
         }
 
         if (data.candidates != nullptr) {
-            ggml_set_output(data.candidates);
             res->t_candidates[seq_id] = data.candidates;
             ggml_build_forward_expand(gf, data.candidates);
         }
diff --git a/src/llama-graph.h b/src/llama-graph.h
index b44bb109f2..006cae3c84 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -523,6 +523,7 @@ public:
     void reset();
 
     void set_inputs(const llama_ubatch * ubatch);
+    void set_outputs();
 
     // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 7a7c581c73..9e05cf0a80 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7581,6 +7581,8 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
 
+    llm->res->set_outputs();
+
     return llm->res->get_gf();
 }
 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index b961dcf487..ec1812b067 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1367,7 +1367,7 @@ static void llama_sampler_top_p_backend_apply(
         struct llama_sampler_data * data) {
     auto * sctx = (llama_sampler_top_p *) smpl->ctx;
 
-    auto ggml_sort = [& ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
+    auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
         GGML_ASSERT(ggml_nrows(a) == 1);
         struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]);
         struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b);
@@ -1386,7 +1386,7 @@
     ggml_set_name(softmax, "top_p_softmax");
 
     // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates.
-    if (data->candidates != nullptr) {
+    if (data->candidates) {
         data->candidates = ggml_sort(data->candidates, sorted_idx);
     } else {
         data->candidates = sorted_idx;
@@ -1412,8 +1412,9 @@
     // Make top-p inclusive (i.e. return all values such that cum_sum/cdf >= p)
     struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]);
     // construct ones tensor to set the value in the mask
-    struct ggml_tensor * ones = ggml_dup_tensor(ctx, mask_reshaped);
-    ones = ggml_clamp(ctx, ones, 1.0f, 1.0f);
+    struct ggml_tensor * ones = ggml_clamp(ctx, mask_reshaped, 1.0f, 1.0f);
+    ggml_set_name(ones, "top_p_ones");
+
     mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, ggml_repeat(ctx, idxf, mask), GGML_TYPE_I32));
 
     mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
@@ -1780,12 +1781,7 @@ static void llama_sampler_backend_temp_sampling(
         return;
     }
 
-    struct ggml_tensor * scaled = ggml_scale(ctx, data->logits, 1.0f / temp);
-    ggml_set_name(scaled, "temp_scaled");
-
-    // Make sure the scaled tensor is contiguous for subsequent operations
-    data->logits = ggml_cont(ctx, scaled);
-    ggml_set_name(data->logits, "temp_scaled_logits");
+    data->logits = ggml_scale(ctx, data->logits, 1.0f / temp);
 
     GGML_UNUSED(gf);
 }
@@ -3278,7 +3274,7 @@
     }
 
     // Add the sparse logit logit_bias to the logits
-    struct ggml_tensor * logit_biased = ggml_add_inplace(ctx, data->logits, sctx->inp_logit_bias);
+    struct ggml_tensor * logit_biased = ggml_add(ctx, data->logits, sctx->inp_logit_bias);
 
     data->logits = logit_biased;
 }
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index c4b142f71a..b22f82ae26 100644
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -18,7 +18,7 @@ def create_server():
     (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.|Timmy", 77, 8, "length", False, None),
     (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.|Timmy", 77, 8, "length", True, None),
     (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.|Timmy", 77, 8, "length", True, 'chatml'),
-    (None, "Book", "What is the best book", 8, "^ blue|very teaful", 23, 8, "length", True, "This is not a chat template, it is"),
+    (None, "Book", "What is the best book", 8, "^ blue|very teaful|very busy", 23, 8, "length", True, "This is not a chat template, it is"),
     ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger|shake)+", 104, 128, "length", False, None),
     ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger|shake)+", 104, 128, "length", True, None),
     (None, "Book", [{"type": "text", "text": "What is"}, {"type": "text", "text": "the best book"}], 8, "Whillicter|Some", 79, 8, "length", False, None),
diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py
index 4291715af5..f146d6475c 100644
--- a/tools/server/tests/unit/test_completion.py
+++ b/tools/server/tests/unit/test_completion.py
@@ -17,7 +17,7 @@ def create_server():
     server = ServerPreset.tinyllama2()
 
 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
-    ("I believe the meaning of life is", 8, "(going|bed)+|froze and every", 18, 8, False, False),
+    ("I believe the meaning of life is", 8, "(going|bed)+|froze and every|froze and bri", 18, 8, False, False),
     ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
 ])
 def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
@@ -42,7 +42,7 @@ def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int,
 
 
 @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
-    ("I believe the meaning of life is", 8, "(going|bed)+|froze and every", 18, 8, False),
+    ("I believe the meaning of life is", 8, "(going|bed)+|froze and every|froze and bri", 18, 8, False),
     ("Write a joke about AI from a very long prompt which will not be truncated", 64, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
 ])
 def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
@@ -103,7 +103,7 @@ def test_completion_with_openai_library():
     assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
     assert res.choices[0].finish_reason == "length"
     assert res.choices[0].text is not None
-    assert match_regex("(going|bed)+|froze and every", res.choices[0].text)
+    assert match_regex("(going|bed)+|froze and every|froze and bri", res.choices[0].text)
 
 
 def test_completion_stream_with_openai_library():
@@ -122,7 +122,7 @@ def test_completion_stream_with_openai_library():
         if choice.finish_reason is None:
             assert choice.text is not None
             output_text += choice.text
-    assert match_regex("(going|bed)+|froze and every", output_text)
+    assert match_regex("(going|bed)+|froze and every|froze and bri", output_text)
 
 
 # Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@@ -146,7 +146,7 @@ def test_completion_stream_with_openai_library_stops():
         if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
-    assert match_regex("Sure, here's one for[\\s\\S]*|Sure thing..Why don't", output_text), f'Unexpected output: {output_text}'
+    assert match_regex("Sure, here's one for[\\s\\S]*|Sure thing..Why don't|Sure! Here's one for you:", output_text), f'Unexpected output: {output_text}'
 
 
 @pytest.mark.parametrize("n_slots", [1, 2])