sampling : fix copying both sampled tokens and logits/probs from backend
This commit fixes an issue where both the sampled tokens and the logits/probs were not being copied correctly from the backend to the host when multiple backend samplers were used: previously they were always copied together, whereas only one of the two is needed depending on whether a backend sampler actually sampled a token. A test for this scenario is also added to verify that both kinds of data are copied correctly when different backend samplers are employed.
parent ae23d2d2c1
commit 9e273f7aa4
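For context, a minimal sketch (not part of this commit) of how a caller can tell the two cases apart after this change, using the accessors exercised by the new test below; `ctx` and `batch_idx` are assumed to come from a prior llama_decode call:

// minimal sketch: after decoding, a given output row either carries a
// backend-sampled token or backend-sampled logits/probs, not both.
static void inspect_backend_output(llama_context * ctx, int32_t batch_idx) {
    llama_token token = llama_get_backend_sampled_token_ith(ctx, batch_idx);
    if (token != LLAMA_TOKEN_NULL) {
        // a backend sampler produced a final token; logits/probs were not copied
        printf("backend sampled token id=%d\n", token);
    } else {
        // no backend-sampled token; logits/probs were copied for CPU-side sampling
        float * logits   = llama_get_backend_sampled_logits_ith(ctx, batch_idx);
        size_t  n_logits = llama_get_backend_sampled_logits_count_ith(ctx, batch_idx);
        printf("backend provided %zu logits at %p\n", n_logits, (void *) logits);
    }
}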
@@ -1470,16 +1470,14 @@ int llama_context::decode(const llama_batch & batch_inp) {
     const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);
     const auto stride = n_vocab;
 
-    // async copy the sampled tokens from the backend to the host.
-    copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
-
-    // async copy the sampled logits from the backend to the host.
-    copy_tensor_async_floats(res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
-
-    // async copy the sampled probabilities from the backend to the host.
-    copy_tensor_async_floats(res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
+    // If a backend sampler has sampled a token we only want to copy the
+    // sampled tokens and avoid copying logits and probabilities.
+    if (!res->t_sampled.empty()) {
+        // async copy the sampled tokens from the backend to the host.
+        copy_tensor_async_ints(res->t_sampled, sampling.sampled, sampling.sampled_size, seq_to_output_row, sched.get());
+    } else {
+        // async copy the sampled logits/probs from the backend to the host.
+        copy_tensor_async_floats(res->t_sampled_logits, sampling.logits, stride, sampling.logits_count, seq_to_output_row, sched.get());
+        copy_tensor_async_floats(res->t_sampled_probs, sampling.probs, stride, sampling.probs_count, seq_to_output_row, sched.get());
+    }
 
     // async copy the candidate token ids from the backend to the host.
     // These are needed by CPU samplers to map probability/logit indices to vocab token ids.
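The `copy_tensor_async_*` helpers themselves are outside this hunk; for orientation, a simplified sketch of what an async device-to-host copy of the sampled-token tensor could look like with the public ggml-backend API (the real helpers additionally walk the seq-to-output-row mapping and per-row sizes):

// simplified sketch, assuming the public ggml-backend API; the actual
// copy_tensor_async_ints also maps each sequence to its output row.
static void copy_sampled_async_sketch(ggml_backend_sched_t sched,
                                      struct ggml_tensor * t_sampled,
                                      llama_token * host_dst) {
    // look up which backend holds the tensor so the copy runs on its stream
    ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, t_sampled);
    // enqueue an async device-to-host copy; valid once the backend is synchronized
    ggml_backend_tensor_get_async(backend, t_sampled, host_dst, 0, ggml_nbytes(t_sampled));
}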
@@ -573,6 +573,62 @@ static void test_backend_logit_bias_sampling(const char * model_path) {
     GGML_ASSERT(backend_token == bias_token);
 }
 
+// This test verifies that it is possible to have two different backend samplers,
+// one sequence that uses the backend dist sampler and another that uses the CPU dist sampler.
+static void test_backend_mixed_sampling(const char * model_path) {
+    test_model_context test_ctx;
+
+    struct llama_sampler_chain_params chain_params_0 = llama_sampler_chain_default_params();
+    struct llama_sampler * sampler_chain_0 = llama_sampler_chain_init(chain_params_0);
+    llama_sampler_chain_add(sampler_chain_0, llama_sampler_backend_init_dist(88));
+
+    int k = 40;
+    struct llama_sampler_chain_params chain_params_1 = llama_sampler_chain_default_params();
+    struct llama_sampler * sampler_chain_1 = llama_sampler_chain_init(chain_params_1);
+    llama_sampler_chain_add(sampler_chain_1, llama_sampler_backend_init_top_k(k));
+
+    std::vector<llama_sampler_seq_config> backend_sampler_configs = {
+        { 0, sampler_chain_0 },
+        { 1, sampler_chain_1 }
+    };
+
+    if (!test_ctx.setup(model_path, backend_sampler_configs)) {
+        return;
+    }
+
+    std::map<llama_seq_id, std::string> prompts = {
+        {0, "Hello"},
+        {1, "Some"}
+    };
+
+    if (!test_ctx.decode(prompts)) {
+        return;
+    }
+
+    // Verify sequence 0, which used the dist backend sampler.
+    {
+        int32_t batch_idx = test_ctx.idx_for_seq(0);
+        llama_token token = llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx);
+        const std::string token_str = test_ctx.token_to_piece(token, false);
+        printf("sampled token id=%d, string='%s'\n", token, token_str.c_str());
+        GGML_ASSERT(token >= 0 && token < test_ctx.n_vocab);
+        GGML_ASSERT(llama_get_backend_sampled_logits_ith(test_ctx.ctx, batch_idx) == nullptr);
+        GGML_ASSERT(llama_get_backend_sampled_logits_count_ith(test_ctx.ctx, batch_idx) == 0);
+    }
+
+    // Verify sequence 1, which used the top-k backend sampler.
+    {
+        int32_t batch_idx = test_ctx.idx_for_seq(1);
+        float * logits = llama_get_backend_sampled_logits_ith(test_ctx.ctx, batch_idx);
+        GGML_ASSERT(logits != nullptr);
+        size_t n_logits = llama_get_backend_sampled_logits_count_ith(test_ctx.ctx, batch_idx);
+        GGML_ASSERT(n_logits == (size_t) k);
+        GGML_ASSERT(llama_get_backend_sampled_token_ith(test_ctx.ctx, batch_idx) == LLAMA_TOKEN_NULL);
+    }
+
+    printf("backend mixed sampling test PASSED\n");
+}
+
 static void test_backend_set_sampler(const char * model_path) {
     test_model_context test_ctx;
 
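To connect this test to the candidate-ids comment in the first hunk: a hedged sketch (not in this commit) of how sequence 1's k logits could then be finished on the CPU; the `ids` array mapping logit index to vocab token id is assumed to come from the copied candidate token ids, and `cpu_dist` could be created with llama_sampler_init_dist(seed):

// sketch only: build candidates from the backend's top-k logits and let a
// CPU dist sampler pick the final token. `ids` (index -> vocab id) is assumed.
static llama_token cpu_finish_sampling(struct llama_sampler * cpu_dist,
                                       const llama_token * ids,
                                       const float * logits, size_t n_logits) {
    std::vector<llama_token_data> cand;
    cand.reserve(n_logits);
    for (size_t i = 0; i < n_logits; ++i) {
        cand.push_back({ ids[i], logits[i], 0.0f }); // id, logit, prob (unset)
    }
    llama_token_data_array cur_p = { cand.data(), cand.size(), /*selected*/ -1, /*sorted*/ false };
    llama_sampler_apply(cpu_dist, &cur_p);
    return cur_p.data[cur_p.selected].id;
}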
@@ -695,6 +751,7 @@ static const backend_test_case BACKEND_TESTS[] = {
     { "dist_and_cpu", test_backend_dist_sampling_and_cpu, true },
     { "set_sampler", test_backend_set_sampler, true },
     { "max_outputs", test_backend_max_outputs, true },
+    { "mixed", test_backend_mixed_sampling, true },
 };
 
 struct backend_cli_args {
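The `backend_test_case` type is not shown in this diff; judging from the brace initializers above it is presumably along these lines (field names and the meaning of the trailing flag are assumptions):

// assumed shape, inferred from the initializer lists above
struct backend_test_case {
    const char * name;                    // name used to select the test
    void (*fn)(const char * model_path);  // test entry point
    bool enabled;                         // assumed meaning of the trailing flag
};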