Fix backend_top_p_sampler

Applying softmax to values that are already softmax probabilities yields a nearly uniform distribution, so the sampler should return the logits rather than the softmax output.
2025-12-01 12:07:30 +01:00 · 2025-12-01 12:07:30 +01:00 · 8bee483c97
parent cf0e1475c5
commit 8bee483c97
1 changed file with 34 additions and 32 deletions
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -1107,46 +1107,44 @@ static void llama_sampler_top_p_backend_apply(
        struct llama_sampler_data * data) {
    auto * sctx = (llama_sampler_top_p *) smpl->ctx;

-    struct ggml_tensor * softmax = ggml_soft_max(ctx, data->logits);
-    ggml_set_name(softmax, "top_p_softmax");
-
-    // Get the sorted indices of the softmax probabilities in descending order.
-    struct ggml_tensor * sorted_idx = ggml_argsort(ctx, softmax, GGML_SORT_ORDER_DESC);
+    // Get the sorted logits in descending order.
+    struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC);
    ggml_set_name(sorted_idx, "top_p_sorted_idx");

    // Do the sorting via reshape + get_rows
-    struct ggml_tensor * softmax_reshaped = ggml_reshape_2d(ctx, softmax, 1, softmax->ne[0]);
-    ggml_set_name(softmax_reshaped, "top_p_softmax_reshaped");
+    struct ggml_tensor * logits_reshaped = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+    struct ggml_tensor * sorted_logits_reshaped = ggml_get_rows(ctx, logits_reshaped, sorted_idx);
+    struct ggml_tensor * sorted_logits = ggml_reshape_1d(ctx, sorted_logits_reshaped, data->logits->ne[0]);
+    ggml_set_name(sorted_logits, "top_p_sorted_logits");

-    struct ggml_tensor * sorted_probs = ggml_get_rows(ctx, softmax_reshaped, sorted_idx);
-    ggml_set_name(sorted_probs, "top_p_sorted_probs");
+    struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits);
+    ggml_set_name(softmax, "top_p_softmax");
+
+    // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates.
+    if (data->candidates != nullptr) {
+        struct ggml_tensor * candidates_reshaped = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
+        ggml_set_name(candidates_reshaped, "top_p_candidates_reshaped");
+
+        struct ggml_tensor * sorted_candidates = ggml_get_rows(ctx, candidates_reshaped, sorted_idx);
+        ggml_set_name(sorted_candidates, "top_p_sorted_candidates");
+
+        data->candidates = ggml_reshape_1d(ctx, sorted_candidates, data->candidates->ne[0]);
+        ggml_set_name(data->candidates, "top_p_candidates");
+    } else {
+        data->candidates = sorted_idx;
+        ggml_set_name(data->candidates, "top_p_candidates");
+    }

-    struct ggml_tensor * sorted_probs_reshaped = ggml_reshape_2d(ctx, sorted_probs, softmax->ne[0], 1);
-    ggml_set_name(sorted_probs_reshaped, "top_p_sorted_probs_reshaped");
    // Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM.
-    struct ggml_tensor * sorted_cdf = ggml_cumsum(ctx, sorted_probs_reshaped);
-    ggml_set_name(sorted_cdf, "top_p_sorted_cdf");
+    struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax);
+    ggml_set_name(cdf, "top_p_cdf");

+    // TODO: Make it inclusive of probability p
    // Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep
-    struct ggml_tensor * sorted_cdf_scaled = ggml_scale_bias(ctx, sorted_cdf, -1.0f, sctx->p);
-    ggml_set_name(sorted_cdf_scaled, "top_p_sorted_cdf_scaled");
+    struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p);
+    ggml_set_name(cdf_scaled, "top_p_cdf_scaled");

-    struct ggml_tensor * sorted_mask = ggml_step(ctx, sorted_cdf_scaled);
-    ggml_set_name(sorted_mask, "top_p_sorted_mask");
-
-    // reverse sorting by argsort(argsort)
-    // cast to F32 since cuda only supports float inputs
-    struct ggml_tensor * reverse_argsort = ggml_argsort(ctx, ggml_cast(ctx, sorted_idx, GGML_TYPE_F32), GGML_SORT_ORDER_ASC);
-    ggml_set_name(reverse_argsort, "top_p_reverse_argsort");
-
-    // Do the sorting via reshape + get_rows
-    struct ggml_tensor * sorted_reshaped_mask = ggml_reshape_2d(ctx, sorted_mask, 1, sorted_mask->ne[0]);
-    ggml_set_name(sorted_reshaped_mask, "top_p_sorted_reshaped_mask");
-
-    struct ggml_tensor * reshaped_mask = ggml_get_rows(ctx, sorted_reshaped_mask, reverse_argsort);
-    ggml_set_name(reshaped_mask, "top_p_reshaped_mask");
-
-    struct ggml_tensor * mask = ggml_reshape_2d(ctx, reshaped_mask, sorted_mask->ne[0], 1);
+    struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled);
    ggml_set_name(mask, "top_p_mask");

    // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
@@ -1157,9 +1155,13 @@ static void llama_sampler_top_p_backend_apply(
    struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
    ggml_set_name(top_p_bias, "top_p_bias");

-    data->logits = ggml_add(ctx, data->logits, top_p_bias);
+    data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
    ggml_set_name(data->logits, "top_p_logits");

+    ggml_set_output(data->candidates);
+    ggml_build_forward_expand(gf, data->candidates);
+    
+    ggml_set_output(data->logits);
    ggml_build_forward_expand(gf, data->logits);
 }