update `power-law` -> `adaptive-p`

This commit is contained in:
ddh0 2025-12-27 02:10:20 -06:00
parent 90f3bfbe96
commit b95b0884dd
6 changed files with 102 additions and 114 deletions

View File

@ -1597,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--power-law-target"}, "N",
string_format("power law sampler: select tokens near this probability (valid range 0.0 "
"to 1.0; <0 = disabled) (default: %.2f)\n"
{"--adaptive-target"}, "N",
string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
"to 1.0; negative = disabled) (default: %.2f)\n"
"[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)",
(double)params.sampling.power_law_target),
(double)params.sampling.adaptive_target),
[](common_params & params, const std::string & value) {
params.sampling.power_law_target = std::stof(value);
params.sampling.adaptive_target = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--power-law-decay"}, "N",
string_format("decay rate for target adaptation over time. lower values -> faster but less stable adaptation.\n"
"(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay),
{"--adaptive-decay"}, "N",
string_format("adaptive-p: decay rate for target adaptation over time. lower values "
"are more reactive, higher values are more stable.\n"
"(valid range 0.0 to 0.99) (default: %.2f)",
(double)params.sampling.adaptive_decay),
[](common_params & params, const std::string & value) {
params.sampling.power_law_decay = std::stof(value);
params.sampling.adaptive_decay = std::stof(value);
}
).set_sparam());
add_opt(common_arg(

View File

@ -117,7 +117,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
COMMON_SAMPLER_TYPE_POWER_LAW = 12,
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
};
// dimensionality reduction methods, used by cvector-generator
@ -185,8 +185,8 @@ struct common_params_sampling {
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled)
float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation)
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float top_n_sigma = -1.00f; // -1.0 = disabled
float mirostat_tau = 5.00f; // target entropy

View File

@ -150,11 +150,11 @@ std::string common_params_sampling::print() const {
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay);
mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
return std::string(result);
}
@ -237,7 +237,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
if (params.mirostat == 0) {
bool use_power_law = false;
bool use_adaptive_p = false; // see below
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
@ -278,20 +278,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_PENALTIES:
samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
case COMMON_SAMPLER_TYPE_POWER_LAW:
// the `power_law` sampler is like `dist` in that it selects a single token,
// so we will add `dist` at the end of the chain by default, unless the user
// specifically included `power_law`. we set this flag here so we know to add
// it at the very end.
use_power_law = true;
case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
// the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
// a single token, so we will add `dist` at the end of the chain by default,
// unless the user specifically included `adaptive-p`. we set this flag here
// so we know to add the sampler at the very end.
use_adaptive_p = true;
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
if (use_power_law) {
// only if user explicitly included power_law sampler
samplers.push_back(llama_sampler_init_power_law(params.power_law_target, params.power_law_decay, params.seed));
if (use_adaptive_p) {
// only if user explicitly included adaptive-p sampler
samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
} else {
// default: sample from distribution
samplers.push_back(llama_sampler_init_dist(params.seed));
@ -581,7 +581,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w';
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a';
default : return '?';
}
}
@ -598,7 +598,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
case COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law";
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p";
default : return "";
}
}
@ -615,7 +615,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "power_law", COMMON_SAMPLER_TYPE_POWER_LAW },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
// since sampler names are written multiple ways
@ -631,7 +631,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "power-law", COMMON_SAMPLER_TYPE_POWER_LAW },
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
std::vector<common_sampler_type> samplers;

View File

@ -1304,25 +1304,28 @@ extern "C" {
const char ** seq_breakers,
size_t num_breakers);
/// power-law
/// adaptive-p: select tokens near a configurable target probability over time.
///
/// this sampler implements a power law probability transformation with adaptive
/// target tracking. it reshapes token probability distributions to favor tokens near a
/// configurable target probability, rather than always selecting from the highest probability
/// candidates.
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
/// that fall near a user-configurable probability target.
///
/// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID
/// rather than just transforming logits. therefore it must always be the last sampler in the
/// sampler chain.
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
/// adapted target probability at each sampling step, thus maintaining the desired target
/// probability over time.
///
/// minimal truncation before this sampler is recommended.
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
/// in the sampler chain (like mirostat, dist, greedy).
///
/// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled)
/// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation)
/// only mild truncation before this sampler is recommended. we suggest applying min-p
/// before adaptive-p as the only other active sampler in the chain.
///
/// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl)
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR)
LLAMA_API struct llama_sampler * llama_sampler_init_power_law(
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
/// @param seed RNG seed
///
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed);

View File

@ -2329,61 +2329,39 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
return result;
}
// power-law
// adaptive-p sampler state
//
// this sampler implements a power law probability transformation with adaptive
// target tracking. it reshapes token probability distributions to favor tokens near a
// configurable target probability, rather than always selecting from the highest probability
// candidates.
// maintains an exponential moving average of the *ORIGINAL* probabilities
// of selected tokens, used to compute an adapted target at each sampling step.
//
// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID
// rather than just transforming logits. therefore it must always be the last sampler in the
// sampler chain.
//
// minimal truncation before this sampler is recommended.
//
// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl)
// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR)
struct llama_sampler_power_law {
// the desired average probability for selected tokens (0.0 to 1.0)
// higher values favor more probable tokens (more stable and predictable)
// lower values favor less probable tokens (more creative)
// negative values disable Power Law sampling (sample from distribution as-is)
const float target;
// controls how quickly history influence fades (0.0 to 0.99)
// lower values = faster adaptation, more reactive to recent tokens
// higher values = slower adaptation, more stable over time
// effective history length ≈ 1/(1-decay) tokens
// example: decay=0.5 --> ~2 tokens; decay=0.9 --> ~10 tokens; decay=0.95 --> ~20 tokens
// internally clamped to <= 0.99 to prevent unbounded accumulation
const float decay;
const uint32_t seed;
std::mt19937 rng;
// member variables
float weighted_sum; // historical token probabilities weighted by recency
float total_weight; // sum of weights, converges to 1/(1-decay)
std::vector<float> original_probs; // used to store original token probabilities
// see llama.h for a full description of the sampler
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
struct llama_sampler_adaptive_p {
const float target; // target probability (0.0 - 1.0; negative = disabled)
const float decay; // EMA decay; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
const uint32_t seed; // RNG seed
std::mt19937 rng; // RNG
float weighted_sum; // sum(p_i * decay^i)
float total_weight; // sum(decay^i), converges to 1/(1-decay)
std::vector<float> original_probs; // pre-transform probs, cached for EMA update
};
// transformation constants
// adaptive probability transformation constants
static constexpr float DISTRIBUTION_WIDTH = 0.3f;
static constexpr float PEAK_LOGIT_VALUE = 5.0f;
static constexpr float SHARPNESS = 4.0f;
static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) {
return "power-law";
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
return "adaptive-p";
}
static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
if (ctx->target < 0.0f) {
// no-op: just sample from the distribution as-is
// at negative target values, adaptive-p is no-op
// we simply sample from the existing distribution
llama_sampler_softmax_impl(cur_p, false);
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
return;
@ -2397,38 +2375,43 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
}
// compute the adapted target probability for the current sampling step
float computed_target = std::clamp(
ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight),
auto target = std::clamp(ctx->target, 0.0f, 1.0f);
float adapted_target = std::clamp(
ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
0.0f, 1.0f
);
// power law transform
// adaptive probability transform
//
// quadratic near target for fine differentiation, transitioning to linear decay in the
// tails. unbounded negative logits ensure proper suppression of far-from-target tokens
// after the softmax.
//
for (size_t i = 0; i < cur_p->size; ++i) {
float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH;
cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist);
float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
}
// softmax and sample from the transformed distribution
llama_sampler_softmax_impl(cur_p, false);
// sample from transformed distribution
const int idx = llama_sample_dist(cur_p, ctx->rng);
cur_p->selected = idx;
// update running history with the original probability of the selected token
ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; // history fades over time
// update history with the original probability of the selected token
ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum;
ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
}
static void llama_sampler_power_law_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
ctx->weighted_sum = 0.0f;
ctx->total_weight = 0.0f;
}
static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_power_law *) smpl->ctx;
auto * result = llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed);
auto * result_ctx = (llama_sampler_power_law *) result->ctx;
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
result_ctx->rng = ctx->rng;
result_ctx->weighted_sum = ctx->weighted_sum;
@ -2438,29 +2421,29 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s
return result;
}
static void llama_sampler_power_law_free(struct llama_sampler * smpl) {
delete (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
delete (llama_sampler_adaptive_p *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_power_law_i = {
/* .name = */ llama_sampler_power_law_name,
static struct llama_sampler_i llama_sampler_adaptive_p_i = {
/* .name = */ llama_sampler_adaptive_p_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_power_law_apply,
/* .reset = */ llama_sampler_power_law_reset,
/* .clone = */ llama_sampler_power_law_clone,
/* .free = */ llama_sampler_power_law_free,
/* .apply = */ llama_sampler_adaptive_p_apply,
/* .reset = */ llama_sampler_adaptive_p_reset,
/* .clone = */ llama_sampler_adaptive_p_clone,
/* .free = */ llama_sampler_adaptive_p_free,
};
struct llama_sampler * llama_sampler_init_power_law(
struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed
) {
auto seed_cur = get_rng_seed(seed);
return llama_sampler_init(
/* .iface = */ &llama_sampler_power_law_i,
/* .ctx = */ new llama_sampler_power_law {
/* .target = */ std::clamp(target, 0.0f, 1.0f),
/* .iface = */ &llama_sampler_adaptive_p_i,
/* .ctx = */ new llama_sampler_adaptive_p {
/* .target = */ target,
/* .decay = */ std::clamp(decay, 0.0f, 0.99f),
/* .seed = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),

View File

@ -201,8 +201,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target);
params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay);
params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);