From b95b0884ddd74a46b8aa98c5edf38f194d9515ab Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 27 Dec 2025 02:10:20 -0600 Subject: [PATCH] update `power-law` -> `adaptive-p` --- common/arg.cpp | 20 +++--- common/common.h | 6 +- common/sampling.cpp | 32 ++++----- include/llama.h | 31 +++++---- src/llama-sampling.cpp | 123 +++++++++++++++-------------------- tools/server/server-task.cpp | 4 +- 6 files changed, 102 insertions(+), 114 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1ef8d70548..87438d8d09 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1597,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--power-law-target"}, "N", - string_format("power law sampler: select tokens near this probability (valid range 0.0 " - "to 1.0; <0 = disabled) (default: %.2f)\n" + {"--adaptive-target"}, "N", + string_format("adaptive-p: select tokens near this probability (valid range 0.0 " + "to 1.0; negative = disabled) (default: %.2f)\n" "[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)", - (double)params.sampling.power_law_target), + (double)params.sampling.adaptive_target), [](common_params & params, const std::string & value) { - params.sampling.power_law_target = std::stof(value); + params.sampling.adaptive_target = std::stof(value); } ).set_sparam()); add_opt(common_arg( - {"--power-law-decay"}, "N", - string_format("decay rate for target adaptation over time. lower values -> faster but less stable adaptation.\n" - "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay), + {"--adaptive-decay"}, "N", + string_format("adaptive-p: decay rate for target adaptation over time. lower values " + "are more reactive, higher values are more stable.\n" + "(valid range 0.0 to 0.99) (default: %.2f)", + (double)params.sampling.adaptive_decay), [](common_params & params, const std::string & value) { - params.sampling.power_law_decay = std::stof(value); + params.sampling.adaptive_decay = std::stof(value); } ).set_sparam()); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index 66e738e30a..2145f4f4c2 100644 --- a/common/common.h +++ b/common/common.h @@ -117,7 +117,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, - COMMON_SAMPLER_TYPE_POWER_LAW = 12, + COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12, }; // dimensionality reduction methods, used by cvector-generator @@ -185,8 +185,8 @@ struct common_params_sampling { float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) + float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy diff --git a/common/sampling.cpp b/common/sampling.cpp index 4c95450a73..140404f12f 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -150,11 +150,11 @@ std::string common_params_sampling::print() const { "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, - mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay); + mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay); return std::string(result); } @@ -237,7 +237,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (params.mirostat == 0) { - bool use_power_law = false; + bool use_adaptive_p = false; // see below for (const auto & cnstr : params.samplers) { switch (cnstr) { @@ -278,20 +278,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_PENALTIES: samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; - case COMMON_SAMPLER_TYPE_POWER_LAW: - // the `power_law` sampler is like `dist` in that it selects a single token, - // so we will add `dist` at the end of the chain by default, unless the user - // specifically included `power_law`. we set this flag here so we know to add - // it at the very end. - use_power_law = true; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: + // the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects + // a single token, so we will add `dist` at the end of the chain by default, + // unless the user specifically included `adaptive-p`. we set this flag here + // so we know to add the sampler at the very end. 
+ use_adaptive_p = true; break; default: GGML_ASSERT(false && "unknown sampler type"); } } - if (use_power_law) { - // only if user explicitly included power_law sampler - samplers.push_back(llama_sampler_init_power_law(params.power_law_target, params.power_law_decay, params.seed)); + if (use_adaptive_p) { + // only if user explicitly included adaptive-p sampler + samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed)); } else { // default: sample from distribution samplers.push_back(llama_sampler_init_dist(params.seed)); @@ -581,7 +581,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; - case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w'; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a'; default : return '?'; } } @@ -598,7 +598,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; - case COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law"; + case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p"; default : return ""; } } @@ -615,7 +615,7 @@ std::vector common_sampler_types_from_names(const std::vect { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, - { "power_law", COMMON_SAMPLER_TYPE_POWER_LAW }, + { "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }, }; // since samplers names are written multiple ways @@ -631,7 +631,7 @@ std::vector common_sampler_types_from_names(const std::vect { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, - { "power-law", COMMON_SAMPLER_TYPE_POWER_LAW }, + { "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P }, }; std::vector samplers; diff --git a/include/llama.h b/include/llama.h index f903d34a56..5e8974c94f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1304,25 +1304,28 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// power-law + /// adaptive-p: select tokens near a configurable target probability over time. /// - /// this sampler implements a power law probability transformation with adaptive - /// target tracking. it reshapes token probability distributions to favor tokens near a - /// configurable target probability, rather than always selecting from the highest probability - /// candidates. + /// the adaptive-p sampler transforms the token probability distribution to favor tokens + /// that fall near a user-configurable probability target. /// - /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID - /// rather than just transforming logits. therefore it must always be the last sampler in the - /// sampler chain. + /// internally, the sampler maintains an exponential moving average of the *ORIGINAL* + /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an + /// adapted target probability at each sampling step, thus maintaining the desired target + /// probability over time. /// - /// minimal truncation before this sampler is recommended. + /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last + /// in the sampler chain (like mirostat, dist, greedy). 
/// - /// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - /// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + /// only mild truncation before this sampler is recommended. we suggest applying min-p + /// before adaptive-p as the only other active sampler in the chain. /// - /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) - /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) - LLAMA_API struct llama_sampler * llama_sampler_init_power_law( + /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) + /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) + /// @param seed RNG seed + /// + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 + LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, uint32_t seed); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index a4b03193dd..5a823ca457 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2329,61 +2329,39 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa return result; } -// power-law +// adaptive-p sampler state // -// this sampler implements a power law probability transformation with adaptive -// target tracking. it reshapes token probability distributions to favor tokens near a -// configurable target probability, rather than always selecting from the highest probability -// candidates. +// maintains an exponential moving average of the *ORIGINAL* probabilities +// of selected tokens, used to compute an adapted target at each sampling step. // -// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID -// rather than just transforming logits. therefore it must always be the last sampler in the -// sampler chain. -// -// minimal truncation before this sampler is recommended. 
-// -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) -// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) - -struct llama_sampler_power_law { - - // the desired average probability for selected tokens (0.0 to 1.0) - // higher values favor more probable tokens (more stable and predictable) - // lower values favor less probable tokens (more creative) - // negative values disable Power Law sampling (sample from distribution as-is) - const float target; - - // controls how quickly history influence fades (0.0 to 0.99) - // lower values = faster adaptation, more reactive to recent tokens - // higher values = slower adaptation, more stable over time - // effective history length ≈ 1/(1-decay) tokens - // example: decay=0.5 --> ~2 tokens; decay=0.9 --> ~10 tokens; decay=0.95 --> ~20 tokens - // internally clamped to <= 0.99 to prevent unbounded accumulation - const float decay; - - const uint32_t seed; - std::mt19937 rng; - - // member variables - float weighted_sum; // historical token probabilities weighted by recency - float total_weight; // sum of weights, converges to 1/(1-decay) - std::vector original_probs; // used to store original token probabilities +// see llama.h for a full description of the sampler +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +struct llama_sampler_adaptive_p { + const float target; // target probability (0.0 - 1.0; negative = disabled) + const float decay; // EMA decay; history ≈ 1/(1-decay) tokens (0.0 - 0.99) + const uint32_t seed; // RNG seed + std::mt19937 rng; // RNG + float weighted_sum; // sum(p_i * decay^i) + float total_weight; // sum(decay^i), converges to 1/(1-decay) + std::vector original_probs; // pre-transform probs, cached for EMA update }; -// transformation constants +// adaptive probability transformation constants static constexpr float DISTRIBUTION_WIDTH = 0.3f; static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float SHARPNESS = 4.0f; static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; -static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { - return "power-law"; +static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) { + return "adaptive-p"; } -static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; if (ctx->target < 0.0f) { - // no-op: just sample from the distribution as-is + // at negative target values, adaptive-p is no-op + // we simply sample from the existing distribution llama_sampler_softmax_impl(cur_p, false); cur_p->selected = llama_sample_dist(cur_p, ctx->rng); return; @@ -2397,38 +2375,43 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } // compute the adapted target probability for the current sampling step - float computed_target = std::clamp( - ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight), + auto target = std::clamp(ctx->target, 0.0f, 1.0f); + float adapted_target = std::clamp( + ctx->total_weight == 0.0f ? 
target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight), 0.0f, 1.0f ); - // power law transform + // adaptive probability transform + // + // quadratic near target for fine differentiation, transitioning to linear decay in the + // tails. unbounded negative logits ensure proper suppression of far-from-target tokens + // after the softmax. + // for (size_t i = 0; i < cur_p->size; ++i) { - float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; - cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); + float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH); + cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist); } + // softmax and sample from the transformed distribution llama_sampler_softmax_impl(cur_p, false); - - // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // update running history with the original probability of the selected token - ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; // history fades over time + // update history with the original probability of the selected token + ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; } -static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx; ctx->weighted_sum = 0.0f; ctx->total_weight = 0.0f; } -static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed); - auto * result_ctx = (llama_sampler_power_law *) result->ctx; +static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx; + auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed); + auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx; result_ctx->rng = ctx->rng; result_ctx->weighted_sum = ctx->weighted_sum; @@ -2438,29 +2421,29 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s return result; } -static void llama_sampler_power_law_free(struct llama_sampler * smpl) { - delete (llama_sampler_power_law *) smpl->ctx; +static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_adaptive_p *) smpl->ctx; } -static struct llama_sampler_i llama_sampler_power_law_i = { - /* .name = */ llama_sampler_power_law_name, +static struct llama_sampler_i llama_sampler_adaptive_p_i = { + /* .name = */ llama_sampler_adaptive_p_name, /* .accept = */ nullptr, - /* .apply = */ llama_sampler_power_law_apply, - /* .reset = */ llama_sampler_power_law_reset, - /* .clone = */ llama_sampler_power_law_clone, - /* .free = */ llama_sampler_power_law_free, + /* .apply = */ llama_sampler_adaptive_p_apply, + /* .reset = */ llama_sampler_adaptive_p_reset, + /* .clone = */ llama_sampler_adaptive_p_clone, + /* .free = */ llama_sampler_adaptive_p_free, }; -struct llama_sampler * llama_sampler_init_power_law( +struct llama_sampler * llama_sampler_init_adaptive_p( float target, float decay, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); return llama_sampler_init( - /* .iface = */ 
&llama_sampler_power_law_i, - /* .ctx = */ new llama_sampler_power_law { - /* .target = */ std::clamp(target, 0.0f, 1.0f), + /* .iface = */ &llama_sampler_adaptive_p_i, + /* .ctx = */ new llama_sampler_adaptive_p { + /* .target = */ target, /* .decay = */ std::clamp(decay, 0.0f, 0.99f), /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 13fa0fdec2..d0b547c235 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -201,8 +201,8 @@ task_params server_task::params_from_json_cmpl( params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); - params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay); + params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target); + params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay); params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
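
As a usage illustration (not part of the patch): the sketch below wires the new sampler into a chain through the public API, following the llama.h comment's suggestion to keep only a mild min-p truncation in front of adaptive-p. It relies only on functions already declared in llama.h (llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_init_min_p, llama_sampler_sample) plus the llama_sampler_init_adaptive_p added here; the 0.05 / 0.60 / 0.90 values are illustrative, not tuned recommendations.

#include "llama.h"

// a minimal chain: light min-p truncation, then adaptive-p as the final,
// token-selecting sampler
static struct llama_sampler * make_adaptive_p_chain(uint32_t seed) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // mild truncation only: adaptive-p wants a mostly intact probability distribution
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, /* min_keep */ 1));

    // adaptive-p picks the token itself, so it is the last sampler in the chain
    // and no llama_sampler_init_dist() is appended after it
    llama_sampler_chain_add(chain, llama_sampler_init_adaptive_p(/* target */ 0.60f,
                                                                 /* decay  */ 0.90f,
                                                                 seed));
    return chain;
}

// usage:
//   llama_token id = llama_sampler_sample(chain, ctx, -1);
//   ...
//   llama_sampler_free(chain);

On the command line, the same chain would be requested with something like --samplers "min_p;adaptive_p" --min-p 0.05 --adaptive-target 0.60 --adaptive-decay 0.90 (values again illustrative), and llama-server reads the same settings from the adaptive_target and adaptive_decay fields of a completion request, as shown in the server-task.cpp hunk above.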
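
For intuition about the math, here is a self-contained sketch of the adaptation and transform logic, mirroring llama_sampler_adaptive_p_apply above; the constants match DISTRIBUTION_WIDTH, PEAK_LOGIT_VALUE and SHARPNESS from the patch, while the adaptive_p_state / adaptive_p_transform / adaptive_p_accept names are invented for this example.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

struct adaptive_p_state {
    float weighted_sum = 0.0f; // original probs of selected tokens, weighted by decay^age
    float total_weight = 0.0f; // sum of decay^age, converges to 1/(1-decay)
};

// rewrite candidate logits so that tokens whose ORIGINAL probability lies near the
// adapted target are favored; `probs` holds the softmaxed original probabilities and
// `target` is assumed to be in [0, 1] (a negative target disables the sampler upstream)
static void adaptive_p_transform(const adaptive_p_state & st, float target,
                                 const std::vector<float> & probs, std::vector<float> & logits) {
    constexpr float WIDTH = 0.3f, PEAK = 5.0f, SHARPNESS = 4.0f;

    // EMA of the original probabilities of previously selected tokens
    const float ema = st.total_weight == 0.0f ? target : st.weighted_sum / st.total_weight;

    // if recent picks ran hotter than the target, aim lower now (and vice versa)
    const float adapted = std::clamp(2.0f * target - ema, 0.0f, 1.0f);

    for (size_t i = 0; i < probs.size(); ++i) {
        const float d = std::fabs(probs[i] - adapted) / WIDTH;
        // quadratic near the target, roughly linear decay in the tails
        logits[i] = PEAK - SHARPNESS * d * d / (1.0f + d);
    }
}

// after a token has been sampled from the transformed distribution, fold its
// ORIGINAL probability into the running history
static void adaptive_p_accept(adaptive_p_state & st, float decay, float p_selected) {
    st.weighted_sum = p_selected + decay * st.weighted_sum;
    st.total_weight = 1.0f       + decay * st.total_weight;
}

With decay = 0.90 the history spans roughly the last 1/(1-0.90) = 10 selected tokens (0.50 gives about 2, 0.95 about 20). To see the reshaping at work: with an adapted target of 0.30, a candidate whose original probability is 0.30 gets the peak logit of 5.0, one at 0.05 gets about 3.48, and one at 0.70 only about 1.95, so tokens near the target are preferred over the raw argmax.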