update `power-law` -> `adaptive-p`

This commit is contained in:
ddh0 2025-12-27 02:10:20 -06:00
parent 90f3bfbe96
commit b95b0884dd
6 changed files with 102 additions and 114 deletions

View File

@ -1597,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_sparam());
add_opt(common_arg(
{"--power-law-target"}, "N",
string_format("power law sampler: select tokens near this probability (valid range 0.0 "
"to 1.0; <0 = disabled) (default: %.2f)\n"
{"--adaptive-target"}, "N",
string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
"to 1.0; negative = disabled) (default: %.2f)\n"
"[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)",
(double)params.sampling.power_law_target),
(double)params.sampling.adaptive_target),
[](common_params & params, const std::string & value) {
params.sampling.power_law_target = std::stof(value);
params.sampling.adaptive_target = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--power-law-decay"}, "N",
string_format("decay rate for target adaptation over time. lower values -> faster but less stable adaptation.\n"
"(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay),
{"--adaptive-decay"}, "N",
string_format("adaptive-p: decay rate for target adaptation over time. lower values "
"are more reactive, higher values are more stable.\n"
"(valid range 0.0 to 0.99) (default: %.2f)",
(double)params.sampling.adaptive_decay),
[](common_params & params, const std::string & value) {
params.sampling.power_law_decay = std::stof(value);
params.sampling.adaptive_decay = std::stof(value);
}
).set_sparam());
add_opt(common_arg(

View File

@ -117,7 +117,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
COMMON_SAMPLER_TYPE_POWER_LAW = 12,
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
};
// dimensionality reduction methods, used by cvector-generator
@ -185,8 +185,8 @@ struct common_params_sampling {
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled)
float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation)
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float top_n_sigma = -1.00f; // -1.0 = disabled
float mirostat_tau = 5.00f; // target entropy

View File

@ -150,11 +150,11 @@ std::string common_params_sampling::print() const {
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f",
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay);
mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay);
return std::string(result);
}
@ -237,7 +237,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
if (params.mirostat == 0) {
bool use_power_law = false;
bool use_adaptive_p = false; // see below
for (const auto & cnstr : params.samplers) {
switch (cnstr) {
@ -278,20 +278,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_PENALTIES:
samplers.push_back(llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
break;
case COMMON_SAMPLER_TYPE_POWER_LAW:
// the `power_law` sampler is like `dist` in that it selects a single token,
// so we will add `dist` at the end of the chain by default, unless the user
// specifically included `power_law`. we set this flag here so we know to add
// it at the very end.
use_power_law = true;
case COMMON_SAMPLER_TYPE_ADAPTIVE_P:
// the `adaptive-p` sampler is like `dist` and `mirostat` in that it selects
// a single token, so we will add `dist` at the end of the chain by default,
// unless the user specifically included `adaptive-p`. we set this flag here
// so we know to add the sampler at the very end.
use_adaptive_p = true;
break;
default:
GGML_ASSERT(false && "unknown sampler type");
}
}
if (use_power_law) {
// only if user explicitly included power_law sampler
samplers.push_back(llama_sampler_init_power_law(params.power_law_target, params.power_law_decay, params.seed));
if (use_adaptive_p) {
// only if user explicitly included adaptive-p sampler
samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed));
} else {
// default: sample from distribution
samplers.push_back(llama_sampler_init_dist(params.seed));
@ -581,7 +581,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w';
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return 'a';
default : return '?';
}
}
@ -598,7 +598,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
case COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law";
case COMMON_SAMPLER_TYPE_ADAPTIVE_P: return "adaptive_p";
default : return "";
}
}
@ -615,7 +615,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
{ "power_law", COMMON_SAMPLER_TYPE_POWER_LAW },
{ "adaptive_p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
// since sampler names are written multiple ways
@ -631,7 +631,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "power-law", COMMON_SAMPLER_TYPE_POWER_LAW },
{ "adaptive-p", COMMON_SAMPLER_TYPE_ADAPTIVE_P },
};
std::vector<common_sampler_type> samplers;

View File

@ -1304,25 +1304,28 @@ extern "C" {
const char ** seq_breakers,
size_t num_breakers);
/// power-law
/// adaptive-p: select tokens near a configurable target probability over time.
///
/// this sampler implements a power law probability transformation with adaptive
/// target tracking. it reshapes token probability distributions to favor tokens near a
/// configurable target probability, rather than always selecting from the highest probability
/// candidates.
/// the adaptive-p sampler transforms the token probability distribution to favor tokens
/// that fall near a user-configurable probability target.
///
/// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID
/// rather than just transforming logits. therefore it must always be the last sampler in the
/// sampler chain.
/// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
/// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
/// adapted target probability at each sampling step, thus maintaining the desired target
/// probability over time.
///
/// minimal truncation before this sampler is recommended.
/// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
/// in the sampler chain (like mirostat, dist, greedy).
///
/// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled)
/// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation)
/// only mild truncation before this sampler is recommended. we suggest applying min-p
/// before adaptive-p as the only other active sampler in the chain.
///
/// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl)
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR)
LLAMA_API struct llama_sampler * llama_sampler_init_power_law(
/// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
/// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
/// @param seed RNG seed
///
/// ref: https://github.com/ggml-org/llama.cpp/pull/17927
LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed);

View File

@ -2329,61 +2329,39 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
return result;
}
// power-law
// adaptive-p sampler state
//
// this sampler implements a power law probability transformation with adaptive
// target tracking. it reshapes token probability distributions to favor tokens near a
// configurable target probability, rather than always selecting from the highest probability
// candidates.
// maintains an exponential moving average of the *ORIGINAL* probabilities
// of selected tokens, used to compute an adapted target at each sampling step.
//
// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID
// rather than just transforming logits. therefore it must always be the last sampler in the
// sampler chain.
//
// minimal truncation before this sampler is recommended.
//
// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl)
// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR)
struct llama_sampler_power_law {
// the desired average probability for selected tokens (0.0 to 1.0)
// higher values favor more probable tokens (more stable and predictable)
// lower values favor less probable tokens (more creative)
// negative values disable Power Law sampling (sample from distribution as-is)
const float target;
// controls how quickly history influence fades (0.0 to 0.99)
// lower values = faster adaptation, more reactive to recent tokens
// higher values = slower adaptation, more stable over time
// effective history length ≈ 1/(1-decay) tokens
// example: decay=0.5 --> ~2 tokens; decay=0.9 --> ~10 tokens; decay=0.95 --> ~20 tokens
// internally clamped to <= 0.99 to prevent unbounded accumulation
const float decay;
const uint32_t seed;
std::mt19937 rng;
// member variables
float weighted_sum; // historical token probabilities weighted by recency
float total_weight; // sum of weights, converges to 1/(1-decay)
std::vector<float> original_probs; // used to store original token probabilities
// see llama.h for a full description of the sampler
// ref: https://github.com/ggml-org/llama.cpp/pull/17927
struct llama_sampler_adaptive_p {
const float target; // target probability (0.0 - 1.0; negative = disabled)
const float decay; // EMA decay; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
const uint32_t seed; // RNG seed
std::mt19937 rng; // RNG
float weighted_sum; // sum(p_i * decay^i)
float total_weight; // sum(decay^i), converges to 1/(1-decay)
std::vector<float> original_probs; // pre-transform probs, cached for EMA update
};
// transformation constants
// adaptive probability transformation constants
static constexpr float DISTRIBUTION_WIDTH = 0.3f;
static constexpr float PEAK_LOGIT_VALUE = 5.0f;
static constexpr float SHARPNESS = 4.0f;
static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH;
static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) {
return "power-law";
static const char * llama_sampler_adaptive_p_name(const struct llama_sampler * /*smpl*/) {
return "adaptive-p";
}
static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
if (ctx->target < 0.0f) {
// no-op: just sample from the distribution as-is
// at negative target values, adaptive-p is no-op
// we simply sample from the existing distribution
llama_sampler_softmax_impl(cur_p, false);
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
return;
@ -2397,38 +2375,43 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
}
// compute the adapted target probability for the current sampling step
float computed_target = std::clamp(
ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight),
auto target = std::clamp(ctx->target, 0.0f, 1.0f);
float adapted_target = std::clamp(
ctx->total_weight == 0.0f ? target : 2.0f * target - (ctx->weighted_sum / ctx->total_weight),
0.0f, 1.0f
);
// power law transform
// adaptive probability transform
//
// quadratic near target for fine differentiation, transitioning to linear decay in the
// tails. unbounded negative logits ensure proper suppression of far-from-target tokens
// after the softmax.
//
for (size_t i = 0; i < cur_p->size; ++i) {
float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH;
cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist);
float dist = std::abs((cur_p->data[i].p - adapted_target) * INV_WIDTH);
cur_p->data[i].logit = PEAK_LOGIT_VALUE - SHARPNESS * dist * dist / (1.0f + dist);
}
// softmax and sample from the transformed distribution
llama_sampler_softmax_impl(cur_p, false);
// sample from transformed distribution
const int idx = llama_sample_dist(cur_p, ctx->rng);
cur_p->selected = idx;
// update running history with the original probability of the selected token
ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum; // history fades over time
// update history with the original probability of the selected token
ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum;
ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
}
static void llama_sampler_power_law_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_reset(struct llama_sampler * smpl) {
auto * ctx = (llama_sampler_adaptive_p *) smpl->ctx;
ctx->weighted_sum = 0.0f;
ctx->total_weight = 0.0f;
}
static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_power_law *) smpl->ctx;
auto * result = llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed);
auto * result_ctx = (llama_sampler_power_law *) result->ctx;
static struct llama_sampler * llama_sampler_adaptive_p_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_adaptive_p *) smpl->ctx;
auto * result = llama_sampler_init_adaptive_p(ctx->target, ctx->decay, ctx->seed);
auto * result_ctx = (llama_sampler_adaptive_p *) result->ctx;
result_ctx->rng = ctx->rng;
result_ctx->weighted_sum = ctx->weighted_sum;
@ -2438,29 +2421,29 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s
return result;
}
static void llama_sampler_power_law_free(struct llama_sampler * smpl) {
delete (llama_sampler_power_law *) smpl->ctx;
static void llama_sampler_adaptive_p_free(struct llama_sampler * smpl) {
delete (llama_sampler_adaptive_p *) smpl->ctx;
}
static struct llama_sampler_i llama_sampler_power_law_i = {
/* .name = */ llama_sampler_power_law_name,
static struct llama_sampler_i llama_sampler_adaptive_p_i = {
/* .name = */ llama_sampler_adaptive_p_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_power_law_apply,
/* .reset = */ llama_sampler_power_law_reset,
/* .clone = */ llama_sampler_power_law_clone,
/* .free = */ llama_sampler_power_law_free,
/* .apply = */ llama_sampler_adaptive_p_apply,
/* .reset = */ llama_sampler_adaptive_p_reset,
/* .clone = */ llama_sampler_adaptive_p_clone,
/* .free = */ llama_sampler_adaptive_p_free,
};
struct llama_sampler * llama_sampler_init_power_law(
struct llama_sampler * llama_sampler_init_adaptive_p(
float target,
float decay,
uint32_t seed
) {
auto seed_cur = get_rng_seed(seed);
return llama_sampler_init(
/* .iface = */ &llama_sampler_power_law_i,
/* .ctx = */ new llama_sampler_power_law {
/* .target = */ std::clamp(target, 0.0f, 1.0f),
/* .iface = */ &llama_sampler_adaptive_p_i,
/* .ctx = */ new llama_sampler_adaptive_p {
/* .target = */ target,
/* .decay = */ std::clamp(decay, 0.0f, 0.99f),
/* .seed = */ seed_cur,
/* .rng = */ std::mt19937(seed_cur),

View File

@ -201,8 +201,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target);
params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay);
params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target);
params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay);
params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);