From 774cf23ee556cac320fb68fd553e78100a8a9855 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 10 Dec 2025 22:13:58 -0600 Subject: [PATCH 01/38] initial commit for branch --- include/llama.h | 9 +++ src/llama-sampling.cpp | 134 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) diff --git a/include/llama.h b/include/llama.h index b52eaacfa7..7e1e65523b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,6 +1289,15 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); + /// @details power law sampler, reshapes probability distribution to target specific probability ranges + /// ref: https://github.com/MrJackSpade/llama.cpp + /// ref: [PR] + LLAMA_API struct llama_sampler * llama_sampler_init_power_law( + float target, // target probability (0.0 to 1.0) + float target_range, // adaptive target range (±range from target) + int32_t queue_size, // rolling history window size for adaptation + uint32_t seed); // RNG seed + LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, int32_t n_logit_bias, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 3f4a729bc3..6ef8121d7c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2313,6 +2313,140 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa return result; } +// power-law +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master +// ref: [PR] + +struct llama_sampler_power_law { + const float target; + const float target_range; + const int32_t queue_size; + const uint32_t seed; + + std::mt19937 rng; + ring_buffer history; +}; + +static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { + return "power-law"; +} + +static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + + // these don't need to be modified or exposed to the user + const float peak_logit_value = 3.0f; + const float tail_heaviness = 3.0f; + + const float min_target = ctx->target - ctx->target_range; + const float max_target = ctx->target + ctx->target_range; + + // compute probabilities to get the "original" values + llama_sampler_softmax_impl(cur_p, false); + + // store original probabilities (needed for history update) + std::vector original_probs; + original_probs.reserve(cur_p->size); + for (size_t i = 0; i < cur_p->size; ++i) { + original_probs.push_back(cur_p->data[i].p); + } + + // calculate adaptive target + float computed_target = ctx->target; + if (ctx->history.size() > 0) { + float sum_excluding_oldest = 0.0f; + size_t sz = ctx->history.size(); + + // sum all except the oldest element + for (size_t i = 0; i < sz - 1; ++i) { + sum_excluding_oldest += ctx->history.rat(i); + } + + float next_value = (ctx->target * ctx->queue_size) - sum_excluding_oldest; + computed_target = std::max(min_target, std::min(next_value, max_target)); + } + + // find closest token (for degenerate width ~ 0 case) + float min_distance = FLT_MAX; + int closest_token_idx = -1; + + for (size_t i = 0; i < cur_p->size; ++i) { + float distance = std::abs(cur_p->data[i].p - computed_target); + if (distance < min_distance) { + min_distance = distance; + closest_token_idx = (int) i; + } + } + + // apply power law transformation + for (size_t i = 0; i < cur_p->size; ++i) { + float p = cur_p->data[i].p; + + float distance = std::abs(p - computed_target); + float normalized_distance = distance / 0.2f; + cur_p->data[i].logit = peak_logit_value / 
(1.0f + std::pow(normalized_distance, tail_heaviness)); + } + + llama_sampler_softmax_impl(cur_p, false); + + // sample from distribution + const int idx = llama_sample_dist(cur_p, ctx->rng); + + // set sampled token + cur_p->selected = idx; + + // update history with ORIGINAL probability + ctx->history.push_back(original_probs[idx]); +} + +static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->history = ring_buffer(ctx->queue_size); +} + +static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; + auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); + auto * result_ctx = (llama_sampler_power_law *) result->ctx; + + result_ctx->history = ctx->history; + + return result; +} + +static void llama_sampler_power_law_free(struct llama_sampler * smpl) { + delete (llama_sampler_power_law *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_power_law_i = { + /* .name = */ llama_sampler_power_law_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_power_law_apply, + /* .reset = */ llama_sampler_power_law_reset, + /* .clone = */ llama_sampler_power_law_clone, + /* .free = */ llama_sampler_power_law_free, +}; + +struct llama_sampler * llama_sampler_init_power_law( + float target, + float target_range, + int32_t queue_size, + uint32_t seed +) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_power_law_i, + /* .ctx = */ new llama_sampler_power_law { + /* .target = */ target, + /* .target_range = */ target_range, + /* .queue_size = */ queue_size, + /* .seed = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + /* .history = */ ring_buffer(queue_size), + } + ); +} + // logit-bias struct llama_sampler_logit_bias { From 5ab4ff7e445266f63929617c4f77cb518d24e7ae Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 10 Dec 2025 22:30:14 -0600 Subject: [PATCH 02/38] simplify constants --- src/llama-sampling.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6ef8121d7c..173f660c73 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2334,10 +2334,6 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - // these don't need to be modified or exposed to the user - const float peak_logit_value = 3.0f; - const float tail_heaviness = 3.0f; - const float min_target = ctx->target - ctx->target_range; const float max_target = ctx->target + ctx->target_range; @@ -2382,9 +2378,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float distance = std::abs(p - computed_target); - float normalized_distance = distance / 0.2f; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + float normalized_distance = std::abs(p - computed_target) / 0.2f; + cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); } llama_sampler_softmax_impl(cur_p, false); From 88fb0f3f3288724eada8b1212ed6b8bd4552ac33 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 13:47:51 -0600 Subject: 
[PATCH 03/38] add params to `struct common_params_sampling`, add reference to PR --- common/common.h | 58 ++++++++++++++++++++++-------------------- include/llama.h | 10 ++++---- src/llama-sampling.cpp | 5 ++-- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/common/common.h b/common/common.h index 2fd83f0cf9..e6d8af4b73 100644 --- a/common/common.h +++ b/common/common.h @@ -116,6 +116,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_INFILL = 9, COMMON_SAMPLER_TYPE_PENALTIES = 10, COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11, + COMMON_SAMPLER_TYPE_POWER_LAW = 12, }; // dimensionality reduction methods, used by cvector-generator @@ -163,33 +164,36 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f;// -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = 0.5; // target probability (0.0 to 1.0) + float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) + int32_t power_law_queue_size = 10; // rolling history window size for adaptation + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/include/llama.h b/include/llama.h index 7e1e65523b..3adfdb9993 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1291,12 +1291,12 @@ extern "C" { /// @details power law sampler, reshapes probability distribution to target specific probability ranges /// ref: https://github.com/MrJackSpade/llama.cpp - /// ref: [PR] + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (±range from target) - int32_t queue_size, // rolling history window size for adaptation - uint32_t seed); // RNG seed + float target, // target probability (0.0 to 1.0) + float target_range, // adaptive target range (±range from target) + int32_t queue_size, // rolling history window size for adaptation + uint32_t seed); // RNG seed LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 173f660c73..fb488acffe 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2315,7 +2315,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // power-law // ref: https://github.com/MrJackSpade/llama.cpp/tree/master -// ref: [PR] +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 struct llama_sampler_power_law { const float target; @@ -2404,7 +2404,8 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s auto * result = 
llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; - result_ctx->history = ctx->history; + result_ctx->rng = ctx->rng; + result_ctx->history = ctx->history; return result; } From 374bfd43634e2ab2b42957243fa0a8295dd8de99 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 14:22:58 -0600 Subject: [PATCH 04/38] explicitly clamp `min_target` and `max_target` to `[0.0, 1.0]` --- src/llama-sampling.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fb488acffe..eacad79448 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2334,8 +2334,9 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - const float min_target = ctx->target - ctx->target_range; - const float max_target = ctx->target + ctx->target_range; + // clamp the target range to [0.0, 1.0] + const float min_target = std::max(ctx->target - ctx->target_range, 0.0f); + const float max_target = std::min(ctx->target + ctx->target_range, 1.0f); // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); From ffe163911be3201f303c40cf18df431ce14e6e71 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 15:16:11 -0600 Subject: [PATCH 05/38] add args, rename `queue_size` -> `window_size` --- common/arg.cpp | 23 +++++++++++++++++++++++ common/common.h | 2 +- include/llama.h | 4 ++-- src/llama-sampling.cpp | 14 +++++++------- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a31dcbc689..4210633398 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1501,6 +1501,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_sparam()); + add_opt(common_arg( + {"--power-law-target"}, "N", + string_format("Power Law sampler target probability (default: %.2f, 0.0 to 1.0)\n" + "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + (double)params.sampling.power_law_target), + [](common_params & params, const std::string & value) { + params.sampling.power_law_target = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--power-law-target-range"}, "N", + string_format("Power Law sampler adaptive range +/- from target (default: %.2f, 0.0 = no adaptation)", (double)params.sampling.power_law_target_range), + [](common_params & params, const std::string & value) { + params.sampling.power_law_target_range = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--power-law-window-size"}, "N", + string_format("Power Law sampler rolling window size, in tokens (default: %d)", params.sampling.power_law_window_size), + [](common_params & params, int value) { + params.sampling.power_law_window_size = value; + } + ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), diff --git a/common/common.h b/common/common.h index e6d8af4b73..d4f1229a7e 100644 --- a/common/common.h +++ b/common/common.h @@ -186,7 +186,7 @@ struct common_params_sampling { int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) float power_law_target = 0.5; // target 
probability (0.0 to 1.0) float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) - int32_t power_law_queue_size = 10; // rolling history window size for adaptation + int32_t power_law_window_size = 10; // rolling history window size for target adaptation int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy diff --git a/include/llama.h b/include/llama.h index 3adfdb9993..1aace655d0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1294,8 +1294,8 @@ extern "C" { /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 LLAMA_API struct llama_sampler * llama_sampler_init_power_law( float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (±range from target) - int32_t queue_size, // rolling history window size for adaptation + float target_range, // adaptive target range (+/- range from target) + int32_t window_size, // rolling history window size for target adaptation uint32_t seed); // RNG seed LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index eacad79448..e2c229cd9f 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2320,7 +2320,7 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { const float target; const float target_range; - const int32_t queue_size; + const int32_t window_size; const uint32_t seed; std::mt19937 rng; @@ -2359,7 +2359,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok sum_excluding_oldest += ctx->history.rat(i); } - float next_value = (ctx->target * ctx->queue_size) - sum_excluding_oldest; + float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; computed_target = std::max(min_target, std::min(next_value, max_target)); } @@ -2397,12 +2397,12 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->queue_size); + ctx->history = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->queue_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->window_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; result_ctx->rng = ctx->rng; @@ -2427,7 +2427,7 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, float target_range, - int32_t queue_size, + int32_t window_size, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); @@ -2436,10 +2436,10 @@ struct llama_sampler * llama_sampler_init_power_law( /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, /* .target_range = */ target_range, - /* .queue_size = */ queue_size, + /* .window_size = */ window_size, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .history = */ ring_buffer(queue_size), + /* .history = */ ring_buffer(window_size), } ); } From 4959878a748be461f0bf1e7fecfe93694d5eaba4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 
16:27:14 -0600 Subject: [PATCH 06/38] improved comments --- common/arg.cpp | 2 +- include/llama.h | 15 +++++++++++---- src/llama-sampling.cpp | 11 +++++++++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4210633398..eac7454768 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1512,7 +1512,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive range +/- from target (default: %.2f, 0.0 = no adaptation)", (double)params.sampling.power_law_target_range), + string_format("Power Law sampler adaptive target range (target±range) (default: %.2f, 0.0 = fixed target)", (double)params.sampling.power_law_target_range), [](common_params & params, const std::string & value) { params.sampling.power_law_target_range = std::stof(value); } diff --git a/include/llama.h b/include/llama.h index 1aace655d0..95df1058cc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,12 +1289,19 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// @details power law sampler, reshapes probability distribution to target specific probability ranges - /// ref: https://github.com/MrJackSpade/llama.cpp - /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 + /// @details power-law sampler - reshapes probability distribution to target specific probability ranges + /// + /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID + /// rather than just transforming logits. therefore it must always be the last sampler in the + /// sampler chain. + /// + /// it is recommended to only perform minimal truncation before this sampler. + /// + /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) + /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (+/- range from target) + float target_range, // adaptive target range (target±range) int32_t window_size, // rolling history window size for target adaptation uint32_t seed); // RNG seed diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e2c229cd9f..0b591d60a8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2314,8 +2314,15 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa } // power-law -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master -// ref: https://github.com/ggml-org/llama.cpp/pull/17927 +// +// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID +// rather than just transforming logits. therefore it must always be the last sampler in the +// sampler chain. +// +// it is recommended to only perform minimal truncation before this sampler. 
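(To make the ordering constraint above concrete: a minimal sketch of a chain built against the llama.h API as of this patch. The min-p step and all parameter values are illustrative only, and `lctx` is assumed to be an existing `llama_context *`.)

    // power-law selects the token itself, so it must terminate the chain
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // minimal truncation first, e.g. a light min-p pass
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));

    // power-law last: target = 0.5, target_range = 0.5, window_size = 10
    llama_sampler_chain_add(chain, llama_sampler_init_power_law(0.5f, 0.5f, 10, LLAMA_DEFAULT_SEED));

    // note: no llama_sampler_init_dist() after this - power-law already picks the token
    llama_token id = llama_sampler_sample(chain, lctx, -1);
    llama_sampler_free(chain);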
+// +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) +// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) struct llama_sampler_power_law { const float target; From f3457a83e653b85074dff573ee723069f7cf1fed Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:36:00 -0600 Subject: [PATCH 07/38] minor --- common/arg.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index eac7454768..18259c72c2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1503,7 +1503,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("Power Law sampler target probability (default: %.2f, 0.0 to 1.0)\n" + string_format("Power Law sampler target probability (default: %.2f; allowed range 0.0 to 1.0)\n" "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { params.sampling.power_law_target = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive target range (target±range) (default: %.2f, 0.0 = fixed target)", (double)params.sampling.power_law_target_range), + string_format("Power Law sampler adaptive target range (target±range) (default: %.2f; 0.0 = fixed target)", (double)params.sampling.power_law_target_range), [](common_params & params, const std::string & value) { params.sampling.power_law_target_range = std::stof(value); } From 93169593b8f4571df120f6e8dbf8c21185a589ff Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:46:17 -0600 Subject: [PATCH 08/38] remove old unused code from algorithm --- src/llama-sampling.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 0b591d60a8..b61202c636 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2370,24 +2370,12 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok computed_target = std::max(min_target, std::min(next_value, max_target)); } - // find closest token (for degenerate width ~ 0 case) - float min_distance = FLT_MAX; - int closest_token_idx = -1; - - for (size_t i = 0; i < cur_p->size; ++i) { - float distance = std::abs(cur_p->data[i].p - computed_target); - if (distance < min_distance) { - min_distance = distance; - closest_token_idx = (int) i; - } - } - // apply power law transformation for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; float normalized_distance = std::abs(p - computed_target) / 0.2f; cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); } llama_sampler_softmax_impl(cur_p, false); From b3aea5776865d09bda4f35729ee367b70cb47f64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 16:48:52 -0600 Subject: [PATCH 09/38] minor --- src/llama-sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b61202c636..06a1eef148 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2390,8 +2390,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } static void
llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->history = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { From cd7de7c7a8fc30ec45737df428a09e2b80c30289 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:23:27 -0600 Subject: [PATCH 10/38] add power law case to `common_sampler_init`, add sampler name mappings --- common/sampling.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 7a6b7be1e0..07d7153384 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -243,6 +243,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.data())); if (params.mirostat == 0) { + // if this flag is set, we will not need to add `dist` at the end of the sampler chain + bool has_distribution_sampler = false; + for (const auto & cnstr : params.samplers) { switch (cnstr) { case COMMON_SAMPLER_TYPE_DRY: @@ -253,7 +256,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: @@ -283,11 +286,18 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_PENALTIES: llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; + case COMMON_SAMPLER_TYPE_POWER_LAW: + llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_target_range, params.power_law_window_size, params.seed)); + has_distribution_sampler = true; + break; default: GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + // only add `dist` to the end of the chain if no other distribution samplers were added + if (!has_distribution_sampler) { + llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + } } else if (params.mirostat == 1) { llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); @@ -586,6 +596,7 @@ std::vector common_sampler_types_from_names(const std::vect { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, + { "power_law", COMMON_SAMPLER_TYPE_POWER_LAW }, }; // since samplers names are written multiple ways @@ -601,6 +612,7 @@ std::vector common_sampler_types_from_names(const std::vect { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P }, { "min-p", COMMON_SAMPLER_TYPE_MIN_P }, { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { 
"power-law", COMMON_SAMPLER_TYPE_POWER_LAW }, }; std::vector samplers; From 534cb4fbba8782cef4b40f3a789811d801d72db5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:29:04 -0600 Subject: [PATCH 11/38] clarify behaviour when `window_size = 0` --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 18259c72c2..31f67627f6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1519,7 +1519,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-window-size"}, "N", - string_format("Power Law sampler rolling window size, in tokens (default: %d)", params.sampling.power_law_window_size), + string_format("Power Law sampler rolling window size, in tokens (default: %d; 0 = fixed target)", params.sampling.power_law_window_size), [](common_params & params, int value) { params.sampling.power_law_window_size = value; } From dcada035b4d18702cce3135a052c7c5dea71e478 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 17:49:47 -0600 Subject: [PATCH 12/38] add missing enums --- common/sampling.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 07d7153384..90f48c5a05 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -564,6 +564,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; + case COMMON_SAMPLER_TYPE_POWER_LAW: return 'w'; default : return '?'; } } @@ -580,6 +581,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; + case COMMON_SAMPLER_TYPE_POWER_LAW: return "power_law"; default : return ""; } } From 2d62bbea9fcdb3cb40b7a437680f3a5c716bebd6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 22:43:10 -0600 Subject: [PATCH 13/38] remove `target_range` param, make `target == 1` no-op, cleanup code --- common/arg.cpp | 13 ++------- common/common.h | 60 +++++++++++++++++++--------------------- include/llama.h | 11 +++++--- src/llama-sampling.cpp | 63 +++++++++++++++++++++++++----------------- 4 files changed, 77 insertions(+), 70 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 31f67627f6..a8ea0caf33 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1503,23 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("Power Law sampler target probability (default: %.2f; allowed range 0.0 to 1.0)\n" - "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + string_format("target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) " + "(%.1f = default)\n[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { params.sampling.power_law_target = std::stof(value); } ).set_sparam()); - add_opt(common_arg( - {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive target range (target±range) (default: %.2f; 0.0 = fixed target)", (double)params.sampling.power_law_target_range), - [](common_params & params, const std::string & value) { - 
params.sampling.power_law_target_range = std::stof(value); - } - ).set_sparam()); add_opt(common_arg( {"--power-law-window-size"}, "N", - string_format("Power Law sampler rolling window size, in tokens (default: %d; 0 = fixed target)", params.sampling.power_law_window_size), + string_format("rolling window size for target adaptation in Power Law sampling (≤0 = fixed target; %d = default)", params.sampling.power_law_window_size), [](common_params & params, int value) { params.sampling.power_law_window_size = value; } diff --git a/common/common.h b/common/common.h index d4f1229a7e..ba3d776bdc 100644 --- a/common/common.h +++ b/common/common.h @@ -164,42 +164,40 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = 0.5; // target probability (0.0 to 1.0) - float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) - int32_t power_law_window_size = 10; // rolling history window size for target adaptation - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f; // -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = -1.0f; // target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) + int32_t power_law_window_size = 10; // rolling window size for target adaptation in Power Law sampling (≤0 = fixed target) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY - std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, diff --git a/include/llama.h b/include/llama.h index 95df1058cc..ce1308d2bd 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1297,13 +1297,16 @@ extern "C" { /// /// it is recommended to only perform minimal truncation before this sampler. 
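(The adaptive-target feedback described above is easiest to see with numbers. A standalone sketch of the windowed update used at this point in the series; the window contents are invented for illustration.)

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float target      = 0.5f;
        const int   window_size = 4;

        // original probabilities of the last window_size selections, oldest first
        const float window[window_size] = {0.9f, 0.8f, 0.6f, 0.3f};

        // sum all except the oldest element (it is about to be evicted)
        float sum_excluding_oldest = 0.0f;
        for (int i = 1; i < window_size; ++i) {
            sum_excluding_oldest += window[i];
        }

        // choose the next target so the window average returns to `target`:
        // 0.5 * 4 - (0.8 + 0.6 + 0.3) = 2.0 - 1.7 = 0.3
        float next_value      = target * window_size - sum_excluding_oldest;
        float computed_target = std::max(0.0f, std::min(next_value, 1.0f));

        printf("computed_target = %.2f\n", computed_target); // prints 0.30
    }

After a run of high-probability selections, the computed target drops, steering the sampler toward lower-probability tokens until the window average settles back on the configured target.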
/// + /// @param target target probability (valid range 0.0 to 1.0; <0 = disabled) + /// @param window_size rolling window size for target adaptation (≤0 = fixed target) + /// @param seed RNG seed + /// /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (target±range) - int32_t window_size, // rolling history window size for target adaptation - uint32_t seed); // RNG seed + float target, + int32_t window_size, + uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 06a1eef148..d5f485f846 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2326,12 +2326,11 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { const float target; - const float target_range; const int32_t window_size; - const uint32_t seed; + const uint32_t seed; std::mt19937 rng; - ring_buffer history; + ring_buffer window; }; static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { @@ -2341,66 +2340,82 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - // clamp the target range to [0.0, 1.0] - const float min_target = std::max(ctx->target - ctx->target_range, 0.0f); - const float max_target = std::min(ctx->target + ctx->target_range, 1.0f); + if (ctx->target < 0.0f) { + // no-op: just sample from the distribution as-is + llama_sampler_softmax_impl(cur_p, false); + const int idx = llama_sample_dist(cur_p, ctx->rng); + cur_p->selected = idx; + return; + } + + // fixed power law transform parameters (from original implementation) + const float distribution_width = 0.2f; + const float peak_logit_value = 3.0f; + const float tail_heaviness = 3.0f; // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); - // store original probabilities (needed for history update) + // store original probabilities (used for future target adaptation) std::vector original_probs; original_probs.reserve(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { original_probs.push_back(cur_p->data[i].p); } + // // calculate adaptive target + // + + const float min_target = 0.0f; + const float max_target = 1.0f; + float computed_target = ctx->target; - if (ctx->history.size() > 0) { + if (ctx->window.size() > 0) { float sum_excluding_oldest = 0.0f; - size_t sz = ctx->history.size(); + size_t sz = ctx->window.size(); // sum all except the oldest element for (size_t i = 0; i < sz - 1; ++i) { - sum_excluding_oldest += ctx->history.rat(i); + sum_excluding_oldest += ctx->window.rat(i); } float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; computed_target = std::max(min_target, std::min(next_value, max_target)); } - // apply power law transformation + // + // power law transform + // + for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - computed_target) / 0.2f; - cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); + float 
normalized_distance = std::abs(p - computed_target) / distribution_width; + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); } llama_sampler_softmax_impl(cur_p, false); - // sample from distribution + // sample from the transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); - - // set sampled token cur_p->selected = idx; - // update history with ORIGINAL probability - ctx->history.push_back(original_probs[idx]); + // add the ORIGINAL probability to the rolling window + ctx->window.push_back(original_probs[idx]); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->window = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->window_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->window_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; result_ctx->rng = ctx->rng; - result_ctx->history = ctx->history; + result_ctx->window = ctx->window; return result; } @@ -2420,7 +2435,6 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, - float target_range, int32_t window_size, uint32_t seed ) { @@ -2429,11 +2443,10 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .target_range = */ target_range, /* .window_size = */ window_size, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .history = */ ring_buffer(window_size), + /* .window = */ ring_buffer(window_size), } ); } From 5c78b7927fed36512538539d8ff7518c0d23d8cb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 11 Dec 2025 22:47:36 -0600 Subject: [PATCH 14/38] oops, straggler --- common/sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 90f48c5a05..63a17287dc 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -287,7 +287,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_target_range, params.power_law_window_size, params.seed)); + llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_window_size, params.seed)); has_distribution_sampler = true; break; default: From 53380c183f225a63ab788cf00e0a0188da073e47 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 22:39:51 -0600 Subject: [PATCH 15/38] add missing parameters in `server-task.cpp` --- tools/server/server-task.cpp | 52 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 360826062b..c3ac98f13f 100644 --- a/tools/server/server-task.cpp +++ 
b/tools/server/server-task.cpp @@ -182,31 +182,33 @@ task_params server_task::params_from_json_cmpl( params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector()); - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, 
"dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); + params.sampling.power_law_window_size = json_value(data, "power_law_window_size", defaults.sampling.power_law_window_size); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); From 94cb883ed9184ac96a838566b0cbbb7918237b64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 23:19:08 -0600 Subject: [PATCH 16/38] copy from author ref: https://gist.github.com/MrJackSpade/9be99c7efbba7b95a41377e123b7b069 --- src/llama-sampling.cpp | 156 +++++++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 31 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d5f485f846..738fd05caa 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2337,21 +2337,134 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* return "power-law"; } +// Computes the target probability for the current sampling step. +// +// The target determines which token probabilities the power law distribution +// will favor. This function implements a dynamic feedback mechanism to maintain +// an average selection probability close to the base target over time. +// +// When the window is empty: +// - Returns the base target value (ctx->target) +// +// When the window has entries: +// - Calculates what the next target should be to keep the weighted average +// of selected token probabilities equal to ctx->target +// - Uses exponential decay weighting: newer values have more influence +// +// Exponential Decay Weighting: +// After inserting the new value, the weights will be: +// new_value: weight = 1 (age 0, newest) +// rat(0): weight = decay (age 1) +// rat(1): weight = decay^2 (age 2) +// ... 
+// rat(sz-2): weight = decay^(sz-1) +// rat(sz-1): evicted (oldest) +// +// The "effective window size" is approximately 1/(1-decay): +// decay=0.9 → effective window ≈ 10 tokens +// decay=0.95 → effective window ≈ 20 tokens +// decay=1.0 → no decay, equivalent to simple average (original behavior) +// +// Formula derivation: +// We want the weighted average after insertion to equal target: +// +// (new_value * 1 + Σ rat(i) * decay^(i+1)) / total_weight = target +// +// Where total_weight = 1 + decay + decay^2 + ... + decay^(sz-1) +// = (1 - decay^sz) / (1 - decay) [geometric series] +// +// Solving for new_value: +// new_value = target * total_weight - decay * Σ rat(i) * decay^i +// +// The factor of 'decay' on the sum accounts for all existing values +// shifting one position older when the new value is inserted. +// +// The exponential decay helps prevent "fishtailing" - a phenomenon where +// forced high-probability selections (when the model is very confident) +// cause the algorithm to overcorrect with many low-probability selections, +// then swing back the other way. By decaying old values, the influence of +// forced selections fades faster, reducing oscillation amplitude and +// recovery time. +// +// Finally, the computed target is clamped to [min_target, max_target] to +// prevent extreme values that could destabilize sampling. +// +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, + float min_target, + float max_target, + float tail_decay) { + float computed_target = ctx->target; + size_t sz = ctx->window.size(); + + if (sz > 0) { + // Check if window is at capacity (oldest element will be evicted on next push) + // Use the window_size parameter from context, not a capacity() method + const bool window_full = (sz == ctx->window_size); + + // Compute weighted sum with exponential decay + // rat(0) = newest in buffer, gets weight 1 + // rat(i) gets weight decay^i + // + // When window is full: exclude oldest element (it will be evicted) + // When window is not full: include all elements (nothing evicted) + float weighted_sum = 0.0f; + float weight = 1.0f; + size_t elements_to_sum = window_full ? (sz - 1) : sz; + + for (size_t i = 0; i < elements_to_sum; ++i) { + weighted_sum += ctx->window.rat(i) * weight; + weight *= tail_decay; + } + + // Compute total weight after new value is inserted + // When full: sz elements remain (oldest evicted, new added) + // When not full: sz + 1 elements (new added, nothing evicted) + size_t final_element_count = window_full ? 
sz : (sz + 1); + + float total_weight; + if (std::abs(tail_decay - 1.0f) < FLT_EPSILON) { + total_weight = (float) final_element_count; + } else { + total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); + } + + // Shift weights to account for new value taking position 0 + // All existing values age by 1, so multiply their weights by decay + float shifted_weighted_sum = weighted_sum * tail_decay; + + // Solve for the new value that achieves target weighted average + float next_value = (ctx->target * total_weight) - shifted_weighted_sum; + + // Clamp to allowed range + computed_target = std::max(min_target, std::min(next_value, max_target)); + } + + return computed_target; +} + static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; if (ctx->target < 0.0f) { + fprintf(stderr, "Target below zero, sampling from distribution\n"); // no-op: just sample from the distribution as-is llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // fixed power law transform parameters (from original implementation) - const float distribution_width = 0.2f; - const float peak_logit_value = 3.0f; - const float tail_heaviness = 3.0f; + // fixed power law transform parameters + const float distribution_width = 0.3f; + const float peak_logit_value = 5.0f; + const float tail_heaviness = 2.0f; + + // target computation parameters + const float min_target = 0.0f; + const float max_target = 1.0f; + const float tail_decay = 0.50f; // Exponential decay factor for history weighting + // Lower = faster response, higher = more stability + // Effective window ≈ 1/(1-decay) ≈ 20 tokens // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); @@ -2363,45 +2476,26 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok original_probs.push_back(cur_p->data[i].p); } - // // calculate adaptive target - // + float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); - const float min_target = 0.0f; - const float max_target = 1.0f; - - float computed_target = ctx->target; - if (ctx->window.size() > 0) { - float sum_excluding_oldest = 0.0f; - size_t sz = ctx->window.size(); - - // sum all except the oldest element - for (size_t i = 0; i < sz - 1; ++i) { - sum_excluding_oldest += ctx->window.rat(i); - } - - float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; - computed_target = std::max(min_target, std::min(next_value, max_target)); - } - - // // power law transform - // - for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; + float p = cur_p->data[i].p; float normalized_distance = std::abs(p - computed_target) / distribution_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); } llama_sampler_softmax_impl(cur_p, false); // sample from the transformed distribution - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; // add the ORIGINAL probability to the rolling window - ctx->window.push_back(original_probs[idx]); + float original_p = original_probs[idx]; + + 
ctx->window.push_back(original_p); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { From 0a19a3fd6c179d0e2761130a86cf945acc838c83 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 12 Dec 2025 23:32:57 -0600 Subject: [PATCH 17/38] remove old debug log, style nit --- src/llama-sampling.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 738fd05caa..5871668d96 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2389,10 +2389,12 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* // Finally, the computed target is clamped to [min_target, max_target] to // prevent extreme values that could destabilize sampling. // -static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, - float min_target, - float max_target, - float tail_decay) { +static float llama_sampler_power_law_compute_target( + const llama_sampler_power_law * ctx, + float min_target, + float max_target, + float tail_decay) { + float computed_target = ctx->target; size_t sz = ctx->window.size(); @@ -2416,6 +2418,10 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la weight *= tail_decay; } + // Shift weights to account for new value taking position 0 + // All existing values age by 1, so multiply their weights by decay + float shifted_weighted_sum = weighted_sum * tail_decay; + // Compute total weight after new value is inserted // When full: sz elements remain (oldest evicted, new added) // When not full: sz + 1 elements (new added, nothing evicted) @@ -2428,10 +2434,6 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); } - // Shift weights to account for new value taking position 0 - // All existing values age by 1, so multiply their weights by decay - float shifted_weighted_sum = weighted_sum * tail_decay; - // Solve for the new value that achieves target weighted average float next_value = (ctx->target * total_weight) - shifted_weighted_sum; @@ -2446,7 +2448,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok auto * ctx = (llama_sampler_power_law *) smpl->ctx; if (ctx->target < 0.0f) { - fprintf(stderr, "Target below zero, sampling from distribution\n"); // no-op: just sample from the distribution as-is llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); @@ -2462,9 +2463,9 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // target computation parameters const float min_target = 0.0f; const float max_target = 1.0f; - const float tail_decay = 0.50f; // Exponential decay factor for history weighting - // Lower = faster response, higher = more stability - // Effective window ≈ 1/(1-decay) ≈ 20 tokens + const float tail_decay = 0.50f; // exponential decay factor for history weighting + // lower = faster response, higher = more stability + // effective window ≈ 1/(1-decay) ≈ 20 tokens // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); @@ -2479,7 +2480,10 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // calculate adaptive target float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); + // // power law transform + // + for (size_t i = 0; i < 
cur_p->size; ++i) { float p = cur_p->data[i].p; float normalized_distance = std::abs(p - computed_target) / distribution_width; From 824bb3aa6ebc14e5bf9c2bb5d0959841100f10fd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 00:23:15 -0600 Subject: [PATCH 18/38] fix compiler warning, add commented-out logging per token --- src/llama-sampling.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5871668d96..7686f59148 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2401,7 +2401,7 @@ static float llama_sampler_power_law_compute_target( if (sz > 0) { // Check if window is at capacity (oldest element will be evicted on next push) // Use the window_size parameter from context, not a capacity() method - const bool window_full = (sz == ctx->window_size); + const bool window_full = (sz == (size_t)ctx->window_size); // Compute weighted sum with exponential decay // rat(0) = newest in buffer, gets weight 1 @@ -2496,6 +2496,18 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; + // uncomment this to log the target values and history window contents for every token + // + // fprintf(stderr, "power_law: window_size=%zu/%d values=[", + // ctx->window.size(), ctx->window_size); + // for (size_t i = 0; i < ctx->window.size(); ++i) { + // fprintf(stderr, "%.1f", ctx->window.rat(i)); + // if (i < ctx->window.size() - 1) fprintf(stderr, ","); + // } + // fprintf(stderr, "] computed_target=%.4f selected_token=%d orig_prob=%.4f\n", + // computed_target, cur_p->data[idx].id, original_probs[idx]); + // fflush(stderr); + // add the ORIGINAL probability to the rolling window float original_p = original_probs[idx]; From a96ddd743a8badf058a31edf893ce5c660a02eee Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 22:15:03 -0600 Subject: [PATCH 19/38] re-write + change parameters + simplify --- common/common.h | 58 +++++----- include/llama.h | 22 ++-- src/llama-sampling.cpp | 207 +++++++++++------------------------ tools/server/server-task.cpp | 54 ++++----- 4 files changed, 130 insertions(+), 211 deletions(-) diff --git a/common/common.h b/common/common.h index ba3d776bdc..66a6ca96b3 100644 --- a/common/common.h +++ b/common/common.h @@ -164,35 +164,35 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = -1.0f; // target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) - int32_t power_law_window_size = 10; // rolling window size for target adaptation in Power Law sampling (≤0 = fixed target) - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f; // -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) + float power_law_decay = 0.9f; // decay rate for target adaptation over time. 
lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/include/llama.h b/include/llama.h index ce1308d2bd..f3867c6988 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1289,24 +1289,28 @@ extern "C" { const char ** seq_breakers, size_t num_breakers); - /// @details power-law sampler - reshapes probability distribution to target specific probability ranges + /// power-law + /// + /// this sampler implements a power law probability transformation with adaptive + /// target tracking. it reshapes token probability distributions to favor tokens near a + /// configurable target probability, rather than always selecting from the highest probability + /// candidates. it is ideal for creative, unpredictable text generation. /// /// this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID /// rather than just transforming logits. therefore it must always be the last sampler in the /// sampler chain. /// - /// it is recommended to only perform minimal truncation before this sampler. + /// minimal truncation before this sampler is recommended. /// - /// @param target target probability (valid range 0.0 to 1.0; <0 = disabled) - /// @param window_size rolling window size for target adaptation (≤0 = fixed target) - /// @param seed RNG seed + /// @param target select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) + /// @param decay decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) /// - /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) + /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, - int32_t window_size, - uint32_t seed); + float target, + float decay, + uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7686f59148..db126a18d5 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2315,133 +2315,62 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa // power-law // +// this sampler implements a power law probability transformation with adaptive +// target tracking. it reshapes token probability distributions to favor tokens near a +// configurable target probability, rather than always selecting from the highest probability +// candidates. it is ideal for creative, unpredictable text generation. +// // this sampler is like `greedy`, `dist`, and `mirostat` in that it actually selects a token ID // rather than just transforming logits. therefore it must always be the last sampler in the // sampler chain. // -// it is recommended to only perform minimal truncation before this sampler. +// minimal truncation before this sampler is recommended. 
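+//
+// as a sketch of intended usage (illustrative only; `lctx` is assumed to be an
+// existing llama_context, and the truncation step and parameter values are
+// arbitrary examples, not recommendations):
+//
+//     llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+//     llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));                             // minimal truncation
+//     llama_sampler_chain_add(chain, llama_sampler_init_power_law(0.50f, 0.90f, LLAMA_DEFAULT_SEED)); // must be last
+//     llama_token tok = llama_sampler_sample(chain, lctx, -1);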
// -// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) +// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl) // ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) struct llama_sampler_power_law { - const float target; - const int32_t window_size; - const uint32_t seed; - std::mt19937 rng; - ring_buffer window; + // the desired average probability for selected tokens (0.0 to 1.0) + // higher values favor more probable tokens (more deterministic) + // lower values favor less probable tokens (more creative) + // negative values disable Power Law sampling (sample from distribution as-is) + const float target; + + // controls how quickly history influence fades (0.0 to 0.99) + // lower values = faster adaptation, more reactive to recent tokens + // higher values = slower adaptation, more stable over time + // effective history length ≈ 1/(1-decay) tokens + // examples: decay=0.5 → ~2 tokens, decay=0.9 → ~10, decay=0.95 → ~20 + // internally clamped to <= 0.99 to prevent unbounded accumulation + const float decay; + + const uint32_t seed; + std::mt19937 rng; + + // historical token probabilities weighted by recency + float weighted_sum; + // sum of weights, converges to 1/(1-decay) + float total_weight; }; static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { return "power-law"; } -// Computes the target probability for the current sampling step. -// -// The target determines which token probabilities the power law distribution -// will favor. This function implements a dynamic feedback mechanism to maintain -// an average selection probability close to the base target over time. -// -// When the window is empty: -// - Returns the base target value (ctx->target) -// -// When the window has entries: -// - Calculates what the next target should be to keep the weighted average -// of selected token probabilities equal to ctx->target -// - Uses exponential decay weighting: newer values have more influence -// -// Exponential Decay Weighting: -// After inserting the new value, the weights will be: -// new_value: weight = 1 (age 0, newest) -// rat(0): weight = decay (age 1) -// rat(1): weight = decay^2 (age 2) -// ... -// rat(sz-2): weight = decay^(sz-1) -// rat(sz-1): evicted (oldest) -// -// The "effective window size" is approximately 1/(1-decay): -// decay=0.9 → effective window ≈ 10 tokens -// decay=0.95 → effective window ≈ 20 tokens -// decay=1.0 → no decay, equivalent to simple average (original behavior) -// -// Formula derivation: -// We want the weighted average after insertion to equal target: -// -// (new_value * 1 + Σ rat(i) * decay^(i+1)) / total_weight = target -// -// Where total_weight = 1 + decay + decay^2 + ... + decay^(sz-1) -// = (1 - decay^sz) / (1 - decay) [geometric series] -// -// Solving for new_value: -// new_value = target * total_weight - decay * Σ rat(i) * decay^i -// -// The factor of 'decay' on the sum accounts for all existing values -// shifting one position older when the new value is inserted. -// -// The exponential decay helps prevent "fishtailing" - a phenomenon where -// forced high-probability selections (when the model is very confident) -// cause the algorithm to overcorrect with many low-probability selections, -// then swing back the other way. By decaying old values, the influence of -// forced selections fades faster, reducing oscillation amplitude and -// recovery time. 
-// -// Finally, the computed target is clamped to [min_target, max_target] to -// prevent extreme values that could destabilize sampling. -// -static float llama_sampler_power_law_compute_target( - const llama_sampler_power_law * ctx, - float min_target, - float max_target, - float tail_decay) { - - float computed_target = ctx->target; - size_t sz = ctx->window.size(); - - if (sz > 0) { - // Check if window is at capacity (oldest element will be evicted on next push) - // Use the window_size parameter from context, not a capacity() method - const bool window_full = (sz == (size_t)ctx->window_size); - - // Compute weighted sum with exponential decay - // rat(0) = newest in buffer, gets weight 1 - // rat(i) gets weight decay^i - // - // When window is full: exclude oldest element (it will be evicted) - // When window is not full: include all elements (nothing evicted) - float weighted_sum = 0.0f; - float weight = 1.0f; - size_t elements_to_sum = window_full ? (sz - 1) : sz; - - for (size_t i = 0; i < elements_to_sum; ++i) { - weighted_sum += ctx->window.rat(i) * weight; - weight *= tail_decay; - } - - // Shift weights to account for new value taking position 0 - // All existing values age by 1, so multiply their weights by decay - float shifted_weighted_sum = weighted_sum * tail_decay; - - // Compute total weight after new value is inserted - // When full: sz elements remain (oldest evicted, new added) - // When not full: sz + 1 elements (new added, nothing evicted) - size_t final_element_count = window_full ? sz : (sz + 1); - - float total_weight; - if (std::abs(tail_decay - 1.0f) < FLT_EPSILON) { - total_weight = (float) final_element_count; - } else { - total_weight = (1.0f - std::pow(tail_decay, (float) final_element_count)) / (1.0f - tail_decay); - } - - // Solve for the new value that achieves target weighted average - float next_value = (ctx->target * total_weight) - shifted_weighted_sum; - - // Clamp to allowed range - computed_target = std::max(min_target, std::min(next_value, max_target)); +// compute the adaptive target probability for the current sampling step +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { + if (ctx->total_weight == 0.0f) { + // if there is no history, just use base target + return ctx->target; } - return computed_target; + // maintain a running weighted sum with exponential decay + float new_total_weight = 1.0f + decay * ctx->total_weight; + float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; + + // clamp to [0.0, 1.0] + return std::max(0.0f, std::min(next_value, 1.0f)); } static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { @@ -2455,30 +2384,25 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok return; } + // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) + const float decay = std::min(ctx->decay, 0.99f); + // fixed power law transform parameters const float distribution_width = 0.3f; const float peak_logit_value = 5.0f; const float tail_heaviness = 2.0f; - // target computation parameters - const float min_target = 0.0f; - const float max_target = 1.0f; - const float tail_decay = 0.50f; // exponential decay factor for history weighting - // lower = faster response, higher = more stability - // effective window ≈ 1/(1-decay) ≈ 20 tokens - - // compute probabilities to get the "original" values + // get the original probabilities llama_sampler_softmax_impl(cur_p, false); - 
// store original probabilities (used for future target adaptation) + // store the original probabilities (needed for history update after selection) std::vector original_probs; original_probs.reserve(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { original_probs.push_back(cur_p->data[i].p); } - // calculate adaptive target - float computed_target = llama_sampler_power_law_compute_target(ctx, min_target, max_target, tail_decay); + float computed_target = llama_sampler_power_law_compute_target(ctx, decay); // // power law transform @@ -2492,40 +2416,30 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok llama_sampler_softmax_impl(cur_p, false); - // sample from the transformed distribution + // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - // uncomment this to log the target values and history window contents for every token - // - // fprintf(stderr, "power_law: window_size=%zu/%d values=[", - // ctx->window.size(), ctx->window_size); - // for (size_t i = 0; i < ctx->window.size(); ++i) { - // fprintf(stderr, "%.1f", ctx->window.rat(i)); - // if (i < ctx->window.size() - 1) fprintf(stderr, ","); - // } - // fprintf(stderr, "] computed_target=%.4f selected_token=%d orig_prob=%.4f\n", - // computed_target, cur_p->data[idx].id, original_probs[idx]); - // fflush(stderr); - - // add the ORIGINAL probability to the rolling window - float original_p = original_probs[idx]; - - ctx->window.push_back(original_p); + // update running history with the original probability of the selected token + float original_p = original_probs[idx]; + ctx->weighted_sum = original_p + decay * ctx->weighted_sum; + ctx->total_weight = 1.0f + decay * ctx->total_weight; } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->window = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->weighted_sum = 0.0f; + ctx->total_weight = 0.0f; } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->window_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->decay, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; - result_ctx->rng = ctx->rng; - result_ctx->window = ctx->window; + result_ctx->rng = ctx->rng; + result_ctx->weighted_sum = ctx->weighted_sum; + result_ctx->total_weight = ctx->total_weight; return result; } @@ -2545,7 +2459,7 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, - int32_t window_size, + float decay, uint32_t seed ) { auto seed_cur = get_rng_seed(seed); @@ -2553,10 +2467,11 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .window_size = */ window_size, + /* .decay = */ decay, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .window = */ ring_buffer(window_size), + /* .weighted_sum = */ 0.0f, + /* .total_weight = */ 0.0f, } ); } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index c3ac98f13f..6c083e6624 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -182,33 +182,33 @@ task_params 
server_task::params_from_json_cmpl( params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); params.response_fields = json_value(data, "response_fields", std::vector()); - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); - params.sampling.power_law_window_size = json_value(data, "power_law_window_size", defaults.sampling.power_law_window_size); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + 
params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.power_law_target = json_value(data, "power_law_target", defaults.sampling.power_law_target); + params.sampling.power_law_decay = json_value(data, "power_law_decay", defaults.sampling.power_law_decay); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); From b8a9626a739541cc6f65cd07ced19b12c364bf48 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 22:17:08 -0600 Subject: [PATCH 20/38] oops forgot args.cpp --- common/arg.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0226a6e644..919e37b7f8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1559,18 +1559,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) " - "(%.1f = default)\n[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + string_format("power law sampler: select tokens near this probability (valid range 0.0 " + "to 1.0; <0 = disabled) (default: %.2f)\n" + "[(more info)]""(https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { params.sampling.power_law_target = std::stof(value); } ).set_sparam()); add_opt(common_arg( - {"--power-law-window-size"}, "N", - string_format("rolling window size for target adaptation in Power Law sampling (≤0 = fixed target; %d = default)", params.sampling.power_law_window_size), + {"--power-law-decay"}, "N", + string_format("power law sampler: decay rate for target adaptation over time. 
lower " + "values -> faster but less stable adaptation. " + "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", + (double)params.sampling.power_law_decay), [](common_params & params, int value) { - params.sampling.power_law_window_size = value; + params.sampling.power_law_decay = value; } ).set_sparam()); add_opt(common_arg( From 965bcc9dc4675432d37340647a6916adbe79f184 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 22:19:15 -0600 Subject: [PATCH 21/38] fix leftover `window_size` --- common/sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 63a17287dc..8bfdae3be1 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -287,7 +287,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_window_size, params.seed)); + llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); has_distribution_sampler = true; break; default: From d1e5c60442aebfc788e5096eac8d810efea3c1df Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 13 Dec 2025 23:26:03 -0600 Subject: [PATCH 22/38] add missing values to `common_params_sampling::print()` --- common/sampling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 8bfdae3be1..a8494a679d 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -151,11 +151,11 @@ std::string common_params_sampling::print() const { "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, power_law_target = %.3f, power_law_decay = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, - mirostat, mirostat_eta, mirostat_tau); + mirostat, mirostat_eta, mirostat_tau, power_law_target, power_law_decay); return std::string(result); } From 9613c481725a0fb39784db5b292cdc3de446156f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 00:36:59 -0600 Subject: [PATCH 23/38] with logging --- src/llama-sampling.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index db126a18d5..ae3e269ea2 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2362,12 +2362,16 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { if (ctx->total_weight == 0.0f) { // if there is no history, just use base target + fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", ctx->target); + fflush(stderr); 
return ctx->target; } // maintain a running weighted sum with exponential decay float new_total_weight = 1.0f + decay * ctx->total_weight; + fprintf(stderr, "power-law: compute_target: new_total_weight = %.3f\n", new_total_weight); fflush(stderr); float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; + fprintf(stderr, "power-law: compute_target: next_value = %.3f\n", next_value); fflush(stderr); // clamp to [0.0, 1.0] return std::max(0.0f, std::min(next_value, 1.0f)); @@ -2378,14 +2382,16 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is + fprintf(stderr, "power-law: no-op!"); fflush(stderr); llama_sampler_softmax_impl(cur_p, false); - const int idx = llama_sample_dist(cur_p, ctx->rng); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) const float decay = std::min(ctx->decay, 0.99f); + fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); // fixed power law transform parameters const float distribution_width = 0.3f; @@ -2403,15 +2409,20 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok } float computed_target = llama_sampler_power_law_compute_target(ctx, decay); + fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); // // power law transform // for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - computed_target) / distribution_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); + float p = cur_p->data[i].p; + fprintf(stderr, "power-law: transform: p = %.3f\n", p); fflush(stderr); + float normed_distance = std::abs(p - computed_target) / distribution_width; + fprintf(stderr, "power-law: transform: normed_distance = %.3f\n", normed_distance); fflush(stderr); + float new_p = peak_logit_value / (1.0f + std::pow(normed_distance, tail_heaviness)); + fprintf(stderr, "power-law: transform: new_p = %.3f\n", new_p); fflush(stderr); + cur_p->data[i].logit = new_p; } llama_sampler_softmax_impl(cur_p, false); @@ -2419,6 +2430,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; + fprintf(stderr, "power-law: selected token %d\n", idx); fflush(stderr); // update running history with the original probability of the selected token float original_p = original_probs[idx]; From 2a3f579d1ffcd2dffeb60ea21e7a4ceba6d15e22 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 01:55:02 -0600 Subject: [PATCH 24/38] does this fix it? 
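The rule being tried here is easiest to see in isolation: keep exponentially decayed running sums of the selected tokens' original probabilities, then mirror their weighted average around the base target and clamp. A minimal standalone sketch of that rule follows; the pl_* names are illustrative only and not part of the patch.

    #include <algorithm>

    // illustrative stand-in for the sampler's running history
    struct pl_history {
        float weighted_sum = 0.0f; // decayed sum of selected original probabilities
        float total_weight = 0.0f; // decayed count; converges to 1/(1 - decay)
    };

    // mirror the recent weighted average around the base target,
    // clamped to a valid probability
    static float pl_next_target(const pl_history & h, float base_target) {
        if (h.total_weight == 0.0f) {
            return base_target; // no history yet
        }
        const float avg = h.weighted_sum / h.total_weight;
        return std::clamp(2.0f * base_target - avg, 0.0f, 1.0f);
    }

    // fold the selected token's ORIGINAL (pre-transform) probability
    // into the running sums
    static void pl_observe(pl_history & h, float selected_p, float decay) {
        h.weighted_sum = selected_p + decay * h.weighted_sum;
        h.total_weight = 1.0f + decay * h.total_weight;
    }

For example, with base_target = 0.5 and a single observed selection at p = 0.8, the next target is clamp(2*0.5 - 0.8, 0.0, 1.0) = 0.2: the sampler compensates by aiming at less probable tokens until the running average settles back near the base target.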
--- src/llama-sampling.cpp | 49 ++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ae3e269ea2..26135a4f82 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2358,23 +2358,20 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* return "power-law"; } -// compute the adaptive target probability for the current sampling step -static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) { +// compute the adapted target probability for the current sampling step +static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) { + const float base_target = ctx->target; if (ctx->total_weight == 0.0f) { - // if there is no history, just use base target - fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", ctx->target); - fflush(stderr); - return ctx->target; + fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); + return base_target; } + float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight); + fprintf(stderr, "power-law: compute_target: target = %.3f\n", target); - // maintain a running weighted sum with exponential decay - float new_total_weight = 1.0f + decay * ctx->total_weight; - fprintf(stderr, "power-law: compute_target: new_total_weight = %.3f\n", new_total_weight); fflush(stderr); - float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum; - fprintf(stderr, "power-law: compute_target: next_value = %.3f\n", next_value); fflush(stderr); - - // clamp to [0.0, 1.0] - return std::max(0.0f, std::min(next_value, 1.0f)); + // clamp result to [0.0, 1.0] + target = std::max(0.0f, std::min(target, 1.0f)); + fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr); + return target; } static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { @@ -2393,11 +2390,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok const float decay = std::min(ctx->decay, 0.99f); fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); - // fixed power law transform parameters - const float distribution_width = 0.3f; - const float peak_logit_value = 5.0f; - const float tail_heaviness = 2.0f; - // get the original probabilities llama_sampler_softmax_impl(cur_p, false); @@ -2408,21 +2400,22 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok original_probs.push_back(cur_p->data[i].p); } - float computed_target = llama_sampler_power_law_compute_target(ctx, decay); + float computed_target = llama_sampler_power_law_compute_target(ctx); fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); // // power law transform // + // transformation constants + const float distribution_width = 0.3f; + const float peak_logit_value = 5.0f; + + const float inv_width = 1.0f / distribution_width; + for (size_t i = 0; i < cur_p->size; ++i) { - float p = cur_p->data[i].p; - fprintf(stderr, "power-law: transform: p = %.3f\n", p); fflush(stderr); - float normed_distance = std::abs(p - computed_target) / distribution_width; - fprintf(stderr, "power-law: transform: normed_distance = %.3f\n", normed_distance); fflush(stderr); - float new_p = peak_logit_value / (1.0f + std::pow(normed_distance, 
tail_heaviness)); - fprintf(stderr, "power-law: transform: new_p = %.3f\n", new_p); fflush(stderr); - cur_p->data[i].logit = new_p; + float dist = (cur_p->data[i].p - computed_target) * inv_width; + cur_p->data[i].logit = peak_logit_value / (1.0f + dist * dist); } llama_sampler_softmax_impl(cur_p, false); @@ -2430,7 +2423,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - fprintf(stderr, "power-law: selected token %d\n", idx); fflush(stderr); + fprintf(stderr, "power-law: selected token at index %d\n", idx); fflush(stderr); // update running history with the original probability of the selected token float original_p = original_probs[idx]; From ec54fe5f1445e982e68b6a9c05975de1310719e8 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 02:54:14 -0600 Subject: [PATCH 25/38] no, but does this? --- common/arg.cpp | 10 ++++------ src/llama-sampling.cpp | 3 +++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 919e37b7f8..e7bb44f8f5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1569,12 +1569,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-decay"}, "N", - string_format("power law sampler: decay rate for target adaptation over time. lower " - "values -> faster but less stable adaptation. " - "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", - (double)params.sampling.power_law_decay), - [](common_params & params, int value) { - params.sampling.power_law_decay = value; + string_format("decay rate for target adaptation over time. lower values -> faster but less stable adaptation.\n" + "(valid range 0.0 to 1.0; ≤0 = no adaptation) (default: %.2f)", (double)params.sampling.power_law_decay), + [](common_params & params, const std::string & value) { + params.sampling.power_law_decay = std::stof(value); } ).set_sparam()); add_opt(common_arg( diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 26135a4f82..6beb927a6c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2427,8 +2427,11 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // update running history with the original probability of the selected token float original_p = original_probs[idx]; + fprintf(stderr, "power-law: original prob was %.3f\n", original_p); fflush(stderr); ctx->weighted_sum = original_p + decay * ctx->weighted_sum; + fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); fflush(stderr); ctx->total_weight = 1.0f + decay * ctx->total_weight; + fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { From 667b70fdac1054401f6ab278fba99a90bcf5253b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 03:41:28 -0600 Subject: [PATCH 26/38] update default decay --- common/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 66a6ca96b3..7fe62b4111 100644 --- a/common/common.h +++ b/common/common.h @@ -185,7 +185,7 @@ struct common_params_sampling { int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context 
size) float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.9f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + float power_law_decay = 0.50f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy From 693478066981b41f3e3b7a714c9327310a87dfc2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 16:26:15 -0600 Subject: [PATCH 27/38] optimize --- src/llama-sampling.cpp | 53 ++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6beb927a6c..78fe7706b9 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2349,11 +2349,18 @@ struct llama_sampler_power_law { std::mt19937 rng; // historical token probabilities weighted by recency - float weighted_sum; + float weighted_sum; // sum of weights, converges to 1/(1-decay) - float total_weight; + float total_weight; + // used to store original token probabilities (needed for history update after selection) + std::vector original_probs; }; +// transformation constants +static constexpr float DISTRIBUTION_WIDTH = 0.3f; +static constexpr float PEAK_LOGIT_VALUE = 5.0f; +static constexpr float INV_WIDTH = 1.0f / DISTRIBUTION_WIDTH; + static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { return "power-law"; } @@ -2369,7 +2376,7 @@ static float llama_sampler_power_law_compute_target(const llama_sampler_power_la fprintf(stderr, "power-law: compute_target: target = %.3f\n", target); // clamp result to [0.0, 1.0] - target = std::max(0.0f, std::min(target, 1.0f)); + target = std::clamp(target, 0.0f, 1.0f); fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr); return target; } @@ -2379,43 +2386,32 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is - fprintf(stderr, "power-law: no-op!"); fflush(stderr); + fprintf(stderr, "power-law: no-op!"); llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation) - const float decay = std::min(ctx->decay, 0.99f); - fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr); - // get the original probabilities llama_sampler_softmax_impl(cur_p, false); - // store the original probabilities (needed for history update after selection) - std::vector original_probs; - original_probs.reserve(cur_p->size); + // store the original probabilities + ctx->original_probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { - original_probs.push_back(cur_p->data[i].p); + ctx->original_probs[i] = cur_p->data[i].p; } float computed_target = llama_sampler_power_law_compute_target(ctx); - fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr); + fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); // // power law transform // - // transformation constants - const float distribution_width = 0.3f; - 
const float peak_logit_value = 5.0f; - - const float inv_width = 1.0f / distribution_width; - for (size_t i = 0; i < cur_p->size; ++i) { - float dist = (cur_p->data[i].p - computed_target) * inv_width; - cur_p->data[i].logit = peak_logit_value / (1.0f + dist * dist); + float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; + cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); } llama_sampler_softmax_impl(cur_p, false); @@ -2423,14 +2419,14 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // sample from transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; - fprintf(stderr, "power-law: selected token at index %d\n", idx); fflush(stderr); + fprintf(stderr, "power-law: selected token at index %d\n", idx); // update running history with the original probability of the selected token - float original_p = original_probs[idx]; - fprintf(stderr, "power-law: original prob was %.3f\n", original_p); fflush(stderr); - ctx->weighted_sum = original_p + decay * ctx->weighted_sum; - fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); fflush(stderr); - ctx->total_weight = 1.0f + decay * ctx->total_weight; + float original_p = ctx->original_probs[idx]; + fprintf(stderr, "power-law: original prob was %.3f\n", original_p); + ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum; + fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum); + ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr); } @@ -2448,6 +2444,7 @@ static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_s result_ctx->rng = ctx->rng; result_ctx->weighted_sum = ctx->weighted_sum; result_ctx->total_weight = ctx->total_weight; + result_ctx->original_probs.reserve(ctx->original_probs.capacity()); return result; } @@ -2475,7 +2472,7 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .decay = */ decay, + /* .decay = */ std::min(decay, 0.99f), /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ 0.0f, From f5d08724e75d3f41d4737c333349e03b21baa704 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 21:51:59 -0600 Subject: [PATCH 28/38] fix bad merge my git skills are lacking --- common/sampling.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index ee58aa50b3..1e26f44a6c 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -253,8 +253,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co for (const auto & str : params.dry_sequence_breakers) { c_breakers.push_back(str.c_str()); } - - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; @@ -286,7 +284,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co samplers.push_back(llama_sampler_init_penalties 
(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; case COMMON_SAMPLER_TYPE_POWER_LAW: - llama_sampler_chain_add(result->chain, llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); has_distribution_sampler = true; break; default: @@ -295,10 +292,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co } // only add `dist` to the end of the chain if no other distribution samplers were added if (!has_distribution_sampler) { - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + samplers.push_back(llama_sampler_init_dist(params.seed)); } - - samplers.push_back(llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { samplers.push_back(llama_sampler_init_temp(params.temp)); samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); From 493bf301ff861cc1ce52dc86e8204954c98f8f80 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 21:55:45 -0600 Subject: [PATCH 29/38] silence `missing initializer for member` --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 78fe7706b9..e044ef5898 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2477,6 +2477,7 @@ struct llama_sampler * llama_sampler_init_power_law( /* .rng = */ std::mt19937(seed_cur), /* .weighted_sum = */ 0.0f, /* .total_weight = */ 0.0f, + /* .original_probs = */ {}, } ); } From 68543257e944acf75f2483619c54638ee46a3901 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:03:17 -0600 Subject: [PATCH 30/38] update default decay to 0.9 --- common/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.h b/common/common.h index 7231cbc5b8..4cc909beeb 100644 --- a/common/common.h +++ b/common/common.h @@ -185,7 +185,7 @@ struct common_params_sampling { int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) float power_law_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; <0 = disabled) - float power_law_decay = 0.50f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. (valid range 0.0 to 1.0; ≤0 = no adaptation) + float power_law_decay = 0.90f; // decay rate for target adaptation over time. lower values -> faster but less stable adaptation. 
(valid range 0.0 to 1.0; ≤0 = no adaptation) int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float top_n_sigma = -1.00f; // -1.0 = disabled float mirostat_tau = 5.00f; // target entropy From b5ed673ce92fdc9753679742ef28a218b5df1e68 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:08:36 -0600 Subject: [PATCH 31/38] fix logging --- src/llama-sampling.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e044ef5898..1c1febee2d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2386,17 +2386,15 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok if (ctx->target < 0.0f) { // no-op: just sample from the distribution as-is - fprintf(stderr, "power-law: no-op!"); + fprintf(stderr, "power-law: no-op!\n"); fflush(stderr); llama_sampler_softmax_impl(cur_p, false); const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; return; } - // get the original probabilities + // softmax and store the original probabilities llama_sampler_softmax_impl(cur_p, false); - - // store the original probabilities ctx->original_probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { ctx->original_probs[i] = cur_p->data[i].p; @@ -2409,6 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // power law transform // + fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); From 4e28eb2ffe9d052132f9daa4e5b0d73dec27bb0a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:11:34 -0600 Subject: [PATCH 32/38] format (double) --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 1c1febee2d..f255340837 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2407,7 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok // power law transform // - fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", cur_p->size); + fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", (double)cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH; cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist); From 1c58e9a96a3060e907a60cfae41c837b6f46e2ea Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 22:32:27 -0600 Subject: [PATCH 33/38] add power law to the new `samplers` vector --- common/sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/common/sampling.cpp b/common/sampling.cpp index 1e26f44a6c..05e44170e4 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -285,6 +285,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co break; case COMMON_SAMPLER_TYPE_POWER_LAW: has_distribution_sampler = true; + samplers.push_back(llama_sampler_init_power_law (params.power_law_target, params.power_law_decay, params.seed)); break; default: GGML_ASSERT(false && "unknown sampler type"); From 4e04bd1ce21f6ec85897e89058866f18d4214b3a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 14 Dec 2025 23:14:51 -0600 Subject: [PATCH 34/38] log sampler init values --- src/llama-sampling.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff 
From 4e04bd1ce21f6ec85897e89058866f18d4214b3a Mon Sep 17 00:00:00 2001
From: ddh0
Date: Sun, 14 Dec 2025 23:14:51 -0600
Subject: [PATCH 34/38] log sampler init values

---
 src/llama-sampling.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index f255340837..cf235b57d4 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2466,16 +2466,19 @@ struct llama_sampler * llama_sampler_init_power_law(
     float decay,
     uint32_t seed
 ) {
+    const float _decay = std::min(decay, 0.99f);
+    fprintf(stderr, "power-law: init: target %.3f, decay %.3f\n", (double)target, (double)_decay);
+    fflush(stderr);
     auto seed_cur = get_rng_seed(seed);
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_power_law_i,
         /* .ctx   = */ new llama_sampler_power_law {
-            /* .target = */ target,
-            /* .decay = */ std::min(decay, 0.99f),
-            /* .seed = */ seed_cur,
-            /* .rng = */ std::mt19937(seed_cur),
-            /* .weighted_sum = */ 0.0f,
-            /* .total_weight = */ 0.0f,
+            /* .target       = */ target,
+            /* .decay        = */ _decay,
+            /* .seed         = */ seed_cur,
+            /* .rng          = */ std::mt19937(seed_cur),
+            /* .weighted_sum = */ 0.0f,
+            /* .total_weight = */ 0.0f,
             /* .original_probs = */ {},
         }
     );

From 9c50b573f5e316037700d2fd548adc8a81074d6c Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 15 Dec 2025 09:25:05 -0600
Subject: [PATCH 35/38] improve logging messages in llama_sampler_power_law

---
 src/llama-sampling.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index cf235b57d4..dc827fe219 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2369,15 +2369,15 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /*
 static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) {
     const float base_target = ctx->target;
     if (ctx->total_weight == 0.0f) {
-        fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target);
+        fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); fflush(stderr);
         return base_target;
     }
     float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight);
-    fprintf(stderr, "power-law: compute_target: target = %.3f\n", target);
+    fprintf(stderr, "power-law: compute_target: raw target = %.3f\n", target);
 
     // clamp result to [0.0, 1.0]
     target = std::clamp(target, 0.0f, 1.0f);
-    fprintf(stderr, "power-law: compute_target: target (post-clamp) = %.3f\n", target); fflush(stderr);
+    fprintf(stderr, "power-law: compute_target: clamped target = %.3f\n", target); fflush(stderr);
     return target;
 }
 
@@ -2407,7 +2407,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
     // power law transform
     //
 
-    fprintf(stderr, "power-law: transform: cur_p->size = %.3f\n", (double)cur_p->size);
+    fprintf(stderr, "power-law: transform: cur_p->size = %d\n", (size_t)cur_p->size);
     for (size_t i = 0; i < cur_p->size; ++i) {
         float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH;
         cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist);

From 0344068cf112e524eb3fbdbd58c171870b63e56c Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 15 Dec 2025 09:35:44 -0600
Subject: [PATCH 36/38] remove extraneous logging

---
 src/llama-sampling.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index dc827fe219..7b48e5d970 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2401,7 +2401,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
     }
 
     float computed_target = llama_sampler_power_law_compute_target(ctx);
-    fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target);
 
     //
     // power law transform
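At this point `llama_sampler_power_law_compute_target` implements a negative-feedback reflection: the next target is the base target mirrored around the decayed mean of previously selected probabilities, clamp(2 * base - mean, 0, 1), so overshoot on recent tokens is paid back on the following steps. For example, with a base target of 0.50 and a running mean of 0.62, the next step aims at 2 * 0.50 - 0.62 = 0.38; the clamp only keeps the result a valid probability. A hypothetical standalone restatement of the same rule:

    #include <algorithm>

    // hypothetical helper mirroring llama_sampler_power_law_compute_target
    float adapted_target(float base, float weighted_sum, float total_weight) {
        if (total_weight == 0.0f) {
            return base; // no history yet: aim at the base target
        }
        // reflect the decayed mean around the base target
        return std::clamp(2.0f * base - weighted_sum / total_weight, 0.0f, 1.0f);
    }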
     // power law transform

From 1c2d2e900d487d70f704441bafe9ac87afd89d6f Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 15 Dec 2025 21:02:11 -0600
Subject: [PATCH 37/38] simplify target computation

last commit with debug logging!
---
 src/llama-sampling.cpp | 26 +++++------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 7b48e5d970..7684c8f38c 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2365,22 +2365,6 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /*
     return "power-law";
 }
 
-// compute the adapted target probability for the current sampling step
-static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx) {
-    const float base_target = ctx->target;
-    if (ctx->total_weight == 0.0f) {
-        fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", base_target); fflush(stderr);
-        return base_target;
-    }
-    float target = 2.0f * base_target - (ctx->weighted_sum / ctx->total_weight);
-    fprintf(stderr, "power-law: compute_target: raw target = %.3f\n", target);
-
-    // clamp result to [0.0, 1.0]
-    target = std::clamp(target, 0.0f, 1.0f);
-    fprintf(stderr, "power-law: compute_target: clamped target = %.3f\n", target); fflush(stderr);
-    return target;
-}
-
 static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_power_law *) smpl->ctx;
 
@@ -2400,13 +2384,18 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
         ctx->original_probs[i] = cur_p->data[i].p;
     }
 
-    float computed_target = llama_sampler_power_law_compute_target(ctx);
+    // compute the adapted target probability for the current sampling step
+    float computed_target = std::clamp(
+        ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight),
+        0.0f, 1.0f
+    );
+    fprintf(stderr, "power-law: computed target = %.3f\n", computed_target);
 
     //
     // power law transform
     //
 
-    fprintf(stderr, "power-law: transform: cur_p->size = %d\n", (size_t)cur_p->size);
+    fprintf(stderr, "power-law: cur_p->size = %d\n", (int)cur_p->size);
     for (size_t i = 0; i < cur_p->size; ++i) {
         float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH;
         cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist);
@@ -2421,7 +2410,6 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
 
     // update running history with the original probability of the selected token
     float original_p = ctx->original_probs[idx];
-    fprintf(stderr, "power-law: original prob was %.3f\n", original_p);
    ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum;
     fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum);
     ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
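With the helper inlined, the whole adaptation is one clamped expression plus two lines of state update, and it behaves like a small control loop. A toy simulation (the +0.10 bias on the selected probability is an invented stand-in for model behavior) shows the loop splitting a constant bias between target and mean instead of letting the mean drift:

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float base = 0.50f, decay = 0.90f;
        float weighted_sum = 0.0f, total_weight = 0.0f;

        for (int step = 0; step < 20; ++step) {
            const float target = total_weight == 0.0f
                ? base
                : std::clamp(2.0f * base - weighted_sum / total_weight, 0.0f, 1.0f);
            const float selected_p = std::min(target + 0.10f, 1.0f); // invented bias
            weighted_sum = selected_p + decay * weighted_sum;
            total_weight = 1.0f + decay * total_weight;
            printf("step %2d: target %.3f, mean %.3f\n", step, target, weighted_sum / total_weight);
        }
        return 0;
    }

In steady state the mean settles near base + 0.05 and the target near base - 0.05, each compensating for half of the bias.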
From fcb512908630db298337c3ad13361e4493f1fb8b Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 15 Dec 2025 21:42:29 -0600
Subject: [PATCH 38/38] remove debug logging, explicitly clamp params at init

---
 src/llama-sampling.cpp | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 7684c8f38c..77ec141a56 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -2370,10 +2370,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
 
     if (ctx->target < 0.0f) {
         // no-op: just sample from the distribution as-is
-        fprintf(stderr, "power-law: no-op!\n"); fflush(stderr);
         llama_sampler_softmax_impl(cur_p, false);
-        const int idx = llama_sample_dist(cur_p, ctx->rng);
-        cur_p->selected = idx;
+        cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
         return;
     }
 
@@ -2389,13 +2387,8 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
         ctx->total_weight == 0.0f ? ctx->target : 2.0f * ctx->target - (ctx->weighted_sum / ctx->total_weight),
         0.0f, 1.0f
     );
-    fprintf(stderr, "power-law: computed target = %.3f\n", computed_target);
 
-    //
     // power law transform
-    //
-
-    fprintf(stderr, "power-law: cur_p->size = %d\n", (int)cur_p->size);
     for (size_t i = 0; i < cur_p->size; ++i) {
         float dist = (cur_p->data[i].p - computed_target) * INV_WIDTH;
         cur_p->data[i].logit = PEAK_LOGIT_VALUE / (1.0f + dist * dist);
@@ -2406,14 +2399,10 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
     // sample from transformed distribution
     const int idx = llama_sample_dist(cur_p, ctx->rng);
     cur_p->selected = idx;
-    fprintf(stderr, "power-law: selected token at index %d\n", idx);
 
     // update running history with the original probability of the selected token
-    float original_p = ctx->original_probs[idx];
-    ctx->weighted_sum = original_p + ctx->decay * ctx->weighted_sum;
-    fprintf(stderr, "power-law: updated ctx->weighted_sum = %.3f\n", ctx->weighted_sum);
-    ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight;
-    fprintf(stderr, "power-law: updated ctx->total_weight = %.3f\n", ctx->total_weight); fflush(stderr);
+    ctx->weighted_sum = ctx->original_probs[idx] + ctx->decay * ctx->weighted_sum;
+    ctx->total_weight = 1.0f + ctx->decay * ctx->total_weight; // history fades over time
 }
 
 static void llama_sampler_power_law_reset(struct llama_sampler * smpl) {
@@ -2453,15 +2442,12 @@ struct llama_sampler * llama_sampler_init_power_law(
     float decay,
     uint32_t seed
 ) {
-    const float _decay = std::min(decay, 0.99f);
-    fprintf(stderr, "power-law: init: target %.3f, decay %.3f\n", (double)target, (double)_decay);
-    fflush(stderr);
     auto seed_cur = get_rng_seed(seed);
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_power_law_i,
         /* .ctx   = */ new llama_sampler_power_law {
-            /* .target       = */ target,
-            /* .decay        = */ _decay,
+            /* .target       = */ target < 0.0f ? target : std::clamp(target, 0.0f, 1.0f), // keep target < 0 as the documented "disabled" sentinel
+            /* .decay        = */ std::clamp(decay, 0.0f, 0.99f),
             /* .seed         = */ seed_cur,
             /* .rng          = */ std::mt19937(seed_cur),
             /* .weighted_sum = */ 0.0f,