diff --git a/common/arg.cpp b/common/arg.cpp index 31f67627f6..a8ea0caf33 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1503,23 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--power-law-target"}, "N", - string_format("Power Law sampler target probability (default: %.2f; allowed range 0.0 to 1.0)\n" - "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", + string_format("target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) " + "(%.1f = default)\n[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)", (double)params.sampling.power_law_target), [](common_params & params, const std::string & value) { params.sampling.power_law_target = std::stof(value); } ).set_sparam()); - add_opt(common_arg( - {"--power-law-target-range"}, "N", - string_format("Power Law sampler adaptive target range (target±range) (default: %.2f; 0.0 = fixed target)", (double)params.sampling.power_law_target_range), - [](common_params & params, const std::string & value) { - params.sampling.power_law_target_range = std::stof(value); - } - ).set_sparam()); add_opt(common_arg( {"--power-law-window-size"}, "N", - string_format("Power Law sampler rolling window size, in tokens (default: %d; 0 = fixed target)", params.sampling.power_law_window_size), + string_format("rolling window size for target adaptation in Power Law sampling (≤0 = fixed target; %d = default)", params.sampling.power_law_window_size), [](common_params & params, int value) { params.sampling.power_law_window_size = value; } diff --git a/common/common.h b/common/common.h index d4f1229a7e..ba3d776bdc 100644 --- a/common/common.h +++ b/common/common.h @@ -164,42 +164,40 @@ enum common_params_sampling_config : uint64_t { struct common_params_sampling { uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // > 0.5 disables XTC - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: - float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) - int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty - int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) - float power_law_target = 0.5; // target probability (0.0 to 1.0) - float power_law_target_range = 0.5; // adapt the target within this range (target +/- range) - int32_t power_law_window_size = 10; // rolling history window size for target adaptation - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float top_n_sigma = -1.00f; // -1.0 = disabled - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool ignore_eos = false; - bool no_perf = false; // disable performance metrics - bool timing_per_token = false; + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition: + float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length) + int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty + int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size) + float power_law_target = -1.0f; // target probability for Power Law sampling (valid range 0.0 to 1.0; <0 = disabled) + int32_t power_law_window_size = 10; // rolling window size for target adaptation in Power Law sampling (≤0 = fixed target) + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float top_n_sigma = -1.00f; // -1.0 = disabled + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool ignore_eos = false; + bool no_perf = false; // disable performance metrics + bool timing_per_token = false; uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY - std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, diff --git a/include/llama.h b/include/llama.h index 95df1058cc..ce1308d2bd 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1297,13 +1297,16 @@ extern "C" { /// /// it is recommended to only perform minimal truncation before this sampler. /// + /// @param target target probability (valid range 0.0 to 1.0; <0 = disabled) + /// @param window_size rolling window size for target adaptation (≤0 = fixed target) + /// @param seed RNG seed + /// /// ref: https://github.com/MrJackSpade/llama.cpp/tree/master (original impl, documentation) /// ref: https://github.com/ggml-org/llama.cpp/pull/17927 (llama.cpp PR) LLAMA_API struct llama_sampler * llama_sampler_init_power_law( - float target, // target probability (0.0 to 1.0) - float target_range, // adaptive target range (target±range) - int32_t window_size, // rolling history window size for target adaptation - uint32_t seed); // RNG seed + float target, + int32_t window_size, + uint32_t seed); LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 06a1eef148..d5f485f846 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -2326,12 +2326,11 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa struct llama_sampler_power_law { const float target; - const float target_range; const int32_t window_size; - const uint32_t seed; + const uint32_t seed; std::mt19937 rng; - ring_buffer history; + ring_buffer window; }; static const char * llama_sampler_power_law_name(const struct llama_sampler * /*smpl*/) { @@ -2341,66 +2340,82 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /* static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_power_law *) smpl->ctx; - // clamp the target range to [0.0, 1.0] - const float min_target = std::max(ctx->target - ctx->target_range, 0.0f); - const float max_target = std::min(ctx->target + ctx->target_range, 1.0f); + if (ctx->target < 0.0f) { + // no-op: just sample from the distribution as-is + llama_sampler_softmax_impl(cur_p, false); + const int idx = llama_sample_dist(cur_p, ctx->rng); + cur_p->selected = idx; + return; + } + + // fixed power law transform parameters (from original implementation) + const float distribution_width = 0.2f; + const float peak_logit_value = 3.0f; + const float tail_heaviness = 3.0f; // compute probabilities to get the "original" values llama_sampler_softmax_impl(cur_p, false); - // store original probabilities (needed for history update) + // store original probabilities (used for future target adaptation) std::vector original_probs; original_probs.reserve(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { original_probs.push_back(cur_p->data[i].p); } + // // calculate adaptive target + // + + const float min_target = 0.0f; + const float max_target = 1.0f; + float computed_target = ctx->target; - if (ctx->history.size() > 0) { + if (ctx->window.size() > 0) { float sum_excluding_oldest = 0.0f; - size_t sz = ctx->history.size(); + size_t sz = ctx->window.size(); // sum all except the oldest element for (size_t i = 0; i < sz - 1; ++i) { - sum_excluding_oldest += ctx->history.rat(i); + sum_excluding_oldest += ctx->window.rat(i); } float next_value = (ctx->target * ctx->window_size) - sum_excluding_oldest; computed_target = std::max(min_target, std::min(next_value, max_target)); } - // apply power law transformation + // + // power law transform + // + for (size_t i = 0; i < cur_p->size; ++i) { float p = cur_p->data[i].p; - float normalized_distance = std::abs(p - computed_target) / 0.2f; - cur_p->data[i].logit = 3.0f / (1.0f + std::pow(normalized_distance, 3.0f)); + float normalized_distance = std::abs(p - computed_target) / distribution_width; + cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness)); } llama_sampler_softmax_impl(cur_p, false); - // sample from distribution + // sample from the transformed distribution const int idx = llama_sample_dist(cur_p, ctx->rng); - - // set sampled token cur_p->selected = idx; - // update history with ORIGINAL probability - ctx->history.push_back(original_probs[idx]); + // add the ORIGINAL probability to the rolling window + ctx->window.push_back(original_probs[idx]); } static void llama_sampler_power_law_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_power_law *) smpl->ctx; - ctx->history = ring_buffer(ctx->window_size); + auto * ctx = (llama_sampler_power_law *) smpl->ctx; + ctx->window = ring_buffer(ctx->window_size); } static struct llama_sampler * llama_sampler_power_law_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_power_law *) smpl->ctx; - auto * result = llama_sampler_init_power_law(ctx->target, ctx->target_range, ctx->window_size, ctx->seed); + auto * result = llama_sampler_init_power_law(ctx->target, ctx->window_size, ctx->seed); auto * result_ctx = (llama_sampler_power_law *) result->ctx; result_ctx->rng = ctx->rng; - result_ctx->history = ctx->history; + result_ctx->window = ctx->window; return result; } @@ -2420,7 +2435,6 @@ static struct llama_sampler_i llama_sampler_power_law_i = { struct llama_sampler * llama_sampler_init_power_law( float target, - float target_range, int32_t window_size, uint32_t seed ) { @@ -2429,11 +2443,10 @@ struct llama_sampler * llama_sampler_init_power_law( /* .iface = */ &llama_sampler_power_law_i, /* .ctx = */ new llama_sampler_power_law { /* .target = */ target, - /* .target_range = */ target_range, /* .window_size = */ window_size, /* .seed = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - /* .history = */ ring_buffer(window_size), + /* .window = */ ring_buffer(window_size), } ); }