with logging

parent d1e5c60442
commit 9613c48172
@@ -2362,12 +2362,16 @@ static const char * llama_sampler_power_law_name(const struct llama_sampler * /*
 static float llama_sampler_power_law_compute_target(const llama_sampler_power_law * ctx, float decay) {
     if (ctx->total_weight == 0.0f) {
         // if there is no history, just use base target
+        fprintf(stderr, "power-law: compute_target: total_weight == 0.0 (target fixed at %.3f)\n", ctx->target);
+        fflush(stderr);
         return ctx->target;
     }
 
     // maintain a running weighted sum with exponential decay
     float new_total_weight = 1.0f + decay * ctx->total_weight;
+    fprintf(stderr, "power-law: compute_target: new_total_weight = %.3f\n", new_total_weight); fflush(stderr);
     float next_value = ctx->target * new_total_weight - decay * ctx->weighted_sum;
+    fprintf(stderr, "power-law: compute_target: next_value = %.3f\n", next_value); fflush(stderr);
 
     // clamp to [0.0, 1.0]
     return std::max(0.0f, std::min(next_value, 1.0f));
@@ -2378,6 +2382,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
 
     if (ctx->target < 0.0f) {
         // no-op: just sample from the distribution as-is
+        fprintf(stderr, "power-law: no-op!"); fflush(stderr);
         llama_sampler_softmax_impl(cur_p, false);
         const int idx = llama_sample_dist(cur_p, ctx->rng);
         cur_p->selected = idx;
@@ -2386,6 +2391,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
 
     // clamp decay to avoid degenerate case at 1.0 (unbounded accumulation)
     const float decay = std::min(ctx->decay, 0.99f);
+    fprintf(stderr, "power-law: decay = %.3f\n", decay); fflush(stderr);
 
     // fixed power law transform parameters
     const float distribution_width = 0.3f;
@@ -2403,6 +2409,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
     }
 
     float computed_target = llama_sampler_power_law_compute_target(ctx, decay);
+    fprintf(stderr, "power-law: computed_target = %.3f\n", computed_target); fflush(stderr);
 
     //
     // power law transform
@@ -2410,8 +2417,12 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         float p = cur_p->data[i].p;
-        float normalized_distance = std::abs(p - computed_target) / distribution_width;
-        cur_p->data[i].logit = peak_logit_value / (1.0f + std::pow(normalized_distance, tail_heaviness));
+        fprintf(stderr, "power-law: transform: p = %.3f\n", p); fflush(stderr);
+        float normed_distance = std::abs(p - computed_target) / distribution_width;
+        fprintf(stderr, "power-law: transform: normed_distance = %.3f\n", normed_distance); fflush(stderr);
+        float new_p = peak_logit_value / (1.0f + std::pow(normed_distance, tail_heaviness));
+        fprintf(stderr, "power-law: transform: new_p = %.3f\n", new_p); fflush(stderr);
+        cur_p->data[i].logit = new_p;
     }
 
     llama_sampler_softmax_impl(cur_p, false);
@@ -2419,6 +2430,7 @@ static void llama_sampler_power_law_apply(struct llama_sampler * smpl, llama_tok
     // sample from transformed distribution
     const int idx = llama_sample_dist(cur_p, ctx->rng);
     cur_p->selected = idx;
+    fprintf(stderr, "power-law: selected token %d\n", idx); fflush(stderr);
 
     // update running history with the original probability of the selected token
     float original_p = original_probs[idx];