From e856c8f95933a1896c0ebdb5cc24057dd04e5676 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 17:43:49 +0000 Subject: [PATCH 01/13] llama : add blue noise rng implementation --- src/llama-sampling.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 515d6c163b..5dd094ce7a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -333,6 +333,81 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation +struct blue_noise_rng { + uint8_t bit_depth = 0; + uint32_t seed = 0; + uint32_t position = 0; + + // binary tree of 1-bit 50% duty cycle blue noise generators + std::vector> states; // {err0, err1} per tree node + + blue_noise_rng() = default; + + blue_noise_rng(uint8_t bit_depth, uint32_t seed) { + init(bit_depth, seed); + } + + static uint32_t hash(uint32_t x) { // lowbias32 + x ^= x >> 16; x *= 0x21f0aaad; + x ^= x >> 15; x *= 0x735a2d97; + x ^= x >> 15; + return x; + } + + void init(uint8_t depth, uint32_t s) { + bit_depth = std::clamp(depth, 1, 16); + seed = hash(s); + + const int n = (1 << bit_depth) - 1; + states.resize(n); + + reset(); + } + + void reset() { + const int n = (int)states.size(); + position = 0; + + // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10) + static const int8_t tbl[10][2] = { + { 0, 0}, { 0, 0}, { 0, 0}, + {-1, 0}, {-1, 0}, {-1, 0}, + { 0, -1}, { 0, -1}, + {-2, 0}, + {-1, -1}, + }; + for (int i = 0; i < n; i++) { + uint32_t h = hash((uint32_t)i ^ seed) % 10; + states[i] = {tbl[h][0], tbl[h][1]}; // random initial state + } + } + + uint16_t next() { + uint32_t h = hash(position ^ seed); + position++; + + // traverse binary tree root-to-leaf, one error diffusion ditherer per bit + uint32_t acc = 0; + for (int level = 0; level < bit_depth; level++) { + auto & s = 
states[(1 << level) - 1 + acc]; // heap-style index + + int out = (s[0] >= 0) ? 1 : 0; + int8_t qe = s[0] + (int8_t)(out ? -1 : 1); // inverse autocorrelation + + s[0] = s[1]; // step forward + s[1] = 0; + + // error diffusion dithering using binary weight perturbation + s[(h >> level) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2 + + acc = acc * 2 + out; + } + + return (uint16_t)acc; + } +}; + static uint32_t get_rng_seed(uint32_t seed) { if (seed == LLAMA_DEFAULT_SEED) { // use system clock if std::random_device is not a true RNG From b2ee2fbc0a8e9fef59a8ab27d93cb99b4a23b782 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 18:18:56 +0000 Subject: [PATCH 02/13] llama : add floating point blue noise rng --- src/llama-sampling.cpp | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5dd094ce7a..7c83095582 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -333,13 +333,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } -// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation +// pseudo-random number generator with ~6db/octave blue noise struct blue_noise_rng { uint8_t bit_depth = 0; uint32_t seed = 0; uint32_t position = 0; - // binary tree of 1-bit 50% duty cycle blue noise generators + // binary tree of 1-bit 50% duty cycle error diffusion dithering blue noise generators std::vector> states; // {err0, err1} per tree node blue_noise_rng() = default; @@ -383,11 +383,12 @@ struct blue_noise_rng { } } - uint16_t next() { + uint16_t next(uint32_t * hash_remainder = nullptr) { uint32_t h = hash(position ^ seed); position++; - // traverse binary tree root-to-leaf, one error diffusion ditherer per bit + // traverse binary tree, one error diffusion ditherer per population split + // thresholding output at any value still produces blue noise uint32_t acc 
= 0; for (int level = 0; level < bit_depth; level++) { auto & s = states[(1 << level) - 1 + acc]; // heap-style index @@ -404,8 +405,31 @@ struct blue_noise_rng { acc = acc * 2 + out; } + if (hash_remainder) { + *hash_remainder = h >> bit_depth; // unused bits from random hash + } + return (uint16_t)acc; } + + // blue noise in the upper bit_depth bits, white noise hash remainder in the lower bits + // do not use with modulo operator, as it would just produce white noise + uint32_t next32() { + uint32_t rem; + uint32_t val = next(&rem); + return (val << (32 - bit_depth)) | rem; + } + + // uniform double in [0, 1) with blue noise temporal autocorrelation + double nextf() { + double res = 0.0; + res += hash(position ^ ~seed); // fill low bits with white noise + res *= 1.0 / 4294967296.0; + res += next32(); + res *= 1.0 / 4294967296.0; + if (res >= 1.0) res = std::nextafter(1.0, 0.0); + return res; + } }; static uint32_t get_rng_seed(uint32_t seed) { From f271576d81ca920d5d35a76f44a663da47608adb Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 19:12:50 +0000 Subject: [PATCH 03/13] llama : initial blue noise test implementation --- common/arg.cpp | 7 ++ common/common.h | 1 + common/sampling.cpp | 6 +- include/llama.h | 3 +- src/llama-sampling.cpp | 197 ++++++++++++++++++++++++++++++++++- tools/server/server-task.cpp | 3 + 6 files changed, 214 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..924b5198a2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1577,6 +1577,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.ignore_eos = true; } ).set_sparam()); + add_opt(common_arg( + {"--blue-noise"}, + "use blue noise RNG for sampling instead of white noise", + [](common_params & params) { + params.sampling.blue_noise = true; + } + ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.2f)", (double)params.sampling.temp), diff 
--git a/common/common.h b/common/common.h index 398ebb0960..0a76a1e26c 100644 --- a/common/common.h +++ b/common/common.h @@ -209,6 +209,7 @@ struct common_params_sampling { bool ignore_eos = false; bool no_perf = false; // disable performance metrics bool timing_per_token = false; + bool blue_noise = false; // use blue noise RNG instead of white noise for dist sampler uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/common/sampling.cpp b/common/sampling.cpp index 11a1d48398..2811eb3a48 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -313,7 +313,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed)); } else { // default: sample from distribution - samplers.push_back(llama_sampler_init_dist(params.seed)); + if (params.blue_noise) { + samplers.push_back(llama_sampler_init_dist_blue_noise(params.seed)); + } else { + samplers.push_back(llama_sampler_init_dist(params.seed)); + } } } else if (params.mirostat == 1) { samplers.push_back(llama_sampler_init_temp(params.temp)); diff --git a/include/llama.h b/include/llama.h index bf4e28a8be..22f08e1683 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1295,7 +1295,8 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); /// seed == LLAMA_DEFAULT_SEED to use a random seed. 
- LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// Setting k <= 0 makes this a noop diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7c83095582..09fd3a4700 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -369,7 +369,7 @@ struct blue_noise_rng { const int n = (int)states.size(); position = 0; - // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10) + // 5 reachable states with distribution 3:3:2:1:1 static const int8_t tbl[10][2] = { { 0, 0}, { 0, 0}, { 0, 0}, {-1, 0}, {-1, 0}, {-1, 0}, @@ -1340,6 +1340,197 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { ); } +// dist (blue noise) + +struct llama_sampler_dist_blue_noise : public llama_sampler_backend { + const uint32_t seed; + uint32_t seed_cur; + + blue_noise_rng bn_rng; + + ggml_tensor * inp_uniform; +}; + +static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + // edge cases + if (cur_p->size == 0) { + cur_p->selected = -1; + return; + } + + cur_p->selected = 0; + + if (cur_p->size == 1) { + cur_p->data[0].p = 1.0f; + return; + } + + // max logit for numerical stability + float max_l = cur_p->data[0].logit; + if (!cur_p->sorted) { + for (size_t i = 1; i < cur_p->size; ++i) { + max_l = std::max(max_l, cur_p->data[i].logit); + } + } + + // apply softmax to obtain the probabilities + double sum_cum = 0.0f; + for (size_t i = 
0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + sum_cum += p; + } + + // sample using blue noise RNG + const double rnd = ctx->bn_rng.nextf(); + + double sum_run = 0.0f; + const double sum_tgt = sum_cum*rnd; + + bool found = false; + for (size_t i = 0; i < cur_p->size; ++i) { + if (!found) { + sum_run += cur_p->data[i].p; + if (sum_run >= sum_tgt) { + cur_p->selected = i; + found = true; + } + } + + // normalize probs + cur_p->data[i].p /= sum_cum; + } + + assert(found); + if (!found) { + cur_p->selected = cur_p->size - 1; + } +} + +static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->bn_rng.init(16, ctx->seed_cur); +} + +static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx; + auto * result = llama_sampler_init_dist_blue_noise(ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx; + + result_ctx->seed_cur = ctx->seed_cur; + result_ctx->bn_rng = ctx->bn_rng; + } + + return result; +} + +static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) { + delete (llama_sampler_dist_blue_noise *) smpl->ctx; +} + +static bool llama_sampler_dist_blue_noise_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_dist_blue_noise_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(gf); + + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + sctx->inp_uniform = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name (sctx->inp_uniform, "uniform"); + ggml_set_input(sctx->inp_uniform); + + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); + ggml_set_name(probs, "dist_probs"); + + struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); + ggml_set_name(cumsum, "dist_cumsum"); + + struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); + ggml_set_name(diff, "dist_cumsum"); + + struct ggml_tensor * mask = ggml_step(ctx, diff); + ggml_set_name(mask, "dist_mask"); + + struct ggml_tensor * idxf = ggml_sum(ctx, mask); + ggml_set_name(idxf, "dist_index_f32"); + + struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); + ggml_set_name(idx, "dist_index_i32"); + + struct ggml_tensor * sampled_token = idx; + if (data->candidates != nullptr) { + struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); + + sampled_token = ggml_get_rows(ctx, candidates, idx); + ggml_set_name(sampled_token, "dist_sampled_token"); + } + + data->sampled = sampled_token; + data->probs = probs; +} + +static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + GGML_ASSERT(sctx->inp_uniform != nullptr); + + const float rnd = (float)sctx->bn_rng.nextf(); + + ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); +} + +static struct llama_sampler_i llama_sampler_dist_blue_noise_i = { + /* .name = */ llama_sampler_dist_blue_noise_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_dist_blue_noise_apply, + /* .reset = */ llama_sampler_dist_blue_noise_reset, + /* .clone = */ llama_sampler_dist_blue_noise_clone, + /* .free = */ llama_sampler_dist_blue_noise_free, + /* .backend_init = */ llama_sampler_dist_blue_noise_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ 
llama_sampler_dist_blue_noise_backend_apply, + /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input, +}; + +struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_dist_blue_noise_i, + /* .ctx = */ new llama_sampler_dist_blue_noise { + ("dist-blue-noise"), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .bn_rng = */ blue_noise_rng(16, seed_cur), + /* .inp_uniform = */ nullptr, + } + ); +} + // top-k struct llama_sampler_top_k : public llama_sampler_backend { @@ -3928,6 +4119,10 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; } + if (smpl->iface == &llama_sampler_dist_blue_noise_i) { + return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur; + } + if (smpl->iface == &llama_sampler_mirostat_i) { return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..16c3cf12d0 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -66,6 +66,7 @@ json task_params::to_json(bool only_metrics) const { {"n_keep", n_keep}, {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, + {"blue_noise", sampling.blue_noise}, {"stream", stream}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, @@ -125,6 +126,7 @@ json task_params::to_json(bool only_metrics) const { {"n_keep", n_keep}, {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, + {"blue_noise", sampling.blue_noise}, {"stream", stream}, {"logit_bias", format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, @@ -467,6 +469,7 @@ task_params server_task::params_from_json_cmpl( } } + params.sampling.blue_noise = json_value(data, "blue_noise", params_base.sampling.blue_noise); params.sampling.ignore_eos = json_value(data, "ignore_eos", 
params_base.sampling.ignore_eos); if (params.sampling.ignore_eos) { params.sampling.logit_bias.insert( From 3b4061981b92a091ed65b853add69adcc1a5a091 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 23:05:54 +0000 Subject: [PATCH 04/13] llama : make the sampler rng modular --- src/llama-sampling.cpp | 285 ++++++++++++----------------------------- 1 file changed, 81 insertions(+), 204 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 09fd3a4700..c41666aaa7 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -432,6 +432,56 @@ struct blue_noise_rng { } }; +// abstract RNG interface for the dist sampler +struct llama_dist_rng { + virtual ~llama_dist_rng() = default; + + virtual double nextf() = 0; // uniform double in [0, 1) + virtual void reseed(uint32_t s) = 0; + virtual std::unique_ptr clone() const = 0; +}; + +struct llama_dist_rng_white : llama_dist_rng { + std::mt19937 rng; + + llama_dist_rng_white(uint32_t seed) : rng(seed) {} + + double nextf() override { + std::uniform_real_distribution dist(0.0, 1.0); + return dist(rng); + } + + void reseed(uint32_t s) override { + rng.seed(s); + } + + std::unique_ptr clone() const override { + auto c = std::make_unique(0); + c->rng = rng; + return c; + } +}; + +struct llama_dist_rng_blue : llama_dist_rng { + blue_noise_rng bn_rng; + + llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + + double nextf() override { + return bn_rng.nextf(); + } + + void reseed(uint32_t s) override { + bn_rng.init(16, s); + } + + std::unique_ptr clone() const override { + auto c = std::make_unique(0); + c->bn_rng = bn_rng; + return c; + } +}; + static uint32_t get_rng_seed(uint32_t seed) { if (seed == LLAMA_DEFAULT_SEED) { // use system clock if std::random_device is not a true RNG @@ -1122,7 +1172,7 @@ struct llama_sampler_dist : public llama_sampler_backend { const uint32_t seed; uint32_t seed_cur; - std::mt19937 rng; + std::unique_ptr rng; ggml_tensor * inp_uniform; }; @@ 
-1168,8 +1218,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da // sample from the obtained probabilities and normalize the probs in a single pass // this is ~3x faster on Mac with full gpt-oss vocab than the version below // - std::uniform_real_distribution dist(0.0f, 1.0f); - const double rnd = dist(ctx->rng); + const double rnd = ctx->rng->nextf(); double sum_run = 0.0f; const double sum_tgt = sum_cum*rnd; @@ -1200,28 +1249,37 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da cur_p->data[i].p /= sum_cum; } - cur_p->selected = llama_sample_dist(cur_p, ctx->rng); + const double rnd = ctx->rng->nextf(); + double cum = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + cum += cur_p->data[i].p; + if (cum >= rnd) { + cur_p->selected = i; + break; + } + } #endif } static void llama_sampler_dist_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_dist *) smpl->ctx; ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); + ctx->rng->reseed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist *) smpl->ctx; - auto * result = llama_sampler_init_dist(ctx->seed); + auto * ctx = (llama_sampler_dist *) smpl->ctx; - // copy the state - { - auto * result_ctx = (llama_sampler_dist *) result->ctx; - - result_ctx->rng = ctx->rng; - } - - return result; + return llama_sampler_init( + /* .iface = */ smpl->iface, + /* .ctx = */ new llama_sampler_dist { + {ctx->get_name()}, + /* .seed = */ ctx->seed, + /* .seed_cur = */ ctx->seed_cur, + /* .rng = */ ctx->rng->clone(), + /* .inp_uniform = */ nullptr, + } + ); } static void llama_sampler_dist_free(struct llama_sampler * smpl) { @@ -1307,8 +1365,8 @@ static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) { // std::uniform_real_distribution and // std::uniform_real_distribution with same rng will produce // different 
sequences). - std::uniform_real_distribution dist(0.0f, 1.0f); - const float rnd = dist(sctx->rng); + // nextf returns double, equivalent to std::uniform_real_distribution + const float rnd = (float)sctx->rng->nextf(); ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); } @@ -1331,201 +1389,24 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { return llama_sampler_init( /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { - ("dist"), + {"dist"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .rng = */ std::mt19937(seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); } -// dist (blue noise) - -struct llama_sampler_dist_blue_noise : public llama_sampler_backend { - const uint32_t seed; - uint32_t seed_cur; - - blue_noise_rng bn_rng; - - ggml_tensor * inp_uniform; -}; - -static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - // edge cases - if (cur_p->size == 0) { - cur_p->selected = -1; - return; - } - - cur_p->selected = 0; - - if (cur_p->size == 1) { - cur_p->data[0].p = 1.0f; - return; - } - - // max logit for numerical stability - float max_l = cur_p->data[0].logit; - if (!cur_p->sorted) { - for (size_t i = 1; i < cur_p->size; ++i) { - max_l = std::max(max_l, cur_p->data[i].logit); - } - } - - // apply softmax to obtain the probabilities - double sum_cum = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - float p = expf(cur_p->data[i].logit - max_l); - cur_p->data[i].p = p; - sum_cum += p; - } - - // sample using blue noise RNG - const double rnd = ctx->bn_rng.nextf(); - - double sum_run = 0.0f; - const double sum_tgt = sum_cum*rnd; - - bool found = false; - for 
(size_t i = 0; i < cur_p->size; ++i) { - if (!found) { - sum_run += cur_p->data[i].p; - if (sum_run >= sum_tgt) { - cur_p->selected = i; - found = true; - } - } - - // normalize probs - cur_p->data[i].p /= sum_cum; - } - - assert(found); - if (!found) { - cur_p->selected = cur_p->size - 1; - } -} - -static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->bn_rng.init(16, ctx->seed_cur); -} - -static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx; - auto * result = llama_sampler_init_dist_blue_noise(ctx->seed); - - // copy the state - { - auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx; - - result_ctx->seed_cur = ctx->seed_cur; - result_ctx->bn_rng = ctx->bn_rng; - } - - return result; -} - -static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) { - delete (llama_sampler_dist_blue_noise *) smpl->ctx; -} - -static bool llama_sampler_dist_blue_noise_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_dist_blue_noise_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(gf); - - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_set_name (sctx->inp_uniform, "uniform"); - ggml_set_input(sctx->inp_uniform); - - struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); - ggml_set_name(probs, "dist_probs"); - - struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); - 
ggml_set_name(cumsum, "dist_cumsum"); - - struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); - ggml_set_name(diff, "dist_cumsum"); - - struct ggml_tensor * mask = ggml_step(ctx, diff); - ggml_set_name(mask, "dist_mask"); - - struct ggml_tensor * idxf = ggml_sum(ctx, mask); - ggml_set_name(idxf, "dist_index_f32"); - - struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); - ggml_set_name(idx, "dist_index_i32"); - - struct ggml_tensor * sampled_token = idx; - if (data->candidates != nullptr) { - struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); - - sampled_token = ggml_get_rows(ctx, candidates, idx); - ggml_set_name(sampled_token, "dist_sampled_token"); - } - - data->sampled = sampled_token; - data->probs = probs; -} - -static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - GGML_ASSERT(sctx->inp_uniform != nullptr); - - const float rnd = (float)sctx->bn_rng.nextf(); - - ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); -} - -static struct llama_sampler_i llama_sampler_dist_blue_noise_i = { - /* .name = */ llama_sampler_dist_blue_noise_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_dist_blue_noise_apply, - /* .reset = */ llama_sampler_dist_blue_noise_reset, - /* .clone = */ llama_sampler_dist_blue_noise_clone, - /* .free = */ llama_sampler_dist_blue_noise_free, - /* .backend_init = */ llama_sampler_dist_blue_noise_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_dist_blue_noise_backend_apply, - /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input, -}; - struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) { auto seed_cur = get_rng_seed(seed); return llama_sampler_init( - /* .iface = */ &llama_sampler_dist_blue_noise_i, - /* 
.ctx = */ new llama_sampler_dist_blue_noise { - ("dist-blue-noise"), + /* .iface = */ &llama_sampler_dist_i, + /* .ctx = */ new llama_sampler_dist { + {"dist-blue-noise"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .bn_rng = */ blue_noise_rng(16, seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); @@ -4119,10 +4000,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; } - if (smpl->iface == &llama_sampler_dist_blue_noise_i) { - return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur; - } - if (smpl->iface == &llama_sampler_mirostat_i) { return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; } From 766d86df29e34f2fdae74e4b37b8875aa02a1906 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 23:35:22 +0000 Subject: [PATCH 05/13] llama : cleanup and restore alternate code path --- src/llama-sampling.cpp | 48 +++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c41666aaa7..060538eb12 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -214,7 +214,8 @@ static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p->sorted = true; } -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { +template +static int llama_sample_dist(llama_token_data_array * cur_p, RNG & rng) { // iterator for the probabilities #ifdef __GNUC__ #pragma GCC diagnostic push @@ -334,6 +335,10 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } // pseudo-random number generator with ~6db/octave blue noise +// important: blue noise properties cannot be preserved when +// the generator is used for multiple purposes simultaneously +// nor when multiple next calls are used to construct a larger value +// nor when integer outputs are used with the 
modulo operator struct blue_noise_rng { uint8_t bit_depth = 0; uint32_t seed = 0; @@ -436,16 +441,38 @@ struct blue_noise_rng { struct llama_dist_rng { virtual ~llama_dist_rng() = default; + virtual uint32_t rng_min() = 0; + virtual uint32_t rng_max() = 0; + virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; }; +// adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution +// note: not guaranteed to preserve blue noise properties +struct llama_dist_urbg { + using result_type = uint32_t; + + llama_dist_rng & rng; + + result_type min() { return rng.rng_min(); } + result_type max() { return rng.rng_max(); } + result_type operator()() { return rng.next(); } +}; + struct llama_dist_rng_white : llama_dist_rng { std::mt19937 rng; llama_dist_rng_white(uint32_t seed) : rng(seed) {} + uint32_t rng_min() override { return std::mt19937::min(); } + uint32_t rng_max() override { return std::mt19937::max(); } + + uint32_t next() override { + return rng(); + } + double nextf() override { std::uniform_real_distribution dist(0.0, 1.0); return dist(rng); @@ -467,6 +494,13 @@ struct llama_dist_rng_blue : llama_dist_rng { llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + uint32_t rng_min() override { return 0; } + uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } + + uint32_t next() override { + return bn_rng.next(); + } + double nextf() override { return bn_rng.nextf(); } @@ -1249,15 +1283,9 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da cur_p->data[i].p /= sum_cum; } - const double rnd = ctx->rng->nextf(); - double cum = 0.0; - for (size_t i = 0; i < cur_p->size; ++i) { - cum += cur_p->data[i].p; - if (cum >= rnd) { - cur_p->selected = i; - break; - } - } + // this implementation is not guaranteed to preserve blue noise properties + llama_dist_urbg 
urbg{*ctx->rng}; + cur_p->selected = llama_sample_dist(cur_p, urbg); #endif } From ad73188337ad45a60d404d8cdb97a0eaee6e2599 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 00:53:51 +0000 Subject: [PATCH 06/13] llama : note on blue noise properties --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 060538eb12..408bdacccc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -335,6 +335,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } // pseudo-random number generator with ~6db/octave blue noise +// this generator produces a uniform distribution // important: blue noise properties cannot be preserved when // the generator is used for multiple purposes simultaneously // nor when multiple next calls are used to construct a larger value From d5def78bb0b32c686e8d566493acead71d8f0535 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 03:15:55 +0000 Subject: [PATCH 07/13] llama : note on blue noise --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 408bdacccc..9b651c816d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -334,6 +334,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +// generative error diffusion for sequential blue noise // pseudo-random number generator with ~6db/octave blue noise // this generator produces a uniform distribution // important: blue noise properties cannot be preserved when From e829f2904e3ea76e8ce8982eee9fc1adc6a95602 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 10:46:18 +0000 Subject: [PATCH 08/13] sampling : blue noise requires tokens to be sorted --- src/llama-sampling.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 
9b651c816d..0e7d4cb178 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -443,9 +443,10 @@ struct blue_noise_rng { struct llama_dist_rng { virtual ~llama_dist_rng() = default; + virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties virtual uint32_t rng_min() = 0; virtual uint32_t rng_max() = 0; - virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] + virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; @@ -468,6 +469,8 @@ struct llama_dist_rng_white : llama_dist_rng { llama_dist_rng_white(uint32_t seed) : rng(seed) {} + bool requires_sorted() override { return false; } + uint32_t rng_min() override { return std::mt19937::min(); } uint32_t rng_max() override { return std::mt19937::max(); } @@ -496,6 +499,8 @@ struct llama_dist_rng_blue : llama_dist_rng { llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + bool requires_sorted() override { return true; } + uint32_t rng_min() override { return 0; } uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } @@ -1234,6 +1239,11 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da return; } + // sort if required by the RNG (e.g., blue noise needs sorted input for proper temporal properties) + if (ctx->rng->requires_sorted() && !cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); + } + // max logit for numerical stability float max_l = cur_p->data[0].logit; if (!cur_p->sorted) { From 267cd808a2adff78004b0f96bcde9b2b7d37f0f1 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 02:42:21 +0000 Subject: [PATCH 09/13] sampling : cleanup blue noise rng with some more notes --- src/llama-sampler.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/llama-sampler.cpp 
b/src/llama-sampler.cpp index 9db31d4a2c..afa5fe16be 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -355,6 +355,10 @@ struct blue_noise_rng { init(bit_depth, seed); } + // currently this uses lowbias32 as the white noise RNG source + // in practice, any white noise RNG source works + // this random noise is used to perturb the error diffusion weights (binary decision) + // as well as to fill in the low bits of the double precision output to eliminate aliasing static uint32_t hash(uint32_t x) { // lowbias32 x ^= x >> 16; x *= 0x21f0aaad; x ^= x >> 15; x *= 0x735a2d97; @@ -377,6 +381,7 @@ struct blue_noise_rng { position = 0; // 5 reachable states with distribution 3:3:2:1:1 + // established based on empirical testing static const int8_t tbl[10][2] = { { 0, 0}, { 0, 0}, { 0, 0}, {-1, 0}, {-1, 0}, {-1, 0}, @@ -444,9 +449,13 @@ struct llama_dist_rng { virtual ~llama_dist_rng() = default; virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties + + // for compatibility with std::discrete_distribution + // only used in a disabled branch of llama_sampler_dist_apply virtual uint32_t rng_min() = 0; virtual uint32_t rng_max() = 0; virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] + virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; @@ -454,6 +463,7 @@ struct llama_dist_rng { // adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution // note: not guaranteed to preserve blue noise properties +// this is only used in a disabled branch of llama_sampler_dist_apply, added for compatibility struct llama_dist_urbg { using result_type = uint32_t; @@ -464,10 +474,10 @@ struct llama_dist_urbg { result_type operator()() { return rng.next(); } }; -struct llama_dist_rng_white : llama_dist_rng { +struct llama_dist_rng_mt19937 : llama_dist_rng { std::mt19937 rng; - llama_dist_rng_white(uint32_t seed) :
rng(seed) {} + llama_dist_rng_mt19937(uint32_t seed) : rng(seed) {} bool requires_sorted() override { return false; } @@ -488,7 +498,7 @@ struct llama_dist_rng_white : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); + auto c = std::make_unique(0); c->rng = rng; return c; } @@ -1432,7 +1442,7 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { {"dist"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .rng = */ std::make_unique(seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); From 1b1b2cbe0e2660a88b98f0671bfa98cec7213530 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 04:27:52 +0000 Subject: [PATCH 10/13] sampling : also apply sorting in backend path when blue noise rng is selected --- src/llama-sampler.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index afa5fe16be..dc7394ae51 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -1361,6 +1361,30 @@ static void llama_sampler_dist_backend_apply( ggml_set_name (sctx->inp_uniform, "uniform"); ggml_set_input(sctx->inp_uniform); + // If the RNG requires sorted input (e.g., blue noise), sort logits first + // so the CDF walk operates in probability-rank space, not arbitrary vocab order.
+ if (sctx->rng->requires_sorted()) { + auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(ggml_nrows(a) == 1); + struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]); + struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b); + return ggml_reshape_1d(ctx, a_sorted, a->ne[0]); + }; + + struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC); + ggml_set_name(sorted_idx, "dist_sorted_idx"); + + data->logits = ggml_sort(data->logits, sorted_idx); + ggml_set_name(data->logits, "dist_sorted_logits"); + + if (data->candidates) { + data->candidates = ggml_sort(data->candidates, sorted_idx); + } else { + data->candidates = sorted_idx; + } + ggml_set_name(data->candidates, "dist_sorted_candidates"); + } + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); ggml_set_name(probs, "dist_probs"); From 15ade86a75d1b26f61be1d084f004e5df00d421f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 04:49:37 +0000 Subject: [PATCH 11/13] sampling : simplify clone --- src/llama-sampler.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index dc7394ae51..0aa000f319 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -498,9 +498,7 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); - c->rng = rng; - return c; + return std::make_unique(*this); } }; @@ -527,9 +525,7 @@ struct llama_dist_rng_blue : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); - c->bn_rng = bn_rng; - return c; + return std::make_unique(*this); } }; From a0323a989df46cacc0c17f5577ec44080b702f28 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 05:03:21 +0000 Subject: [PATCH 12/13] sampling : comment on state use --- src/llama-sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff
--git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 0aa000f319..0d1586508e 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -371,7 +371,7 @@ struct blue_noise_rng { seed = hash(s); const int n = (1 << bit_depth) - 1; - states.resize(n); + states.resize(n); // at 16-bit depth, this uses 128KB of state reset(); } From 23b5a5c026f77e0a51361e7dc5ec3292f481f65f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 09:19:23 +0000 Subject: [PATCH 13/13] sampling : fix whitespace --- src/llama-sampler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 0d1586508e..275e4c5b56 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -337,7 +337,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // generative error diffusion for sequential blue noise // pseudo-random number generator with ~6db/octave blue noise // this generator produces a uniform distribution -// important: blue noise properties cannot be preserved when +// important: blue noise properties cannot be preserved when // the generator is used for multiple purposes simultaneously // nor when multiple next calls are used to construct a larger value // nor when integer outputs are used with the modulo operator @@ -449,7 +449,7 @@ struct llama_dist_rng { virtual ~llama_dist_rng() = default; virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties - + // for compatibility with std::discrete_distribution // only used in a disabled branch of llama_sampler_dist_apply virtual uint32_t rng_min() = 0;