From e856c8f95933a1896c0ebdb5cc24057dd04e5676 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 17:43:49 +0000 Subject: [PATCH 01/25] llama : add blue noise rng implementation --- src/llama-sampling.cpp | 75 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 515d6c163b..5dd094ce7a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -333,6 +333,81 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation +struct blue_noise_rng { + uint8_t bit_depth = 0; + uint32_t seed = 0; + uint32_t position = 0; + + // binary tree of 1-bit 50% duty cycle blue noise generators + std::vector> states; // {err0, err1} per tree node + + blue_noise_rng() = default; + + blue_noise_rng(uint8_t bit_depth, uint32_t seed) { + init(bit_depth, seed); + } + + static uint32_t hash(uint32_t x) { // lowbias32 + x ^= x >> 16; x *= 0x21f0aaad; + x ^= x >> 15; x *= 0x735a2d97; + x ^= x >> 15; + return x; + } + + void init(uint8_t depth, uint32_t s) { + bit_depth = std::clamp(depth, 1, 16); + seed = hash(s); + + const int n = (1 << bit_depth) - 1; + states.resize(n); + + reset(); + } + + void reset() { + const int n = (int)states.size(); + position = 0; + + // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10) + static const int8_t tbl[10][2] = { + { 0, 0}, { 0, 0}, { 0, 0}, + {-1, 0}, {-1, 0}, {-1, 0}, + { 0, -1}, { 0, -1}, + {-2, 0}, + {-1, -1}, + }; + for (int i = 0; i < n; i++) { + uint32_t h = hash((uint32_t)i ^ seed) % 10; + states[i] = {tbl[h][0], tbl[h][1]}; // random initial state + } + } + + uint16_t next() { + uint32_t h = hash(position ^ seed); + position++; + + // traverse binary tree root-to-leaf, one error diffusion ditherer per bit + uint32_t acc = 0; + for (int level = 0; level < bit_depth; level++) { + auto & s = states[(1 << level) - 1 + acc]; // heap-style index + + int out = (s[0] >= 0) ? 1 : 0; + int8_t qe = s[0] + (int8_t)(out ? -1 : 1); // inverse autocorrelation + + s[0] = s[1]; // step forward + s[1] = 0; + + // error diffusion dithering using binary weight perturbation + s[(h >> level) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2 + + acc = acc * 2 + out; + } + + return (uint16_t)acc; + } +}; + static uint32_t get_rng_seed(uint32_t seed) { if (seed == LLAMA_DEFAULT_SEED) { // use system clock if std::random_device is not a true RNG From b2ee2fbc0a8e9fef59a8ab27d93cb99b4a23b782 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 18:18:56 +0000 Subject: [PATCH 02/25] llama : add floating point blue noise rng --- src/llama-sampling.cpp | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5dd094ce7a..7c83095582 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -333,13 +333,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } -// pseudo-random number generator with ~6db/octave blue noise temporal autocorrelation +// pseudo-random number generator with ~6db/octave blue noise struct blue_noise_rng { uint8_t bit_depth = 0; uint32_t seed = 0; uint32_t position = 0; - // binary tree of 1-bit 50% duty cycle blue noise generators + // binary tree of 1-bit 50% duty cycle error diffusion dithering blue noise generators std::vector> states; // {err0, err1} per tree node blue_noise_rng() = default; @@ -383,11 +383,12 @@ struct blue_noise_rng { } } - uint16_t next() { + uint16_t next(uint32_t * hash_remainder = nullptr) { uint32_t h = hash(position ^ seed); position++; - // traverse binary tree root-to-leaf, one error diffusion ditherer per bit + // traverse binary tree, one error diffusion ditherer per population split + // thresholding output at any value still produces blue noise uint32_t acc = 0; for (int level = 0; level < bit_depth; level++) { auto & s = states[(1 << level) - 1 + acc]; // heap-style index @@ -404,8 +405,31 @@ struct blue_noise_rng { acc = acc * 2 + out; } + if (hash_remainder) { + *hash_remainder = h >> bit_depth; // unused bits from random hash + } + return (uint16_t)acc; } + + // blue noise in the upper bit_depth bits, white noise hash remainder in the lower bits + // do not use with modulo operator, as it would just produce white noise + uint32_t next32() { + uint32_t rem; + uint32_t val = next(&rem); + return (val << (32 - bit_depth)) | rem; + } + + // uniform double in [0, 1) with blue noise temporal autocorrelation + double nextf() { + double res = 0.0; + res += hash(position ^ ~seed); // fill low bits with white noise + res *= 1.0 / 4294967296.0; + res += next32(); + res *= 1.0 / 4294967296.0; + if (res >= 1.0) res = std::nextafter(1.0, 0.0); + return res; + } }; static uint32_t get_rng_seed(uint32_t seed) { From f271576d81ca920d5d35a76f44a663da47608adb Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 19:12:50 +0000 Subject: [PATCH 03/25] llama : initial blue noise test implementation --- common/arg.cpp | 7 ++ common/common.h | 1 + common/sampling.cpp | 6 +- include/llama.h | 3 +- src/llama-sampling.cpp | 197 ++++++++++++++++++++++++++++++++++- tools/server/server-task.cpp | 3 + 6 files changed, 214 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..924b5198a2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1577,6 +1577,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.ignore_eos = true; } ).set_sparam()); + add_opt(common_arg( + {"--blue-noise"}, + "use blue noise RNG for sampling instead of white noise", + [](common_params & params) { + params.sampling.blue_noise = true; + } + ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.2f)", (double)params.sampling.temp), diff --git a/common/common.h b/common/common.h index 398ebb0960..0a76a1e26c 100644 --- a/common/common.h +++ b/common/common.h @@ -209,6 +209,7 @@ struct common_params_sampling { bool ignore_eos = false; bool no_perf = false; // disable performance metrics bool timing_per_token = false; + bool blue_noise = false; // use blue noise RNG instead of white noise for dist sampler uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/common/sampling.cpp b/common/sampling.cpp index 11a1d48398..2811eb3a48 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -313,7 +313,11 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed)); } else { // default: sample from distribution - samplers.push_back(llama_sampler_init_dist(params.seed)); + if (params.blue_noise) { + samplers.push_back(llama_sampler_init_dist_blue_noise(params.seed)); + } else { + samplers.push_back(llama_sampler_init_dist(params.seed)); + } } } else if (params.mirostat == 1) { samplers.push_back(llama_sampler_init_temp(params.temp)); diff --git a/include/llama.h b/include/llama.h index bf4e28a8be..22f08e1683 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1295,7 +1295,8 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); /// seed == LLAMA_DEFAULT_SEED to use a random seed. - LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// Setting k <= 0 makes this a noop diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 7c83095582..09fd3a4700 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -369,7 +369,7 @@ struct blue_noise_rng { const int n = (int)states.size(); position = 0; - // 5 reachable states with stationary distribution 3:3:2:1:1 (out of 10) + // 5 reachable states with distribution 3:3:2:1:1 static const int8_t tbl[10][2] = { { 0, 0}, { 0, 0}, { 0, 0}, {-1, 0}, {-1, 0}, {-1, 0}, @@ -1340,6 +1340,197 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { ); } +// dist (blue noise) + +struct llama_sampler_dist_blue_noise : public llama_sampler_backend { + const uint32_t seed; + uint32_t seed_cur; + + blue_noise_rng bn_rng; + + ggml_tensor * inp_uniform; +}; + +static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + return sctx->get_name(); +} + +static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + // edge cases + if (cur_p->size == 0) { + cur_p->selected = -1; + return; + } + + cur_p->selected = 0; + + if (cur_p->size == 1) { + cur_p->data[0].p = 1.0f; + return; + } + + // max logit for numerical stability + float max_l = cur_p->data[0].logit; + if (!cur_p->sorted) { + for (size_t i = 1; i < cur_p->size; ++i) { + max_l = std::max(max_l, cur_p->data[i].logit); + } + } + + // apply softmax to obtain the probabilities + double sum_cum = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + sum_cum += p; + } + + // sample using blue noise RNG + const double rnd = ctx->bn_rng.nextf(); + + double sum_run = 0.0f; + const double sum_tgt = sum_cum*rnd; + + bool found = false; + for (size_t i = 0; i < cur_p->size; ++i) { + if (!found) { + sum_run += cur_p->data[i].p; + if (sum_run >= sum_tgt) { + cur_p->selected = i; + found = true; + } + } + + // normalize probs + cur_p->data[i].p /= sum_cum; + } + + assert(found); + if (!found) { + cur_p->selected = cur_p->size - 1; + } +} + +static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->bn_rng.init(16, ctx->seed_cur); +} + +static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx; + auto * result = llama_sampler_init_dist_blue_noise(ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx; + + result_ctx->seed_cur = ctx->seed_cur; + result_ctx->bn_rng = ctx->bn_rng; + } + + return result; +} + +static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) { + delete (llama_sampler_dist_blue_noise *) smpl->ctx; +} + +static bool llama_sampler_dist_blue_noise_backend_init( + struct llama_sampler * smpl, + ggml_backend_buffer_type_t buft) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + const bool res = llama_sampler_backend_support(smpl, buft); + + sctx->init(res); + + return res; +} + +static void llama_sampler_dist_blue_noise_backend_apply( + struct llama_sampler * smpl, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct llama_sampler_data * data) { + GGML_UNUSED(gf); + + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name (sctx->inp_uniform, "uniform"); + ggml_set_input(sctx->inp_uniform); + + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); + ggml_set_name(probs, "dist_probs"); + + struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); + ggml_set_name(cumsum, "dist_cumsum"); + + struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); + ggml_set_name(diff, "dist_cumsum"); + + struct ggml_tensor * mask = ggml_step(ctx, diff); + ggml_set_name(mask, "dist_mask"); + + struct ggml_tensor * idxf = ggml_sum(ctx, mask); + ggml_set_name(idxf, "dist_index_f32"); + + struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); + ggml_set_name(idx, "dist_index_i32"); + + struct ggml_tensor * sampled_token = idx; + if (data->candidates != nullptr) { + struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); + + sampled_token = ggml_get_rows(ctx, candidates, idx); + ggml_set_name(sampled_token, "dist_sampled_token"); + } + + data->sampled = sampled_token; + data->probs = probs; +} + +static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) { + auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; + + GGML_ASSERT(sctx->inp_uniform != nullptr); + + const float rnd = (float)sctx->bn_rng.nextf(); + + ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); +} + +static struct llama_sampler_i llama_sampler_dist_blue_noise_i = { + /* .name = */ llama_sampler_dist_blue_noise_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_dist_blue_noise_apply, + /* .reset = */ llama_sampler_dist_blue_noise_reset, + /* .clone = */ llama_sampler_dist_blue_noise_clone, + /* .free = */ llama_sampler_dist_blue_noise_free, + /* .backend_init = */ llama_sampler_dist_blue_noise_backend_init, + /* .backend_accept = */ nullptr, + /* .backend_apply = */ llama_sampler_dist_blue_noise_backend_apply, + /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input, +}; + +struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return llama_sampler_init( + /* .iface = */ &llama_sampler_dist_blue_noise_i, + /* .ctx = */ new llama_sampler_dist_blue_noise { + ("dist-blue-noise"), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .bn_rng = */ blue_noise_rng(16, seed_cur), + /* .inp_uniform = */ nullptr, + } + ); +} + // top-k struct llama_sampler_top_k : public llama_sampler_backend { @@ -3928,6 +4119,10 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; } + if (smpl->iface == &llama_sampler_dist_blue_noise_i) { + return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur; + } + if (smpl->iface == &llama_sampler_mirostat_i) { return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..16c3cf12d0 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -66,6 +66,7 @@ json task_params::to_json(bool only_metrics) const { {"n_keep", n_keep}, {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, + {"blue_noise", sampling.blue_noise}, {"stream", stream}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, @@ -125,6 +126,7 @@ json task_params::to_json(bool only_metrics) const { {"n_keep", n_keep}, {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, + {"blue_noise", sampling.blue_noise}, {"stream", stream}, {"logit_bias", format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, @@ -467,6 +469,7 @@ task_params server_task::params_from_json_cmpl( } } + params.sampling.blue_noise = json_value(data, "blue_noise", params_base.sampling.blue_noise); params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); if (params.sampling.ignore_eos) { params.sampling.logit_bias.insert( From 3b4061981b92a091ed65b853add69adcc1a5a091 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 23:05:54 +0000 Subject: [PATCH 04/25] llama : make the sampler rng modular --- src/llama-sampling.cpp | 285 ++++++++++++----------------------------- 1 file changed, 81 insertions(+), 204 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 09fd3a4700..c41666aaa7 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -432,6 +432,56 @@ struct blue_noise_rng { } }; +// abstract RNG interface for the dist sampler +struct llama_dist_rng { + virtual ~llama_dist_rng() = default; + + virtual double nextf() = 0; // uniform double in [0, 1) + virtual void reseed(uint32_t s) = 0; + virtual std::unique_ptr clone() const = 0; +}; + +struct llama_dist_rng_white : llama_dist_rng { + std::mt19937 rng; + + llama_dist_rng_white(uint32_t seed) : rng(seed) {} + + double nextf() override { + std::uniform_real_distribution dist(0.0, 1.0); + return dist(rng); + } + + void reseed(uint32_t s) override { + rng.seed(s); + } + + std::unique_ptr clone() const override { + auto c = std::make_unique(0); + c->rng = rng; + return c; + } +}; + +struct llama_dist_rng_blue : llama_dist_rng { + blue_noise_rng bn_rng; + + llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + + double nextf() override { + return bn_rng.nextf(); + } + + void reseed(uint32_t s) override { + bn_rng.init(16, s); + } + + std::unique_ptr clone() const override { + auto c = std::make_unique(0); + c->bn_rng = bn_rng; + return c; + } +}; + static uint32_t get_rng_seed(uint32_t seed) { if (seed == LLAMA_DEFAULT_SEED) { // use system clock if std::random_device is not a true RNG @@ -1122,7 +1172,7 @@ struct llama_sampler_dist : public llama_sampler_backend { const uint32_t seed; uint32_t seed_cur; - std::mt19937 rng; + std::unique_ptr rng; ggml_tensor * inp_uniform; }; @@ -1168,8 +1218,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da // sample from the obtained probabilities and normalize the probs in a single pass // this is ~3x faster on Mac with full gpt-oss vocab than the version below // - std::uniform_real_distribution dist(0.0f, 1.0f); - const double rnd = dist(ctx->rng); + const double rnd = ctx->rng->nextf(); double sum_run = 0.0f; const double sum_tgt = sum_cum*rnd; @@ -1200,28 +1249,37 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da cur_p->data[i].p /= sum_cum; } - cur_p->selected = llama_sample_dist(cur_p, ctx->rng); + const double rnd = ctx->rng->nextf(); + double cum = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + cum += cur_p->data[i].p; + if (cum >= rnd) { + cur_p->selected = i; + break; + } + } #endif } static void llama_sampler_dist_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_dist *) smpl->ctx; ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->rng.seed(ctx->seed_cur); + ctx->rng->reseed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist *) smpl->ctx; - auto * result = llama_sampler_init_dist(ctx->seed); + auto * ctx = (llama_sampler_dist *) smpl->ctx; - // copy the state - { - auto * result_ctx = (llama_sampler_dist *) result->ctx; - - result_ctx->rng = ctx->rng; - } - - return result; + return llama_sampler_init( + /* .iface = */ smpl->iface, + /* .ctx = */ new llama_sampler_dist { + {ctx->get_name()}, + /* .seed = */ ctx->seed, + /* .seed_cur = */ ctx->seed_cur, + /* .rng = */ ctx->rng->clone(), + /* .inp_uniform = */ nullptr, + } + ); } static void llama_sampler_dist_free(struct llama_sampler * smpl) { @@ -1307,8 +1365,8 @@ static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) { // std::uniform_real_distribution and // std::uniform_real_distribution with same rng will produce // different sequences). - std::uniform_real_distribution dist(0.0f, 1.0f); - const float rnd = dist(sctx->rng); + // nextf returns double, equivalent to std::uniform_real_distribution + const float rnd = (float)sctx->rng->nextf(); ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); } @@ -1331,201 +1389,24 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { return llama_sampler_init( /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { - ("dist"), + {"dist"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .rng = */ std::mt19937(seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); } -// dist (blue noise) - -struct llama_sampler_dist_blue_noise : public llama_sampler_backend { - const uint32_t seed; - uint32_t seed_cur; - - blue_noise_rng bn_rng; - - ggml_tensor * inp_uniform; -}; - -static const char * llama_sampler_dist_blue_noise_name(const struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - return sctx->get_name(); -} - -static void llama_sampler_dist_blue_noise_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - // edge cases - if (cur_p->size == 0) { - cur_p->selected = -1; - return; - } - - cur_p->selected = 0; - - if (cur_p->size == 1) { - cur_p->data[0].p = 1.0f; - return; - } - - // max logit for numerical stability - float max_l = cur_p->data[0].logit; - if (!cur_p->sorted) { - for (size_t i = 1; i < cur_p->size; ++i) { - max_l = std::max(max_l, cur_p->data[i].logit); - } - } - - // apply softmax to obtain the probabilities - double sum_cum = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - float p = expf(cur_p->data[i].logit - max_l); - cur_p->data[i].p = p; - sum_cum += p; - } - - // sample using blue noise RNG - const double rnd = ctx->bn_rng.nextf(); - - double sum_run = 0.0f; - const double sum_tgt = sum_cum*rnd; - - bool found = false; - for (size_t i = 0; i < cur_p->size; ++i) { - if (!found) { - sum_run += cur_p->data[i].p; - if (sum_run >= sum_tgt) { - cur_p->selected = i; - found = true; - } - } - - // normalize probs - cur_p->data[i].p /= sum_cum; - } - - assert(found); - if (!found) { - cur_p->selected = cur_p->size - 1; - } -} - -static void llama_sampler_dist_blue_noise_reset(struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - ctx->seed_cur = get_rng_seed(ctx->seed); - ctx->bn_rng.init(16, ctx->seed_cur); -} - -static struct llama_sampler * llama_sampler_dist_blue_noise_clone(const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist_blue_noise *) smpl->ctx; - auto * result = llama_sampler_init_dist_blue_noise(ctx->seed); - - // copy the state - { - auto * result_ctx = (llama_sampler_dist_blue_noise *) result->ctx; - - result_ctx->seed_cur = ctx->seed_cur; - result_ctx->bn_rng = ctx->bn_rng; - } - - return result; -} - -static void llama_sampler_dist_blue_noise_free(struct llama_sampler * smpl) { - delete (llama_sampler_dist_blue_noise *) smpl->ctx; -} - -static bool llama_sampler_dist_blue_noise_backend_init( - struct llama_sampler * smpl, - ggml_backend_buffer_type_t buft) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - const bool res = llama_sampler_backend_support(smpl, buft); - - sctx->init(res); - - return res; -} - -static void llama_sampler_dist_blue_noise_backend_apply( - struct llama_sampler * smpl, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct llama_sampler_data * data) { - GGML_UNUSED(gf); - - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - ggml_set_name (sctx->inp_uniform, "uniform"); - ggml_set_input(sctx->inp_uniform); - - struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); - ggml_set_name(probs, "dist_probs"); - - struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs); - ggml_set_name(cumsum, "dist_cumsum"); - - struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform); - ggml_set_name(diff, "dist_cumsum"); - - struct ggml_tensor * mask = ggml_step(ctx, diff); - ggml_set_name(mask, "dist_mask"); - - struct ggml_tensor * idxf = ggml_sum(ctx, mask); - ggml_set_name(idxf, "dist_index_f32"); - - struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32); - ggml_set_name(idx, "dist_index_i32"); - - struct ggml_tensor * sampled_token = idx; - if (data->candidates != nullptr) { - struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates)); - - sampled_token = ggml_get_rows(ctx, candidates, idx); - ggml_set_name(sampled_token, "dist_sampled_token"); - } - - data->sampled = sampled_token; - data->probs = probs; -} - -static void llama_sampler_dist_blue_noise_backend_set_input(struct llama_sampler * smpl) { - auto * sctx = (llama_sampler_dist_blue_noise *) smpl->ctx; - - GGML_ASSERT(sctx->inp_uniform != nullptr); - - const float rnd = (float)sctx->bn_rng.nextf(); - - ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float)); -} - -static struct llama_sampler_i llama_sampler_dist_blue_noise_i = { - /* .name = */ llama_sampler_dist_blue_noise_name, - /* .accept = */ nullptr, - /* .apply = */ llama_sampler_dist_blue_noise_apply, - /* .reset = */ llama_sampler_dist_blue_noise_reset, - /* .clone = */ llama_sampler_dist_blue_noise_clone, - /* .free = */ llama_sampler_dist_blue_noise_free, - /* .backend_init = */ llama_sampler_dist_blue_noise_backend_init, - /* .backend_accept = */ nullptr, - /* .backend_apply = */ llama_sampler_dist_blue_noise_backend_apply, - /* .backend_set_input = */ llama_sampler_dist_blue_noise_backend_set_input, -}; - struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) { auto seed_cur = get_rng_seed(seed); return llama_sampler_init( - /* .iface = */ &llama_sampler_dist_blue_noise_i, - /* .ctx = */ new llama_sampler_dist_blue_noise { - ("dist-blue-noise"), + /* .iface = */ &llama_sampler_dist_i, + /* .ctx = */ new llama_sampler_dist { + {"dist-blue-noise"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .bn_rng = */ blue_noise_rng(16, seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); @@ -4119,10 +4000,6 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; } - if (smpl->iface == &llama_sampler_dist_blue_noise_i) { - return ((const llama_sampler_dist_blue_noise *) smpl->ctx)->seed_cur; - } - if (smpl->iface == &llama_sampler_mirostat_i) { return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; } From 766d86df29e34f2fdae74e4b37b8875aa02a1906 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Wed, 4 Feb 2026 23:35:22 +0000 Subject: [PATCH 05/25] llama : cleanup and restore alternate code path --- src/llama-sampling.cpp | 48 +++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c41666aaa7..060538eb12 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -214,7 +214,8 @@ static void llama_token_data_array_partial_sort_inplace(llama_token_data_array * cur_p->sorted = true; } -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { +template +static int llama_sample_dist(llama_token_data_array * cur_p, RNG & rng) { // iterator for the probabilities #ifdef __GNUC__ #pragma GCC diagnostic push @@ -334,6 +335,10 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } // pseudo-random number generator with ~6db/octave blue noise +// important: blue noise properties cannot be preserved when +// the generator is used for multiple purposes simultaneously +// nor when multiple next calls are used to construct a larger value +// nor when integer outputs are used with the modulo operator struct blue_noise_rng { uint8_t bit_depth = 0; uint32_t seed = 0; @@ -436,16 +441,38 @@ struct blue_noise_rng { struct llama_dist_rng { virtual ~llama_dist_rng() = default; + virtual uint32_t rng_min() = 0; + virtual uint32_t rng_max() = 0; + virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; }; +// adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution +// note: not guaranteed to preserve blue noise properties +struct llama_dist_urbg { + using result_type = uint32_t; + + llama_dist_rng & rng; + + result_type min() { return rng.rng_min(); } + result_type max() { return rng.rng_max(); } + result_type operator()() { return rng.next(); } +}; + struct llama_dist_rng_white : llama_dist_rng { std::mt19937 rng; llama_dist_rng_white(uint32_t seed) : rng(seed) {} + uint32_t rng_min() override { return std::mt19937::min(); } + uint32_t rng_max() override { return std::mt19937::max(); } + + uint32_t next() override { + return rng(); + } + double nextf() override { std::uniform_real_distribution dist(0.0, 1.0); return dist(rng); @@ -467,6 +494,13 @@ struct llama_dist_rng_blue : llama_dist_rng { llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + uint32_t rng_min() override { return 0; } + uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } + + uint32_t next() override { + return bn_rng.next(); + } + double nextf() override { return bn_rng.nextf(); } @@ -1249,15 +1283,9 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da cur_p->data[i].p /= sum_cum; } - const double rnd = ctx->rng->nextf(); - double cum = 0.0; - for (size_t i = 0; i < cur_p->size; ++i) { - cum += cur_p->data[i].p; - if (cum >= rnd) { - cur_p->selected = i; - break; - } - } + // this implementation is not guaranteed to preserve blue noise properties + llama_dist_urbg urbg{*ctx->rng}; + cur_p->selected = llama_sample_dist(cur_p, urbg); #endif } From ad73188337ad45a60d404d8cdb97a0eaee6e2599 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 00:53:51 +0000 Subject: [PATCH 06/25] llama : note on blue noise properties --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 060538eb12..408bdacccc 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -335,6 +335,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) } // pseudo-random number generator with ~6db/octave blue noise +// this generator produces a uniform distribution // important: blue noise properties cannot be preserved when // the generator is used for multiple purposes simultaneously // nor when multiple next calls are used to construct a larger value From d5def78bb0b32c686e8d566493acead71d8f0535 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 03:15:55 +0000 Subject: [PATCH 07/25] llama : note on blue noise --- src/llama-sampling.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 408bdacccc..9b651c816d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -334,6 +334,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +// generative error diffusion for sequential blue noise // pseudo-random number generator with ~6db/octave blue noise // this generator produces a uniform distribution // important: blue noise properties cannot be preserved when From e829f2904e3ea76e8ce8982eee9fc1adc6a95602 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Thu, 5 Feb 2026 10:46:18 +0000 Subject: [PATCH 08/25] sampling : blue noise requires tokens to be sorted --- src/llama-sampling.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 9b651c816d..0e7d4cb178 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -443,9 +443,10 @@ struct blue_noise_rng { struct llama_dist_rng { virtual ~llama_dist_rng() = default; + virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties virtual uint32_t rng_min() = 0; virtual uint32_t rng_max() = 0; - virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] + virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; @@ -468,6 +469,8 @@ struct llama_dist_rng_white : llama_dist_rng { llama_dist_rng_white(uint32_t seed) : rng(seed) {} + bool requires_sorted() override { return false; } + uint32_t rng_min() override { return std::mt19937::min(); } uint32_t rng_max() override { return std::mt19937::max(); } @@ -496,6 +499,8 @@ struct llama_dist_rng_blue : llama_dist_rng { llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + bool requires_sorted() override { return true; } + uint32_t rng_min() override { return 0; } uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } @@ -1234,6 +1239,11 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da return; } + // sort if required by the RNG (e.g., blue noise needs sorted input for proper temporal properties) + if (ctx->rng->requires_sorted() && !cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); + } + // max logit for numerical stability float max_l = cur_p->data[0].logit; if (!cur_p->sorted) { From 267cd808a2adff78004b0f96bcde9b2b7d37f0f1 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 02:42:21 +0000 Subject: [PATCH 09/25] sampling : cleanup blue noise rng with some more notes --- src/llama-sampler.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 9db31d4a2c..afa5fe16be 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -355,6 +355,10 @@ struct blue_noise_rng { init(bit_depth, seed); } + // currently this uses lowbias32 as the white noise RNG source + // in practice, any white noise RNG source works + // this random noise is used to perturb the error diffusion weights (binary decision) + // as well as to fill in the low bits of the double precision output to eliminate aliasing static uint32_t hash(uint32_t x) { // lowbias32 x ^= x >> 16; x *= 0x21f0aaad; x ^= x >> 15; x *= 0x735a2d97; @@ -377,6 +381,7 @@ struct blue_noise_rng { position = 0; // 5 reachable states with distribution 3:3:2:1:1 + // established based on empirical testing static const int8_t tbl[10][2] = { { 0, 0}, { 0, 0}, { 0, 0}, {-1, 0}, {-1, 0}, {-1, 0}, @@ -444,9 +449,13 @@ struct llama_dist_rng { virtual ~llama_dist_rng() = default; virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties + + // for compatilibility with std::discrete_distribution + // nly used in a disabled branch of llama_sampler_dist_apply virtual uint32_t rng_min() = 0; virtual uint32_t rng_max() = 0; virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] + virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; virtual std::unique_ptr clone() const = 0; @@ -454,6 +463,7 @@ struct llama_dist_rng { // adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution // note: not guaranteed to preserve blue noise properties +// this is only used in a disabled branch of llama_sampler_dist_apply, added for compatibility struct llama_dist_urbg { using result_type = uint32_t; @@ -464,10 +474,10 @@ struct llama_dist_urbg { result_type operator()() { return rng.next(); } }; -struct llama_dist_rng_white : llama_dist_rng { +struct llama_dist_rng_mt19937 : llama_dist_rng { std::mt19937 rng; - llama_dist_rng_white(uint32_t seed) : rng(seed) {} + llama_dist_rng_mt19937(uint32_t seed) : rng(seed) {} bool requires_sorted() override { return false; } @@ -488,7 +498,7 @@ struct llama_dist_rng_white : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); + auto c = std::make_unique(0); c->rng = rng; return c; } @@ -1432,7 +1442,7 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { {"dist"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .rng = */ std::make_unique(seed_cur), + /* .rng = */ std::make_unique(seed_cur), /* .inp_uniform = */ nullptr, } ); From 1b1b2cbe0e2660a88b98f0671bfa98cec7213530 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 04:27:52 +0000 Subject: [PATCH 10/25] sampling : also apply sorting in backend path when blue noise rng is selected --- src/llama-sampler.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index afa5fe16be..dc7394ae51 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -1361,6 +1361,30 @@ static void llama_sampler_dist_backend_apply( ggml_set_name (sctx->inp_uniform, "uniform"); ggml_set_input(sctx->inp_uniform); + // If the RNG requires sorted input (e.g., blue noise), sort logits first + // so the CDF walk operates in probability-rank space, not arbitrary vocab order. + if (sctx->rng->requires_sorted()) { + auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(ggml_nrows(a) == 1); + struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]); + struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b); + return ggml_reshape_1d(ctx, a_sorted, a->ne[0]); + }; + + struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC); + ggml_set_name(sorted_idx, "dist_sorted_idx"); + + data->logits = ggml_sort(data->logits, sorted_idx); + ggml_set_name(data->logits, "dist_sorted_logits"); + + if (data->candidates) { + data->candidates = ggml_sort(data->candidates, sorted_idx); + } else { + data->candidates = sorted_idx; + } + ggml_set_name(data->candidates, "dist_sorted_candidates"); + } + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); ggml_set_name(probs, "dist_probs"); From 15ade86a75d1b26f61be1d084f004e5df00d421f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 04:49:37 +0000 Subject: [PATCH 11/25] sampling : simplify clone --- src/llama-sampler.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index dc7394ae51..0aa000f319 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -498,9 +498,7 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); - c->rng = rng; - return c; + return std::make_unique(*this); } }; @@ -527,9 +525,7 @@ struct llama_dist_rng_blue : llama_dist_rng { } std::unique_ptr clone() const override { - auto c = std::make_unique(0); - c->bn_rng = bn_rng; - return c; + return std::make_unique(*this); } }; From a0323a989df46cacc0c17f5577ec44080b702f28 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 05:03:21 +0000 Subject: [PATCH 12/25] sampling : comment on state use --- src/llama-sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 0aa000f319..0d1586508e 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -371,7 +371,7 @@ struct blue_noise_rng { seed = hash(s); const int n = (1 << bit_depth) - 1; - states.resize(n); + states.resize(n); // at 16-bit depth, this uses 128KB of state reset(); } From 23b5a5c026f77e0a51361e7dc5ec3292f481f65f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Sat, 7 Feb 2026 09:19:23 +0000 Subject: [PATCH 13/25] sampling : fix whitespace --- src/llama-sampler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 0d1586508e..275e4c5b56 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -337,7 +337,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // generative error diffusion for sequential blue noise // pseudo-random number generator with ~6db/octave blue noise // this generator produces a uniform distribution -// important: blue noise properties cannot be preserved when +// important: blue noise properties cannot be preserved when // the generator is used for multiple purposes simultaneously // nor when multiple next calls are used to construct a larger value // nor when integer outputs are used with the modulo operator @@ -449,7 +449,7 @@ struct llama_dist_rng { virtual ~llama_dist_rng() = default; virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties - + // for compatilibility with std::discrete_distribution // nly used in a disabled branch of llama_sampler_dist_apply virtual uint32_t rng_min() = 0; From 7f433763b6683b7f6e5a323729acf73a1a4e8ec9 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 02:47:42 +0000 Subject: [PATCH 14/25] sampling : implement disabled branch to support blue noise --- src/llama-sampler.cpp | 54 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 275e4c5b56..02258d981b 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -448,10 +448,12 @@ struct blue_noise_rng { struct llama_dist_rng { virtual ~llama_dist_rng() = default; - virtual bool requires_sorted() = 0; // whether the RNG requires sorted input for proper properties + // whether the RNG requires sorted input for proper properties + // this also indicates whether the RNG output itself must be consumed in a coherent order + virtual bool requires_sorted() = 0; - // for compatilibility with std::discrete_distribution - // nly used in a disabled branch of llama_sampler_dist_apply + // for compatibility with std::discrete_distribution + // only used in a disabled branch of llama_sampler_dist_apply virtual uint32_t rng_min() = 0; virtual uint32_t rng_max() = 0; virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] @@ -474,6 +476,48 @@ struct llama_dist_urbg { result_type operator()() { return rng.next(); } }; +// wrapper to use existing llama_sample_dist for mt19937, otherwise implements CDF walk directly +// this is currently only used in a disabled branch of llama_sampler_dist_apply, added for compatibility and potential use by other samplers +// flag normalized to skip recomputing the probability sum when probs already sum to 1 +static int llama_sample_dist_rng(llama_token_data_array * cur_p, llama_dist_rng & rng, bool normalized = false) { + if (!rng.requires_sorted()) { + llama_dist_urbg urbg{rng}; + return llama_sample_dist(cur_p, urbg); + } + + if (!cur_p->sorted) { + llama_token_data_array_partial_sort_inplace(cur_p, cur_p->size); + } + const double rnd = rng.nextf(); + + double sum_run = 0.0; + + if (normalized) { + for (size_t i = 0; i < cur_p->size; ++i) { + sum_run += cur_p->data[i].p; + if (sum_run >= rnd) { + return i; + } + } + } else { + double sum_cum = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + sum_cum += cur_p->data[i].p; + } + + const double sum_tgt = sum_cum * rnd; + + for (size_t i = 0; i < cur_p->size; ++i) { + sum_run += cur_p->data[i].p; + if (sum_run >= sum_tgt) { + return i; + } + } + } + + return (int)(cur_p->size - 1); +} + struct llama_dist_rng_mt19937 : llama_dist_rng { std::mt19937 rng; @@ -1301,9 +1345,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da cur_p->data[i].p /= sum_cum; } - // this implementation is not guaranteed to preserve blue noise properties - llama_dist_urbg urbg{*ctx->rng}; - cur_p->selected = llama_sample_dist(cur_p, urbg); + cur_p->selected = llama_sample_dist_rng(cur_p, *ctx->rng, true); #endif } From ae31b151e9a1ba8b1b17ecda07a26f3084869ed7 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 03:27:12 +0000 Subject: [PATCH 15/25] sampling : cleaner approach for constructing floating point value --- src/llama-sampler.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 02258d981b..ad0a20f0ff 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -434,13 +434,10 @@ struct blue_noise_rng { // uniform double in [0, 1) with blue noise temporal autocorrelation double nextf() { - double res = 0.0; - res += hash(position ^ ~seed); // fill low bits with white noise - res *= 1.0 / 4294967296.0; - res += next32(); - res *= 1.0 / 4294967296.0; - if (res >= 1.0) res = std::nextafter(1.0, 0.0); - return res; + uint32_t lo = hash(position ^ ~seed); // white noise low bits + uint32_t hi = next32(); // blue noise high bits + uint64_t combined = ((uint64_t)hi << 32) | lo; + return (combined >> 11) * 0x1.0p-53; } }; From 2c7269fd8da3f4b7899aed5f6754520bcc069a4f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 04:03:10 +0000 Subject: [PATCH 16/25] sampling : make white noise source for blue noise modular as well --- src/llama-sampler.cpp | 193 +++++++++++++++++++++++++++++------------- 1 file changed, 136 insertions(+), 57 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index ad0a20f0ff..2ddb2978eb 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -334,6 +334,27 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +// abstract RNG interface for the dist sampler +struct llama_dist_rng { + virtual ~llama_dist_rng() = default; + + // whether the RNG requires sorted input for proper properties + // this also indicates whether the RNG output itself must be consumed in a coherent order + virtual bool requires_sorted() = 0; + + // for compatibility with std::discrete_distribution + // only used in a disabled branch of llama_sampler_dist_apply + virtual uint32_t rng_min() = 0; + virtual uint32_t rng_max() = 0; + virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] + + virtual uint32_t next32() = 0; // uniform 32 bits + virtual uint64_t next64() = 0; // uniform 64 bits + virtual double nextf() = 0; // uniform double in [0, 1) + virtual void reseed(uint32_t s) = 0; + virtual std::unique_ptr clone() const = 0; +}; + // generative error diffusion for sequential blue noise // pseudo-random number generator with ~6db/octave blue noise // this generator produces a uniform distribution @@ -343,32 +364,38 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) // nor when integer outputs are used with the modulo operator struct blue_noise_rng { uint8_t bit_depth = 0; - uint32_t seed = 0; - uint32_t position = 0; + std::unique_ptr rng; // binary tree of 1-bit 50% duty cycle error diffusion dithering blue noise generators std::vector> states; // {err0, err1} per tree node blue_noise_rng() = default; - blue_noise_rng(uint8_t bit_depth, uint32_t seed) { - init(bit_depth, seed); + blue_noise_rng(uint8_t bit_depth, std::unique_ptr rng) { + init(bit_depth, std::move(rng)); } - // currently this uses lowbias32 as the white noise RNG source - // in practice, any white noise RNG source works - // this random noise is used to perturb the error diffusion weights (binary decision) - // as well as to fill in the low bits of the double precision output to eliminate aliasing - static uint32_t hash(uint32_t x) { // lowbias32 - x ^= x >> 16; x *= 0x21f0aaad; - x ^= x >> 15; x *= 0x735a2d97; - x ^= x >> 15; - return x; + // custom copy (clone the underlying RNG) + blue_noise_rng(const blue_noise_rng & other) + : bit_depth(other.bit_depth) + , rng(other.rng ? other.rng->clone() : nullptr) + , states(other.states) {} + + blue_noise_rng & operator=(const blue_noise_rng & other) { + if (this != &other) { + bit_depth = other.bit_depth; + rng = other.rng ? other.rng->clone() : nullptr; + states = other.states; + } + return *this; } - void init(uint8_t depth, uint32_t s) { + blue_noise_rng(blue_noise_rng &&) = default; + blue_noise_rng & operator=(blue_noise_rng &&) = default; + + void init(uint8_t depth, std::unique_ptr source) { bit_depth = std::clamp(depth, 1, 16); - seed = hash(s); + rng = std::move(source); const int n = (1 << bit_depth) - 1; states.resize(n); // at 16-bit depth, this uses 128KB of state @@ -376,9 +403,13 @@ struct blue_noise_rng { reset(); } + void reseed(uint32_t s) { + rng->reseed(s); + reset(); + } + void reset() { const int n = (int)states.size(); - position = 0; // 5 reachable states with distribution 3:3:2:1:1 // established based on empirical testing @@ -390,15 +421,12 @@ struct blue_noise_rng { {-1, -1}, }; for (int i = 0; i < n; i++) { - uint32_t h = hash((uint32_t)i ^ seed) % 10; + uint32_t h = rng->next32() % 10; states[i] = {tbl[h][0], tbl[h][1]}; // random initial state } } - uint16_t next(uint32_t * hash_remainder = nullptr) { - uint32_t h = hash(position ^ seed); - position++; - + uint16_t advance(uint32_t h) { // traverse binary tree, one error diffusion ditherer per population split // thresholding output at any value still produces blue noise uint32_t acc = 0; @@ -416,50 +444,39 @@ struct blue_noise_rng { acc = acc * 2 + out; } - - if (hash_remainder) { - *hash_remainder = h >> bit_depth; // unused bits from random hash - } - return (uint16_t)acc; } - // blue noise in the upper bit_depth bits, white noise hash remainder in the lower bits + uint16_t next() { + uint32_t h = rng->next32(); + return advance(h); + } + + // blue noise in the upper bit_depth bits, white noise in the lower bits // do not use with modulo operator, as it would just produce white noise uint32_t next32() { - uint32_t rem; - uint32_t val = next(&rem); - return (val << (32 - bit_depth)) | rem; + uint32_t h = rng->next32(); + uint32_t val = advance(h); + return (val << (32 - bit_depth)) | (h >> bit_depth); + } + + // blue noise in the upper bits, white noise in the lower bits + uint64_t next64() { + uint64_t r = rng->next64(); + uint32_t lo = (uint32_t)r; + uint32_t h = (uint32_t)(r >> 32); + uint32_t val = advance(h); + uint32_t hi = (val << (32 - bit_depth)) | (h >> bit_depth); + return ((uint64_t)hi << 32) | lo; } // uniform double in [0, 1) with blue noise temporal autocorrelation double nextf() { - uint32_t lo = hash(position ^ ~seed); // white noise low bits - uint32_t hi = next32(); // blue noise high bits - uint64_t combined = ((uint64_t)hi << 32) | lo; + uint64_t combined = next64(); return (combined >> 11) * 0x1.0p-53; } }; -// abstract RNG interface for the dist sampler -struct llama_dist_rng { - virtual ~llama_dist_rng() = default; - - // whether the RNG requires sorted input for proper properties - // this also indicates whether the RNG output itself must be consumed in a coherent order - virtual bool requires_sorted() = 0; - - // for compatibility with std::discrete_distribution - // only used in a disabled branch of llama_sampler_dist_apply - virtual uint32_t rng_min() = 0; - virtual uint32_t rng_max() = 0; - virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] - - virtual double nextf() = 0; // uniform double in [0, 1) - virtual void reseed(uint32_t s) = 0; - virtual std::unique_ptr clone() const = 0; -}; - // adapter to satisfy UniformRandomBitGenerator for std::discrete_distribution // note: not guaranteed to preserve blue noise properties // this is only used in a disabled branch of llama_sampler_dist_apply, added for compatibility @@ -515,6 +532,55 @@ static int llama_sample_dist_rng(llama_token_data_array * cur_p, llama_dist_rng return (int)(cur_p->size - 1); } +struct llama_dist_rng_lowbias32 : llama_dist_rng { + uint32_t hashed_seed = 0; + uint32_t position = 0; + + llama_dist_rng_lowbias32(uint32_t seed) : hashed_seed(hash(seed)), position(0) {} + + bool requires_sorted() override { return false; } + uint32_t rng_min() override { return 0; } + uint32_t rng_max() override { return UINT32_MAX; } + + static uint32_t hash(uint32_t x) { // lowbias32 + // coefficients from https://github.com/skeeto/hash-prospector/issues/19 + x ^= x >> 16; x *= 0x21f0aaad; + x ^= x >> 15; x *= 0x735a2d97; + x ^= x >> 15; + return x; + } + + uint32_t next() override { + uint32_t val = hash(position ^ hashed_seed); + position++; + return val; + } + + uint32_t next32() override { + return next(); + } + + uint64_t next64() override { + uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed + uint64_t hi = next(); + return (hi << 32) | lo; + } + + double nextf() override { + uint64_t combined = next64(); + return (combined >> 11) * 0x1.0p-53; + } + + void reseed(uint32_t s) override { + hashed_seed = hash(s); + position = 0; + } + + std::unique_ptr clone() const override { + return std::make_unique(*this); + } +}; + struct llama_dist_rng_mt19937 : llama_dist_rng { std::mt19937 rng; @@ -524,11 +590,18 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { uint32_t rng_min() override { return std::mt19937::min(); } uint32_t rng_max() override { return std::mt19937::max(); } + uint32_t next() override { return rng(); } - uint32_t next() override { + uint32_t next32() override { return rng(); } + uint64_t next64() override { + uint64_t hi = (uint64_t)rng() << 32; + uint64_t lo = (uint64_t)rng(); + return hi | lo; + } + double nextf() override { std::uniform_real_distribution dist(0.0, 1.0); return dist(rng); @@ -546,15 +619,21 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { struct llama_dist_rng_blue : llama_dist_rng { blue_noise_rng bn_rng; - llama_dist_rng_blue(uint32_t seed) : bn_rng(16, seed) {} + llama_dist_rng_blue(uint32_t seed) + : bn_rng(16, std::make_unique(seed)) {} bool requires_sorted() override { return true; } uint32_t rng_min() override { return 0; } uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } + uint32_t next() override { return bn_rng.next(); } - uint32_t next() override { - return bn_rng.next(); + uint32_t next32() override { + return bn_rng.next32(); + } + + uint64_t next64() override { + return bn_rng.next64(); } double nextf() override { @@ -562,7 +641,7 @@ struct llama_dist_rng_blue : llama_dist_rng { } void reseed(uint32_t s) override { - bn_rng.init(16, s); + bn_rng.reseed(s); } std::unique_ptr clone() const override { From f3acd240d68b888c7d90ae02337b4f52a3108266 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 04:09:57 +0000 Subject: [PATCH 17/25] sampling : simplify --- src/llama-sampler.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 2ddb2978eb..df30a613d5 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -463,11 +463,8 @@ struct blue_noise_rng { // blue noise in the upper bits, white noise in the lower bits uint64_t next64() { uint64_t r = rng->next64(); - uint32_t lo = (uint32_t)r; - uint32_t h = (uint32_t)(r >> 32); - uint32_t val = advance(h); - uint32_t hi = (val << (32 - bit_depth)) | (h >> bit_depth); - return ((uint64_t)hi << 32) | lo; + uint32_t val = advance((uint32_t)r); + return ((uint64_t)val << (64 - bit_depth)) | (r >> bit_depth); } // uniform double in [0, 1) with blue noise temporal autocorrelation @@ -561,8 +558,8 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng { } uint64_t next64() override { - uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed - uint64_t hi = next(); + uint64_t hi = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed + uint64_t lo = next(); return (hi << 32) | lo; } From 75cb3e8f2eb6e0d6f44559cda9ebc80f2b8b27c0 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 04:17:07 +0000 Subject: [PATCH 18/25] sampling : test against previous implementation --- src/llama-sampler.cpp | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index df30a613d5..9f5cba09c2 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -352,6 +352,7 @@ struct llama_dist_rng { virtual uint64_t next64() = 0; // uniform 64 bits virtual double nextf() = 0; // uniform double in [0, 1) virtual void reseed(uint32_t s) = 0; + virtual void reset() = 0; // reset to post-seed state virtual std::unique_ptr clone() const = 0; }; @@ -400,15 +401,15 @@ struct blue_noise_rng { const int n = (1 << bit_depth) - 1; states.resize(n); // at 16-bit depth, this uses 128KB of state - reset(); + reset_states(); } void reseed(uint32_t s) { rng->reseed(s); - reset(); + reset_states(); } - void reset() { + void reset_states() { const int n = (int)states.size(); // 5 reachable states with distribution 3:3:2:1:1 @@ -424,6 +425,8 @@ struct blue_noise_rng { uint32_t h = rng->next32() % 10; states[i] = {tbl[h][0], tbl[h][1]}; // random initial state } + + rng->reset(); // reset position so generation starts from 0 } uint16_t advance(uint32_t h) { @@ -573,15 +576,20 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng { position = 0; } + void reset() override { + position = 0; + } + std::unique_ptr clone() const override { return std::make_unique(*this); } }; struct llama_dist_rng_mt19937 : llama_dist_rng { + uint32_t seed; std::mt19937 rng; - llama_dist_rng_mt19937(uint32_t seed) : rng(seed) {} + llama_dist_rng_mt19937(uint32_t seed) : seed(seed), rng(seed) {} bool requires_sorted() override { return false; } @@ -605,9 +613,14 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { } void reseed(uint32_t s) override { + seed = s; rng.seed(s); } + void reset() override { + rng.seed(seed); + } + std::unique_ptr clone() const override { return std::make_unique(*this); } @@ -641,6 +654,11 @@ struct llama_dist_rng_blue : llama_dist_rng { bn_rng.reseed(s); } + void reset() override { + bn_rng.rng->reset(); + bn_rng.reset_states(); + } + std::unique_ptr clone() const override { return std::make_unique(*this); } From 7bb5d4b8907a0092cecbd4799395a664a95695b5 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 04:30:31 +0000 Subject: [PATCH 19/25] sampling : disable testing code --- src/llama-sampler.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 9f5cba09c2..34f8a62ab4 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -339,7 +339,7 @@ struct llama_dist_rng { virtual ~llama_dist_rng() = default; // whether the RNG requires sorted input for proper properties - // this also indicates whether the RNG output itself must be consumed in a coherent order + // this also indicates whether the RNG output itself must be consumed in a sequential order virtual bool requires_sorted() = 0; // for compatibility with std::discrete_distribution @@ -426,7 +426,11 @@ struct blue_noise_rng { states[i] = {tbl[h][0], tbl[h][1]}; // random initial state } +#if 0 + // test against initial implementation outputs + // note: white noise padding in next64 is slightly different, but minimally consequential for testing rng->reset(); // reset position so generation starts from 0 +#endif } uint16_t advance(uint32_t h) { From 2826de3189d8f3a0e95961be7884fe30ec46cf4a Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 05:23:58 +0000 Subject: [PATCH 20/25] sampling : make rng selection fully modular --- common/arg.cpp | 13 +++++++++++++ common/common.h | 1 + common/sampling.cpp | 13 ++++++------- include/llama.h | 9 +++++++-- src/llama-sampler.cpp | 34 ++++++++++++++++++---------------- tools/server/server-task.cpp | 10 ++++++++++ 6 files changed, 55 insertions(+), 25 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 924b5198a2..7181e31cd7 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1584,6 +1584,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.blue_noise = true; } ).set_sparam()); + add_opt(common_arg( + {"--rng-type"}, "{mt19937,lowbias32}", + "RNG type for sampling (default: mt19937)", + [](common_params & params, const std::string & value) { + if (value == "mt19937") { + params.sampling.rng_type = LLAMA_RNG_TYPE_MT19937; + } else if (value == "lowbias32") { + params.sampling.rng_type = LLAMA_RNG_TYPE_LOWBIAS32; + } else { + throw std::invalid_argument("invalid value"); + } + } + ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.2f)", (double)params.sampling.temp), diff --git a/common/common.h b/common/common.h index 0a76a1e26c..662eeb51e2 100644 --- a/common/common.h +++ b/common/common.h @@ -210,6 +210,7 @@ struct common_params_sampling { bool no_perf = false; // disable performance metrics bool timing_per_token = false; bool blue_noise = false; // use blue noise RNG instead of white noise for dist sampler + enum llama_rng_type rng_type = LLAMA_RNG_TYPE_MT19937; // RNG type for dist sampler uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers diff --git a/common/sampling.cpp b/common/sampling.cpp index 2811eb3a48..f98bd7b311 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -167,11 +167,14 @@ std::string common_params_sampling::print() const { "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n" "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n" - "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f", + "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, adaptive_target = %.3f, adaptive_decay = %.3f\n" + "\tblue_noise = %s, rng_type = %s", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp, - mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay); + mirostat, mirostat_eta, mirostat_tau, adaptive_target, adaptive_decay, + blue_noise ? "true" : "false", + rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"); return std::string(result); } @@ -313,11 +316,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st samplers.push_back(llama_sampler_init_adaptive_p(params.adaptive_target, params.adaptive_decay, params.seed)); } else { // default: sample from distribution - if (params.blue_noise) { - samplers.push_back(llama_sampler_init_dist_blue_noise(params.seed)); - } else { - samplers.push_back(llama_sampler_init_dist(params.seed)); - } + samplers.push_back(llama_sampler_init_dist_rng(params.seed, params.blue_noise, params.rng_type)); } } else if (params.mirostat == 1) { samplers.push_back(llama_sampler_init_temp(params.temp)); diff --git a/include/llama.h b/include/llama.h index 22f08e1683..d9f4acc5c7 100644 --- a/include/llama.h +++ b/include/llama.h @@ -188,6 +188,11 @@ extern "C" { LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type); + enum llama_rng_type { + LLAMA_RNG_TYPE_MT19937 = 0, + LLAMA_RNG_TYPE_LOWBIAS32 = 1, + }; + enum llama_split_mode { LLAMA_SPLIT_MODE_NONE = 0, // single GPU LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs @@ -1295,8 +1300,8 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); /// seed == LLAMA_DEFAULT_SEED to use a random seed. - LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); - LLAMA_API struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// Setting k <= 0 makes this a noop diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 34f8a62ab4..0a74f2d26f 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -633,8 +633,8 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { struct llama_dist_rng_blue : llama_dist_rng { blue_noise_rng bn_rng; - llama_dist_rng_blue(uint32_t seed) - : bn_rng(16, std::make_unique(seed)) {} + llama_dist_rng_blue(std::unique_ptr source) + : bn_rng(16, std::move(source)) {} bool requires_sorted() override { return true; } @@ -1591,32 +1591,34 @@ static struct llama_sampler_i llama_sampler_dist_i = { /* .backend_set_input = */ llama_sampler_dist_backend_set_input, }; -struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { +static std::unique_ptr make_dist_rng(uint32_t seed, enum llama_rng_type rng_type) { + switch (rng_type) { + case LLAMA_RNG_TYPE_LOWBIAS32: return std::make_unique(seed); + case LLAMA_RNG_TYPE_MT19937: + default: return std::make_unique(seed); + } +} + +struct llama_sampler * llama_sampler_init_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type) { auto seed_cur = get_rng_seed(seed); + auto rng = make_dist_rng(seed_cur, rng_type); + if (blue_noise) { + rng = std::make_unique(std::move(rng)); + } return llama_sampler_init( /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { {"dist"}, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .rng = */ std::make_unique(seed_cur), + /* .rng = */ std::move(rng), /* .inp_uniform = */ nullptr, } ); } -struct llama_sampler * llama_sampler_init_dist_blue_noise(uint32_t seed) { - auto seed_cur = get_rng_seed(seed); - return llama_sampler_init( - /* .iface = */ &llama_sampler_dist_i, - /* .ctx = */ new llama_sampler_dist { - {"dist-blue-noise"}, - /* .seed = */ seed, - /* .seed_cur = */ seed_cur, - /* .rng = */ std::make_unique(seed_cur), - /* .inp_uniform = */ nullptr, - } - ); +struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + return llama_sampler_init_dist_rng(seed, false, LLAMA_RNG_TYPE_MT19937); } // top-k diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 16c3cf12d0..d717165daa 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -67,6 +67,7 @@ json task_params::to_json(bool only_metrics) const { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"blue_noise", sampling.blue_noise}, + {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, {"stream", stream}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, @@ -127,6 +128,7 @@ json task_params::to_json(bool only_metrics) const { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"blue_noise", sampling.blue_noise}, + {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, {"stream", stream}, {"logit_bias", format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, @@ -470,6 +472,14 @@ task_params server_task::params_from_json_cmpl( } params.sampling.blue_noise = json_value(data, "blue_noise", params_base.sampling.blue_noise); + { + const auto rng_source = json_value(data, "rng_type", std::string("")); + if (rng_source == "lowbias32") { + params.sampling.rng_type = LLAMA_RNG_TYPE_LOWBIAS32; + } else if (rng_source == "mt19937") { + params.sampling.rng_type = LLAMA_RNG_TYPE_MT19937; + } + } params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); if (params.sampling.ignore_eos) { params.sampling.logit_bias.insert( From e896007ad1d4ce9bfd24d039fc106d895afc7fed Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 05:27:13 +0000 Subject: [PATCH 21/25] sampling : fix whitespace --- tools/server/server-task.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index d717165daa..fdeddbf21d 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -67,7 +67,7 @@ json task_params::to_json(bool only_metrics) const { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"blue_noise", sampling.blue_noise}, - {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, + {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, {"stream", stream}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, @@ -128,7 +128,7 @@ json task_params::to_json(bool only_metrics) const { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"blue_noise", sampling.blue_noise}, - {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, + {"rng_type", sampling.rng_type == LLAMA_RNG_TYPE_LOWBIAS32 ? "lowbias32" : "mt19937"}, {"stream", stream}, {"logit_bias", format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, From a4858de4e49785af40a9e9fa68a41ac8bcd2d4d3 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 05:36:32 +0000 Subject: [PATCH 22/25] sampling : build fix and cleanup --- src/llama-sampler.cpp | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 0a74f2d26f..f06c76077b 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -342,12 +342,6 @@ struct llama_dist_rng { // this also indicates whether the RNG output itself must be consumed in a sequential order virtual bool requires_sorted() = 0; - // for compatibility with std::discrete_distribution - // only used in a disabled branch of llama_sampler_dist_apply - virtual uint32_t rng_min() = 0; - virtual uint32_t rng_max() = 0; - virtual uint32_t next() = 0; // uniform bits in [rng_min(), rng_max()] - virtual uint32_t next32() = 0; // uniform 32 bits virtual uint64_t next64() = 0; // uniform 64 bits virtual double nextf() = 0; // uniform double in [0, 1) @@ -489,9 +483,9 @@ struct llama_dist_urbg { llama_dist_rng & rng; - result_type min() { return rng.rng_min(); } - result_type max() { return rng.rng_max(); } - result_type operator()() { return rng.next(); } + static constexpr result_type min() { return 0; } + static constexpr result_type max() { return UINT32_MAX; } + result_type operator()() { return rng.next32(); } }; // wrapper to use existing llama_sample_dist for mt19937, otherwise implements CDF walk directly @@ -543,8 +537,6 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng { llama_dist_rng_lowbias32(uint32_t seed) : hashed_seed(hash(seed)), position(0) {} bool requires_sorted() override { return false; } - uint32_t rng_min() override { return 0; } - uint32_t rng_max() override { return UINT32_MAX; } static uint32_t hash(uint32_t x) { // lowbias32 // coefficients from https://github.com/skeeto/hash-prospector/issues/19 @@ -554,7 +546,7 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng { return x; } - uint32_t next() override { + uint32_t next() { uint32_t val = hash(position ^ hashed_seed); position++; return val; @@ -597,10 +589,6 @@ struct llama_dist_rng_mt19937 : llama_dist_rng { bool requires_sorted() override { return false; } - uint32_t rng_min() override { return std::mt19937::min(); } - uint32_t rng_max() override { return std::mt19937::max(); } - uint32_t next() override { return rng(); } - uint32_t next32() override { return rng(); } @@ -638,10 +626,6 @@ struct llama_dist_rng_blue : llama_dist_rng { bool requires_sorted() override { return true; } - uint32_t rng_min() override { return 0; } - uint32_t rng_max() override { return (1u << bn_rng.bit_depth) - 1; } - uint32_t next() override { return bn_rng.next(); } - uint32_t next32() override { return bn_rng.next32(); } From 2a74c288c83a5d9d2e6fa7f956f096bb5ef1b4db Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 09:01:45 +0000 Subject: [PATCH 23/25] sampling : prefer high bits as source for generating blue noise --- src/llama-sampler.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index f06c76077b..717e1e73d5 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -419,12 +419,6 @@ struct blue_noise_rng { uint32_t h = rng->next32() % 10; states[i] = {tbl[h][0], tbl[h][1]}; // random initial state } - -#if 0 - // test against initial implementation outputs - // note: white noise padding in next64 is slightly different, but minimally consequential for testing - rng->reset(); // reset position so generation starts from 0 -#endif } uint16_t advance(uint32_t h) { @@ -441,7 +435,7 @@ struct blue_noise_rng { s[1] = 0; // error diffusion dithering using binary weight perturbation - s[(h >> level) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2 + s[(h >> (31 - level)) & 1 ? 0 : 1] += qe; // forward to t+1 or defer to t+2 acc = acc * 2 + out; } @@ -458,14 +452,14 @@ struct blue_noise_rng { uint32_t next32() { uint32_t h = rng->next32(); uint32_t val = advance(h); - return (val << (32 - bit_depth)) | (h >> bit_depth); + return (val << (32 - bit_depth)) | (h & ((1u << (32 - bit_depth)) - 1)); } // blue noise in the upper bits, white noise in the lower bits uint64_t next64() { uint64_t r = rng->next64(); - uint32_t val = advance((uint32_t)r); - return ((uint64_t)val << (64 - bit_depth)) | (r >> bit_depth); + uint32_t val = advance((uint32_t)(r >> 32)); + return ((uint64_t)val << (64 - bit_depth)) | (r & ((UINT64_C(1) << (64 - bit_depth)) - 1)); } // uniform double in [0, 1) with blue noise temporal autocorrelation @@ -557,8 +551,8 @@ struct llama_dist_rng_lowbias32 : llama_dist_rng { } uint64_t next64() override { - uint64_t hi = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed - uint64_t lo = next(); + uint64_t lo = hash(position ^ ~hashed_seed); // secondary sequence using opposing seed + uint64_t hi = next(); return (hi << 32) | lo; } From 10179a636d41f63870148c941a78c753944dfc9f Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 12:55:54 +0000 Subject: [PATCH 24/25] sampling : also use upper bits for initializing state --- src/llama-sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampler.cpp b/src/llama-sampler.cpp index 717e1e73d5..633be9bceb 100644 --- a/src/llama-sampler.cpp +++ b/src/llama-sampler.cpp @@ -416,7 +416,7 @@ struct blue_noise_rng { {-1, -1}, }; for (int i = 0; i < n; i++) { - uint32_t h = rng->next32() % 10; + uint32_t h = (uint32_t)(((uint64_t)rng->next32() * 10) >> 32); states[i] = {tbl[h][0], tbl[h][1]}; // random initial state } } From 1f42650078e2f5f36d2ec03789ac392da897e9e2 Mon Sep 17 00:00:00 2001 From: Jan Boon Date: Mon, 9 Feb 2026 15:13:56 +0000 Subject: [PATCH 25/25] sampling : add rng test case --- tests/test-sampling.cpp | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 7cd96c5cd3..1a04ac5b11 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -192,6 +192,73 @@ static void test_top_n_sigma(const std::vector & probs, const std::vector tester.check(); } +static void test_dist_rng(uint32_t seed, bool blue_noise, enum llama_rng_type rng_type, + const std::vector & expected, const char * desc) { + const int n_vocab = 16; + const int n_samples = 32; + + // fixed non-uniform distribution: token i has logit log(i+1) + std::vector data(n_vocab); + for (int i = 0; i < n_vocab; i++) { + data[i] = {i, logf((float)(i + 1)), 0.0f}; + } + + auto * sampler = llama_sampler_init_dist_rng(seed, blue_noise, rng_type); + std::vector tokens(n_samples); + + for (int i = 0; i < n_samples; i++) { + std::vector cur(data); + llama_token_data_array cur_p = {cur.data(), cur.size(), -1, false}; + llama_sampler_apply(sampler, &cur_p); + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (llama_token)n_vocab); + tokens[i] = cur_p.data[cur_p.selected].id; + } + + if (expected.empty()) { + // print sequence for capture + printf("test_dist_rng %s: {", desc); + for (int i = 0; i < n_samples; i++) { + printf("%s%d", i ? ", " : "", tokens[i]); + } + printf("}\n"); + } else { + // verify against known sequence + GGML_ASSERT((int)expected.size() == n_samples); + bool match = true; + for (int i = 0; i < n_samples; i++) { + if (tokens[i] != expected[i]) { + match = false; + break; + } + } + if (!match) { + printf("test_dist_rng %s: MISMATCH\n got: {", desc); + for (int i = 0; i < n_samples; i++) { + printf("%s%d", i ? ", " : "", tokens[i]); + } + printf("}\n expected: {"); + for (int i = 0; i < n_samples; i++) { + printf("%s%d", i ? ", " : "", expected[i]); + } + printf("}\n"); + GGML_ASSERT(false); + } + + // also verify reset reproduces same sequence + llama_sampler_reset(sampler); + for (int i = 0; i < n_samples; i++) { + std::vector cur(data); + llama_token_data_array cur_p = {cur.data(), cur.size(), -1, false}; + llama_sampler_apply(sampler, &cur_p); + GGML_ASSERT(cur_p.data[cur_p.selected].id == tokens[i]); + } + + printf("test_dist_rng %-30s OK\n", desc); + } + + llama_sampler_free(sampler); +} + static void test_sampler_queue(const size_t n_vocab, const std::string & samplers_sequence, const int top_k, const float top_p, const float min_p ) { sampler_tester tester(n_vocab); @@ -392,6 +459,13 @@ int main(void) { test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f); test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f); + test_dist_rng(42, false, LLAMA_RNG_TYPE_LOWBIAS32, + {5, 12, 8, 10, 12, 11, 10, 8, 8, 10, 11, 9, 7, 6, 11, 13, 14, 15, 13, 4, 12, 14, 13, 13, 14, 12, 5, 15, 4, 13, 15, 12}, + "lowbias32"); + test_dist_rng(42, true, LLAMA_RNG_TYPE_LOWBIAS32, + {10, 5, 12, 8, 15, 13, 3, 10, 13, 12, 2, 15, 8, 14, 5, 11, 7, 9, 15, 11, 8, 2, 12, 14, 7, 9, 13, 10, 14, 5, 12, 15}, + "lowbias32 + blue noise"); + printf("OK\n"); test_perf();