From 80742cbaebdf1dd10cfc1059dca225fb0954b9fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 30 Nov 2025 00:07:13 +0200
Subject: [PATCH] cont : naming

---
 common/llguidance.cpp | 20 ++++++++++----------
 include/llama.h       |  4 ----
 src/llama-batch.cpp   |  6 +++---
 src/llama-batch.h     |  2 +-
 src/llama-context.cpp | 25 +++++++++++--------------
 src/llama-context.h   | 14 +++++++-------
 6 files changed, 32 insertions(+), 39 deletions(-)

diff --git a/common/llguidance.cpp b/common/llguidance.cpp
index 27d15516e9..d58f147a76 100644
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -106,16 +106,16 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
 }

 static llama_sampler_i llama_sampler_llg_i = {
-    /* .name = */ llama_sampler_llg_name,
-    /* .accept = */ llama_sampler_llg_accept_impl,
-    /* .apply = */ llama_sampler_llg_apply,
-    /* .reset = */ llama_sampler_llg_reset,
-    /* .clone = */ llama_sampler_llg_clone,
-    /* .free = */ llama_sampler_llg_free,
-    /* .apply_ggml = */ NULL,
-    /* .accept_ggml = */ NULL,
-    /* .set_input_ggml = */ NULL,
-    /* .set_backend_context = */ NULL,
+    /* .name = */ llama_sampler_llg_name,
+    /* .accept = */ llama_sampler_llg_accept_impl,
+    /* .apply = */ llama_sampler_llg_apply,
+    /* .reset = */ llama_sampler_llg_reset,
+    /* .clone = */ llama_sampler_llg_clone,
+    /* .free = */ llama_sampler_llg_free,
+    /* .backend_init = */ NULL,
+    /* .backend_accept = */ NULL,
+    /* .backend_apply = */ NULL,
+    /* .backend_set_input = */ NULL,
 };

 static size_t llama_sampler_llg_tokenize_fn(const void * user_data, const uint8_t * bytes, size_t bytes_len,
diff --git a/include/llama.h b/include/llama.h
index 6a1aca7633..01eca7609a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1374,10 +1374,6 @@ extern "C" {
     //
     LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);

-    //
-    // Backend samplers
-    //
-
     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index f0866a9ca1..2700b970c9 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -29,7 +29,7 @@ bool llama_batch_allocr::init(
         uint32_t n_embd,
         uint32_t n_seq_max,
         bool output_all,
-        bool backend_sampling) {
+        bool sampling) {
     clear();

     batch = batch_inp;
@@ -146,7 +146,7 @@ bool llama_batch_allocr::init(
         }
     }

-    if (backend_sampling) {
+    if (sampling) {
         std::vector seq_output_count(n_seq_max, 0);

         for (int32_t i = 0; i < batch.n_tokens; ++i) {
@@ -157,7 +157,7 @@ bool llama_batch_allocr::init(
                 const llama_seq_id seq_id = batch.seq_id[i][s];
                 seq_output_count[seq_id]++;
                 if (seq_output_count[seq_id] > 1) {
-                    LLAMA_LOG_ERROR("%s: backend sampling allows at most one output token per sequence (%d)\n", __func__, seq_id);
+                    LLAMA_LOG_ERROR("%s: backend sampling requires at most one output token per sequence (%d)\n", __func__, seq_id);
                     return false;
                 }
             }
diff --git a/src/llama-batch.h b/src/llama-batch.h
index d8751274f3..db7a75b804 100644
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@@ -80,7 +80,7 @@ public:
             uint32_t n_embd,
             uint32_t n_seq_max,
             bool output_all,
-            bool backend_sampling = false);
+            bool sampling = false);

     const llama_batch & get_batch() const;

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 50a31f1168..7f7b838e14 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1322,12 +1322,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // when computing embeddings, all tokens are output
     const bool output_all = cparams.embeddings;

-    const bool has_backend_samplers = !sampling.samplers.empty();
+    const bool has_samplers = !sampling.samplers.empty();

     if (!balloc->init(batch_inp, vocab, memory.get(), n_embd,
                 cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all,
-                has_backend_samplers)) {
+                has_samplers)) {
         LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
         return -1;
     }
@@ -1415,10 +1415,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     int64_t n_outputs_prev = 0;

-    // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
-    bool backend_has_sampled = false;
-
     do {
         const auto & ubatch = mctx->get_ubatch();

@@ -1477,9 +1473,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
         // ggml_graph_dump_dot(gf, NULL, "llama.dot");
         //}

-        backend_has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();
+        // This flag indicates whether a backend sampler has actually sampled a specific
+        // token, or if it has produced probabilities. If true, we can skip the normal copying of logits and embeddings.
+        const bool has_sampled = !res->t_sampled.empty() || !res->t_sampled_probs.empty() || !res->t_sampled_logits.empty();

-        if (has_backend_samplers && backend_has_sampled) {
+        if (has_samplers && has_sampled) {
             const auto seq_to_output_row = build_seq_to_output_row(ubatch, n_outputs_prev);

             const auto stride = n_vocab;
@@ -1495,7 +1493,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
             // async copy the candidate token ids from the backend to the host.
             // These are needed by CPU samplers to map probability/logit indices to vocab token ids.
             copy_tensor_async_candidates(res->t_candidates, sampling.candidates, stride, sampling.candidates_count, seq_to_output_row, sched.get());
-
         }

         auto * t_logits = res->get_logits();
@@ -1661,8 +1658,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
     }

     // Check which sampling modes are needed by sequences in the current batch.
-    bool batch_has_backend_sampling = false;
-    bool batch_needs_cpu_logits = false;
+    bool batch_has_sampling = false;
+    bool batch_needs_cpu_logits = false;

     if (batch.logits) {
         for (int32_t i = 0; i < batch.n_tokens; i++) {
@@ -1672,7 +1669,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
             for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
                 llama_seq_id seq_id = batch.seq_id[i][j];
                 if (sampling.samplers.find(seq_id) != sampling.samplers.end()) {
-                    batch_has_backend_sampling = true;
+                    batch_has_sampling = true;
                 } else {
                     batch_needs_cpu_logits = true;
                 }
@@ -1691,7 +1688,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
     logits_size = (has_logits && batch_needs_cpu_logits) ? n_vocab*n_outputs_max : 0;
     embd_size = has_embd ? n_embd*n_outputs_max : 0;

-    if (!batch_has_backend_sampling) {
+    if (!batch_has_sampling) {
         sampling.logits_size = 0;
         sampling.probs_size = 0;
         sampling.sampled_size = 0;
@@ -1762,7 +1759,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, const llama_batch & ba
     embd = has_embd ? (float *) (base + offset) : nullptr;
     offset += embd_size * sizeof(float);

-    if (batch_has_backend_sampling) {
+    if (batch_has_sampling) {
         sampling.logits = (float *) (base + offset);
         offset += sampling.logits_size * sizeof(float);

diff --git a/src/llama-context.h b/src/llama-context.h
index c2f0e3e570..127180413e 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -66,17 +66,17 @@ struct llama_context {
     float * get_embeddings_ith(int32_t i);
     float * get_embeddings_seq(llama_seq_id seq_id);

-    llama_token * get_sampled_tokens();
-    llama_token get_sampled_token_ith(int32_t idx);
+    llama_token * get_sampled_tokens();
+    llama_token get_sampled_token_ith(int32_t idx);

-    float * get_sampled_logits_ith(int32_t idx);
-    size_t get_sampled_logits_count(int32_t idx);
+    float * get_sampled_logits_ith(int32_t idx);
+    size_t get_sampled_logits_count(int32_t idx);

-    float * get_sampled_probs_ith(int32_t idx);
-    size_t get_sampled_probs_count(int32_t idx);
+    float * get_sampled_probs_ith(int32_t idx);
+    size_t get_sampled_probs_count(int32_t idx);

     const llama_token * get_sampled_candidates_ith(int32_t idx);
-    size_t get_sampled_candidates_count(int32_t idx);
+    size_t get_sampled_candidates_count(int32_t idx);

     void attach_threadpool(
             ggml_threadpool_t threadpool,