graph : do not include llama-model.h

Georgi Gerganov 2025-11-18 13:53:25 +02:00
parent 71574f9273
commit 4b52e59903
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 31 additions and 28 deletions

common/common.h

@@ -189,8 +189,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // Backend sampling flags
-    bool backend_sampling = false; // enable backend sampling
-    bool backend_dist = false; // backend performs final sampling (dist)
+    bool backend_sampling = false; // enable backend sampling
+    bool backend_dist = false; // backend performs final sampling (dist)
 
     // print the parameters into a string
     std::string print() const;
@@ -517,8 +517,8 @@ struct common_params {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }
 
-    struct llama_sampler_seq_config * backend_samplers = NULL;
-    size_t n_backend_samplers = 0;
+    llama_sampler_seq_config * backend_samplers = NULL;
+    size_t n_backend_samplers = 0;
 };
 
 // call once at the start of a program if it uses libcommon

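For orientation, here is a minimal sketch (not part of this commit) of how an application built on libcommon might enable the flags above. It assumes the usual pattern of a common_params instance exposing the common_params_sampling struct shown here as a "sampling" member; that member name is an assumption, not something this diff confirms.

    common_params params;

    // opt in to building the sampler chains into the decode graph
    params.sampling.backend_sampling = true;
    // additionally let the backend perform the final (dist) selection itself
    params.sampling.backend_dist = true;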
common/sampling.cpp

@@ -113,9 +113,9 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
     void set_logits(struct llama_context * ctx, int idx) {
-        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
-        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
-        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float * sampled_probs = llama_get_backend_sampled_probs_ith (ctx, idx);
+        const float * sampled_logits = llama_get_backend_sampled_logits_ith (ctx, idx);
+        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
 
         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -143,11 +143,11 @@ struct common_sampler {
         cur.reserve(sampled_logits_count);
         // The backend sampler has filtered the logits so we need to use the sampled ids.
         if (sampled_ids != nullptr) {
-            for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                 cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
             }
         } else {
-            for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+            for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
                 cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
             }
         }
@@ -414,10 +414,12 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
-    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
-    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
-        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
-        return backend_sampled_token;
+    {
+        const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+            return id;
+        }
     }
 
     gsmpl->set_logits(ctx, idx);

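To illustrate the short-circuit refactored above from the caller's point of view, here is a small sketch; the helper name sample_next is ours and the use of common_sampler_accept is an assumption about the usual libcommon flow, not something shown in this diff. When a backend sampler has already produced a token, common_sampler_sample returns it without running any CPU samplers; otherwise set_logits rebuilds cur_p from the (possibly backend-filtered) logits and the CPU chain runs as before.

    static llama_token sample_next(common_sampler * gsmpl, llama_context * ctx, int idx) {
        // either the token already picked by the backend sampler, or the result of the CPU chain
        const llama_token id = common_sampler_sample(gsmpl, ctx, idx, /*grammar_first =*/ false);
        common_sampler_accept(gsmpl, id, /*accept_grammar =*/ true);
        return id;
    }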
src/llama-context.cpp

@@ -1233,9 +1233,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };
 
     int64_t n_outputs_prev = 0;
 
     // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we true we can skip
-    // the normal copying of logits and embeddings.
+    // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
     bool backend_has_sampled = false;
 
     do {
@@ -1655,6 +1655,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype =*/ gtype,
         /*.sched =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.dev_out =*/ model.dev_output(),
        /*.cvec =*/ &cvec,
         /*.loras =*/ &loras,
         /*.mctx =*/ mctx,
@@ -2712,8 +2713,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
     return ctx->get_embeddings_seq(seq_id);
 }
 
-void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * sampler) {
-    ctx->set_backend_sampler(seq_id, sampler);
+void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    ctx->set_backend_sampler(seq_id, smpl);
 }
 
 llama_token llama_get_backend_sampled_token_ith(llama_context * ctx, int32_t i) {

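For context, a rough sketch of how the two C-API entry points touched above fit together; this is illustrative only, and the greedy chain plus the output index 0 are placeholder choices rather than anything prescribed by the commit. A sampler chain is attached to a sequence so that it runs inside the decode graph, and the sampled token is read back per output index after decoding.

    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // run this chain on the backend for sequence 0
    llama_set_backend_sampler(ctx, /*seq_id =*/ 0, smpl);

    // ... llama_decode(ctx, batch) ...

    // LLAMA_TOKEN_NULL means the backend only produced logits/probs for this output
    const llama_token id = llama_get_backend_sampled_token_ith(ctx, /*i =*/ 0);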
src/llama-graph.cpp

@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
@@ -610,6 +609,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     rope_type (hparams.rope_type),
     sched (params.sched),
     backend_cpu (params.backend_cpu),
+    dev_out (params.dev_out),
     cvec (params.cvec),
     loras (params.loras),
     mctx (params.mctx),
@@ -2049,8 +2049,7 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }
 
-void llm_graph_context::build_sampling(const llama_model & model, const llm_graph_params & params) const {
-    GGML_UNUSED(params);
+void llm_graph_context::build_sampling() const {
     if (samplers.empty()) {
         return;
     }
@@ -2074,11 +2073,9 @@ void llm_graph_context::build_sampling(const llama_model & model, const llm_grap
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");
 
-    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(&model));
-    GGML_ASSERT(logits_t->ne[0] == n_vocab);
+    const int64_t n_vocab = logits_t->ne[0];
 
-    ggml_backend_dev_t device = model.dev_output();
-    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(device);
+    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);
 
     std::unordered_map<llama_seq_id, llama_sampler*> active_samplers;

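The net effect of the two hunks above is that build_sampling no longer needs llama_model at all, which is what allows llama-model.h to be dropped from the includes: the vocabulary size is read off the logits tensor, and the buffer type comes from the output device handed in via llm_graph_params::dev_out. A condensed sketch of that pattern follows; the helper name sampling_buft is ours, not part of the code base.

    static ggml_backend_buffer_type_t sampling_buft(const ggml_tensor * logits_t, ggml_backend_dev_t dev_out) {
        const int64_t n_vocab = logits_t->ne[0]; // first dim of the [n_vocab, n_outputs] logits tensor
        GGML_ASSERT(n_vocab > 0);
        // place the sampling tensors on the same device that produces the outputs
        return ggml_backend_dev_buffer_type(dev_out);
    }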
src/llama-graph.h

@@ -436,6 +436,7 @@ struct llm_graph_params {
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
@@ -445,8 +446,8 @@
     std::unordered_map<llama_seq_id, llama_sampler*> samplers;
 
     static bool samplers_equal(
-            const std::unordered_map<llama_seq_id, llama_sampler*> & lhs,
-            const std::unordered_map<llama_seq_id, llama_sampler*> & rhs) {
+            const std::unordered_map<llama_seq_id, llama_sampler *> & lhs,
+            const std::unordered_map<llama_seq_id, llama_sampler *> & rhs) {
         if (lhs.size() != rhs.size()) {
             return false;
         }
@@ -624,6 +625,8 @@ struct llm_graph_context {
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_dev_t dev_out;
 
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
     const llama_memory_context_i * mctx;
@@ -875,7 +878,7 @@ struct llm_graph_context {
     // sampling (backend sampling)
     //
 
-    void build_sampling(const llama_model & model, const llm_graph_params & params) const;
+    void build_sampling() const;
 
     //
     // dense (out)

src/llama-model.cpp

@@ -7413,7 +7413,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
 
     // add backend sampling layers (if any)
-    llm->build_sampling(*this, params);
+    llm->build_sampling();
 
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers