graph : do not include llama-model.h

commit 4b52e59903
parent 71574f9273

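In short, the graph code stops reaching into llama_model: llm_graph_params and llm_graph_context gain a dev_out field, build_sampling() drops its model/params arguments, and llama-graph.cpp no longer needs to include llama-model.h. Below is a minimal sketch of that pattern (illustrative only: graph_params and graph_context are placeholder names, only the ggml calls are real API).

// sketch.cpp - illustrative pattern only, not the actual llama.cpp sources
#include "ggml-backend.h"

struct graph_params {
    ggml_backend_dev_t dev_out; // filled in by the caller, e.g. from the model's output device
};

struct graph_context {
    ggml_backend_dev_t dev_out;

    explicit graph_context(const graph_params & params) : dev_out(params.dev_out) {}

    void build_sampling() const {
        // previously this lookup needed the model; the cached device is now enough
        ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);
        (void) buft;
    }
};
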
common/common.h

@@ -189,8 +189,8 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

     // Backend sampling flags
-    bool backend_sampling = false; // enable backend sampling
-    bool backend_dist = false; // backend performs final sampling (dist)
+    bool backend_sampling = false; // enable backend sampling
+    bool backend_dist = false; // backend performs final sampling (dist)

     // print the parameters into a string
     std::string print() const;

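As a hedged usage sketch for the two flags above (the code that consumes them is not shown in this diff; the variable name sparams is hypothetical):

common_params_sampling sparams;
sparams.backend_sampling = true; // enable backend sampling
sparams.backend_dist     = true; // let the backend also perform the final (dist) sampling
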
@@ -517,8 +517,8 @@ struct common_params {
         return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
     }

-    struct llama_sampler_seq_config * backend_samplers = NULL;
-    size_t n_backend_samplers = 0;
+    llama_sampler_seq_config * backend_samplers = NULL;
+    size_t n_backend_samplers = 0;
 };

 // call once at the start of a program if it uses libcommon

common/sampling.cpp

@@ -113,9 +113,9 @@ struct common_sampler {
     llama_token_data_array cur_p;

     void set_logits(struct llama_context * ctx, int idx) {
-        const float * sampled_probs = llama_get_backend_sampled_probs_ith(ctx, idx);
-        const float * sampled_logits = llama_get_backend_sampled_logits_ith(ctx, idx);
-        const llama_token * sampled_ids = llama_get_backend_sampled_token_ids_ith(ctx, idx);
+        const float       * sampled_probs  = llama_get_backend_sampled_probs_ith    (ctx, idx);
+        const float       * sampled_logits = llama_get_backend_sampled_logits_ith   (ctx, idx);
+        const llama_token * sampled_ids    = llama_get_backend_sampled_token_ids_ith(ctx, idx);

         const llama_model * model = llama_get_model(ctx);
         const llama_vocab * vocab = llama_model_get_vocab(model);

@@ -143,11 +143,11 @@ struct common_sampler {
         cur.reserve(sampled_logits_count);
         // The backend sampler has filtered the logits so we need to use the sampled ids.
         if (sampled_ids != nullptr) {
-            for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            for (uint32_t i = 0; i < sampled_logits_count; i++) {
                 cur.emplace_back(llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f});
             }
         } else {
-            for (llama_token token_id = 0; token_id < (int)sampled_logits_count; token_id++) {
+            for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
                 cur.emplace_back(llama_token_data{token_id, sampled_logits[token_id], 0.0f});
             }
         }

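For context on the two sampling.cpp hunks above, here is a standalone sketch of the candidate-rebuilding logic they touch. llama_token_data is redefined locally only so the snippet compiles on its own; it mirrors the real {id, logit, p} layout.

#include <cstdint>
#include <vector>

using llama_token = int32_t;
struct llama_token_data { llama_token id; float logit; float p; };

static std::vector<llama_token_data> rebuild_candidates(
        const float       * sampled_logits,
        const llama_token * sampled_ids, // null if the backend did not filter the logits
        uint32_t            sampled_logits_count) {
    std::vector<llama_token_data> cur;
    cur.reserve(sampled_logits_count);

    if (sampled_ids != nullptr) {
        // the backend sampler has filtered the logits, so the sampled ids give the token ids
        for (uint32_t i = 0; i < sampled_logits_count; i++) {
            cur.push_back({sampled_ids[i], sampled_logits[i], 0.0f});
        }
    } else {
        // unfiltered case: the index itself is the token id
        for (llama_token token_id = 0; token_id < (int) sampled_logits_count; token_id++) {
            cur.push_back({token_id, sampled_logits[token_id], 0.0f});
        }
    }
    return cur;
}
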
@@ -414,10 +414,12 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
     // Check if a backend sampler has already sampled a token in which case we
     // return that token id directly.
-    const llama_token backend_sampled_token = llama_get_backend_sampled_token_ith(ctx, idx);
-    if (backend_sampled_token != LLAMA_TOKEN_NULL) {
-        LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, backend_sampled_token);
-        return backend_sampled_token;
+    {
+        const llama_token id = llama_get_backend_sampled_token_ith(ctx, idx);
+        if (id != LLAMA_TOKEN_NULL) {
+            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
+            return id;
+        }
     }

     gsmpl->set_logits(ctx, idx);

src/llama-context.cpp

@@ -1233,9 +1233,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
     };

     int64_t n_outputs_prev = 0;

     // This flag indicates whether a backend sampler has actually sampled a specific
-    // token, or if it has produced probabilites. If true, we true we can skip
-    // the normal copying of logits and embeddings.
+    // token, or if it has produced probabilites. If true, we can skip the normal copying of logits and embeddings.
     bool backend_has_sampled = false;

     do {

@@ -1655,6 +1655,7 @@ llm_graph_params llama_context::graph_params(
         /*.gtype       =*/ gtype,
         /*.sched       =*/ sched.get(),
         /*.backend_cpu =*/ backend_cpu,
+        /*.dev_out     =*/ model.dev_output(),
         /*.cvec        =*/ &cvec,
         /*.loras       =*/ &loras,
         /*.mctx        =*/ mctx,

@@ -2712,8 +2713,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
     return ctx->get_embeddings_seq(seq_id);
 }

-void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * sampler) {
-    ctx->set_backend_sampler(seq_id, sampler);
+void llama_set_backend_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
+    ctx->set_backend_sampler(seq_id, smpl);
 }

 llama_token llama_get_backend_sampled_token_ith(llama_context * ctx, int32_t i) {

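A hypothetical caller-side sketch for the llama_set_backend_sampler function renamed above. The chain construction uses the standard llama_sampler API; whether backend samplers are configured exactly this way is an assumption, not something this commit shows.

// assumes an existing llama_context * ctx
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// attach the sampler to sequence 0 so sampling can run on the backend
llama_set_backend_sampler(ctx, 0, smpl);
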
src/llama-graph.cpp

@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"

 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"

@@ -610,6 +609,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     rope_type   (hparams.rope_type),
     sched       (params.sched),
     backend_cpu (params.backend_cpu),
+    dev_out     (params.dev_out),
     cvec        (params.cvec),
     loras       (params.loras),
     mctx        (params.mctx),

@@ -2049,8 +2049,7 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }

-void llm_graph_context::build_sampling(const llama_model & model, const llm_graph_params & params) const {
-    GGML_UNUSED(params);
+void llm_graph_context::build_sampling() const {
     if (samplers.empty()) {
         return;
     }

@@ -2074,11 +2073,9 @@ void llm_graph_context::build_sampling(const llama_model & model, const llm_grap
     ggml_tensor * logits_t = res->t_logits;
     GGML_ASSERT(res->t_logits != nullptr && "missing t_logits tensor");

-    const int n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(&model));
-    GGML_ASSERT(logits_t->ne[0] == n_vocab);
+    const int64_t n_vocab = logits_t->ne[0];

-    ggml_backend_dev_t device = model.dev_output();
-    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(device);
+    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);

     std::unordered_map<llama_seq_id, llama_sampler*> active_samplers;

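The build_sampling hunk above replaces two model lookups: the vocab size is now read from the logits tensor shape, and the output buffer type comes from the cached dev_out member. A tiny illustrative sketch of those two steps (the function name is hypothetical):

#include "ggml.h"
#include "ggml-backend.h"

static void sampling_setup_sketch(const ggml_tensor * logits_t, ggml_backend_dev_t dev_out) {
    const int64_t n_vocab = logits_t->ne[0]; // logits are laid out as [n_vocab, n_outputs]

    ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev_out);

    (void) n_vocab;
    (void) buft;
}
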
src/llama-graph.h

@@ -436,6 +436,7 @@ struct llm_graph_params {

     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;
+    ggml_backend_dev_t dev_out;

     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;

@@ -445,8 +446,8 @@ struct llm_graph_params {
     std::unordered_map<llama_seq_id, llama_sampler*> samplers;

     static bool samplers_equal(
-            const std::unordered_map<llama_seq_id, llama_sampler*> & lhs,
-            const std::unordered_map<llama_seq_id, llama_sampler*> & rhs) {
+            const std::unordered_map<llama_seq_id, llama_sampler *> & lhs,
+            const std::unordered_map<llama_seq_id, llama_sampler *> & rhs) {
         if (lhs.size() != rhs.size()) {
             return false;
         }

@@ -624,6 +625,8 @@ struct llm_graph_context {

     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

+    ggml_backend_dev_t dev_out;
+
     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
     const llama_memory_context_i * mctx;

@@ -875,7 +878,7 @@ struct llm_graph_context {
     // sampling (backend sampling)
     //

-    void build_sampling(const llama_model & model, const llm_graph_params & params) const;
+    void build_sampling() const;

     //
     // dense (out)

src/llama-model.cpp

@@ -7413,7 +7413,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

     // add backend sampling layers (if any)
-    llm->build_sampling(*this, params);
+    llm->build_sampling();

     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers