common : disable backend sampling when grammar is involved
This commit is contained in:
parent eefdb0da17
commit 3b3f5fed31
@@ -176,7 +176,7 @@ std::string common_params_sampling::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params) {
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
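Note on this signature change: `params` is now taken by non-const reference, so `common_sampler_init` can write back to it (see the next hunk). One C++ consequence for callers, shown in a minimal sketch assuming a valid `model` in scope:

    common_params_sampling sparams;
    common_sampler * smpl = common_sampler_init(model, sparams);  // named lvalue: OK

    // common_sampler_init(model, common_params_sampling{});
    // would no longer compile: a temporary cannot bind to a non-const lvalue reference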
@@ -313,6 +313,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(chain, smpl);
     }
 
+    if (grmr && params.backend_sampling) {
+        LOG_WRN("%s: backend sampling is not compatible with grammar, disabling\n", __func__);
+
+        params.backend_sampling = false;
+    }
+
     auto * result = new common_sampler {
         /* .params = */ params,
         /* .grmr = */ grmr,
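The guard makes the incompatibility visible to the caller instead of silently misbehaving. A hedged sketch of the observable effect, assuming a valid `model` and using the `grammar` and `backend_sampling` fields referenced by this diff (the grammar string itself is illustrative):

    common_params_sampling sparams;
    sparams.grammar          = "root ::= \"yes\" | \"no\"";  // any non-empty grammar
    sparams.backend_sampling = true;

    common_sampler * smpl = common_sampler_init(model, sparams);

    // the LOG_WRN above fired, and the write-back is visible through the reference:
    // sparams.backend_sampling is now false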
@@ -430,6 +436,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     if (id != LLAMA_TOKEN_NULL) {
         LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
 
+        GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+
         // TODO: simplify
         gsmpl->cur.resize(1);
         gsmpl->cur[0] = { id, 0.0f, 1.0f };
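The assert encodes the invariant set up in `common_sampler_init`: when a grammar exists, backend sampling was already disabled there, so a backend-selected token implies `grmr` is null. For readers of the `{ id, 0.0f, 1.0f }` initializer, an annotated reading based on the `llama_token_data` fields (`id`, `logit`, `p`):

    // gsmpl->cur[0] = { id, 0.0f, 1.0f } means:
    //   .id    = id    // the token the backend sampler already chose
    //   .logit = 0.0f  // unused here; no logits were pulled back to the CPU
    //   .p     = 1.0f  // probability 1: the choice is final, CPU samplers are skipped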
@@ -36,7 +36,8 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+// note: can mutate params in some cases
+struct common_sampler * common_sampler_init(const struct llama_model * model, struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
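Given the new "can mutate params" note, a caller that wants to keep its canonical settings untouched can pass a deliberate copy. A minimal sketch; `base_params` is a hypothetical caller-owned value:

    common_params_sampling local = base_params;  // copy on purpose: init may mutate it
    common_sampler * smpl = common_sampler_init(model, local);
    // base_params is intact; only local may have had backend_sampling cleared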
@@ -48,6 +49,7 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
 
+// get the underlying llama_sampler_chain
 struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
 
 // extended sampling implementation:
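A sketch of what the newly documented accessor is typically for, assuming the chain helpers `llama_sampler_chain_n`, `llama_sampler_chain_get`, and `llama_sampler_name` from llama.h, plus `<cstdio>` for printing:

    struct llama_sampler * chain = common_sampler_get(gsmpl);

    // walk the underlying chain and print each sampler's name
    for (int i = 0; i < llama_sampler_chain_n(chain); ++i) {
        printf("sampler %d: %s\n", i, llama_sampler_name(llama_sampler_chain_get(chain, i)));
    }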
@@ -1571,7 +1571,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     }
 
     // extract logits
-    // For multipsequence batches that mix backend samplers and CPU sampler
+    // For multi-sequence batches that mix backend samplers and CPU sampler
     // this is currently inefficient as we copy all logits even for the
     // backend sampled tokens.
     if (logits && t_logits && n_outputs > 0) {
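To give the comment's inefficiency some scale, with illustrative numbers only (a 128k-entry vocabulary, fp32 logits):

    // bytes copied per decode ≈ n_outputs * n_vocab * sizeof(float)
    //   1 output : 128000 * 4     ≈ 0.5 MB
    //   8 outputs: 8 * 128000 * 4 ≈ 4.1 MB
    // rows for backend-sampled tokens are included even though their token
    // was already selected on the backend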