spec : add ngram-mod (#19164)
* spec : add ngram-mod * cont : simplify + keep track of occupancy * cont : cleanup * cont : move initialization to common/speculative * cont : cleanup * cont : cleanup * cont : fix
This commit is contained in:
parent
2e916f996a
commit
dabaa2e77a
|
|
@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
|
|||
ngram-cache.h
|
||||
ngram-map.cpp
|
||||
ngram-map.h
|
||||
ngram-mod.cpp
|
||||
ngram-mod.h
|
||||
peg-parser.cpp
|
||||
peg-parser.h
|
||||
preset.cpp
|
||||
|
|
|
|||
|
|
@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
|
||||
add_opt(common_arg(
|
||||
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
|
||||
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
|
||||
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
|
||||
common_speculative_type_to_str(params.speculative.type).c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
|
|
@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
|
||||
} else if (value == "ngram-map-k4v") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
|
||||
} else if (value == "ngram-mod") {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
|
||||
} else {
|
||||
throw std::invalid_argument("unknown speculative decoding type without draft model");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -171,6 +171,7 @@ enum common_speculative_type {
|
|||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
|
||||
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
|
||||
};
|
||||
|
|
@ -252,6 +253,8 @@ struct common_params_model {
|
|||
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
|
||||
};
|
||||
|
||||
struct common_ngram_mod;
|
||||
|
||||
struct common_params_speculative {
|
||||
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
|
||||
|
||||
|
|
@ -269,6 +272,8 @@ struct common_params_speculative {
|
|||
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,21 @@
|
|||
#include <cstdio>
|
||||
#include <sstream>
|
||||
|
||||
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
|
||||
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
|
||||
std::ostringstream oss;
|
||||
oss << '[';
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
if (i > 0) {
|
||||
oss << ", ";
|
||||
}
|
||||
oss << inp[start + i];
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
// n-gram simple
|
||||
//
|
||||
|
||||
|
|
@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
|
|||
// maximum number of counted values of a ngram map value.
|
||||
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
|
||||
|
||||
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
|
||||
|
||||
void common_ngram_map_draft(common_ngram_map & map,
|
||||
const llama_tokens & inp, llama_token sampled,
|
||||
llama_tokens & draft) {
|
||||
|
|
@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
|
|||
n_accepted, curr_value.n_accepted);
|
||||
curr_value.n_accepted = n_accepted;
|
||||
}
|
||||
|
||||
// Helper functions.
|
||||
//
|
||||
|
||||
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
|
||||
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
|
||||
std::ostringstream oss;
|
||||
oss << '[';
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
if (i > 0) {
|
||||
oss << ", ";
|
||||
}
|
||||
oss << inp[start + i];
|
||||
}
|
||||
oss << ']';
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
//
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
#include "ngram-mod.h"
|
||||
|
||||
//
|
||||
// common_ngram_mod
|
||||
//
|
||||
|
||||
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
|
||||
entries.resize(size);
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::idx(const entry_t * tokens) const {
|
||||
size_t res = 0;
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
res = res*6364136223846793005ULL + tokens[i];
|
||||
}
|
||||
|
||||
res = res % entries.size();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void common_ngram_mod::add(const entry_t * tokens) {
|
||||
const size_t i = idx(tokens);
|
||||
|
||||
if (entries[i] == EMPTY) {
|
||||
used++;
|
||||
}
|
||||
|
||||
entries[i] = tokens[n];
|
||||
}
|
||||
|
||||
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
|
||||
const size_t i = idx(tokens);
|
||||
|
||||
return entries[i];
|
||||
}
|
||||
|
||||
void common_ngram_mod::reset() {
|
||||
std::fill(entries.begin(), entries.end(), EMPTY);
|
||||
used = 0;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::get_n() const {
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::get_used() const {
|
||||
return used;
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::size() const {
|
||||
return entries.size();
|
||||
}
|
||||
|
||||
size_t common_ngram_mod::size_bytes() const {
|
||||
return entries.size() * sizeof(entries[0]);
|
||||
}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// common_ngram_mod
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
|
||||
//
|
||||
|
||||
// basic n-gram hasher
|
||||
struct common_ngram_mod {
|
||||
using entry_t = int32_t;
|
||||
|
||||
static constexpr entry_t EMPTY = -1;
|
||||
|
||||
common_ngram_mod(uint16_t n, size_t size);
|
||||
|
||||
size_t idx(const entry_t * tokens) const;
|
||||
void add(const entry_t * tokens);
|
||||
entry_t get(const entry_t * tokens) const; // return -1 if not found
|
||||
|
||||
void reset();
|
||||
|
||||
size_t get_n() const;
|
||||
size_t get_used() const;
|
||||
|
||||
size_t size() const;
|
||||
size_t size_bytes() const;
|
||||
|
||||
private:
|
||||
size_t n; // ngram size to hash
|
||||
|
||||
size_t used;
|
||||
|
||||
std::vector<entry_t> entries;
|
||||
};
|
||||
|
|
@ -6,6 +6,7 @@
|
|||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ngram-map.h"
|
||||
#include "ngram-mod.h"
|
||||
#include "sampling.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
|
@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
|
|||
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
|
||||
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
|
||||
};
|
||||
|
||||
|
|
@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
|
|||
{"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
|
||||
{"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
|
||||
{"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
|
||||
{"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
|
||||
{"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
|
||||
};
|
||||
|
||||
|
|
@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
|
|||
struct common_speculative_state {
|
||||
const enum common_speculative_type type;
|
||||
|
||||
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
|
||||
// TODO: add n_call_begin, n_call_accept
|
||||
size_t drafts_call_count = 0; // number of times this implementation was called.
|
||||
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
|
||||
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
|
||||
|
|
@ -119,6 +124,8 @@ struct common_speculative_state {
|
|||
// TODO: track performance of most recent calls
|
||||
const bool gen_perf = true; // whether to generate performance stats.
|
||||
|
||||
// TODO: rename to t_draft_us
|
||||
// TODO: add t_begin_us, t_accept_us
|
||||
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
|
||||
|
||||
common_speculative_state(enum common_speculative_type type) : type(type) {}
|
||||
|
|
@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
|
|||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
common_ngram_mod & mod;
|
||||
|
||||
// the last position in the prompt that was added to the ngram container
|
||||
size_t i_last = 0;
|
||||
|
||||
// length of the last drafted n‑gram (number of tokens returned by draft)
|
||||
size_t n_draft_last = 0;
|
||||
|
||||
// consecutive accept rounds with low acceptance fraction (< 0.5)
|
||||
int n_low = 0;
|
||||
|
||||
// enable trace logging if LLAMA_TRACE is set
|
||||
const bool verbose;
|
||||
|
||||
common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
|
||||
: common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
|
||||
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
|
||||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
i_last = 0;
|
||||
|
||||
n_draft_last = 0;
|
||||
|
||||
const size_t n = mod.get_n();
|
||||
|
||||
if (prompt.size() < n) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < prompt.size() - n; ++i) {
|
||||
mod.add(prompt.data() + i);
|
||||
}
|
||||
|
||||
i_last = prompt.size() - n;
|
||||
|
||||
const double f = (double)mod.get_used() / (double)mod.size();
|
||||
LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
|
||||
|
||||
constexpr double f_thold = 0.25;
|
||||
if (f > f_thold) {
|
||||
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
|
||||
|
||||
mod.reset();
|
||||
}
|
||||
}
|
||||
|
||||
void draft(
|
||||
const common_params_speculative & params,
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
GGML_UNUSED(params);
|
||||
|
||||
n_draft_last = 0;
|
||||
|
||||
const size_t cur_len = prompt_tgt.size();
|
||||
if (cur_len < mod.get_n()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n = mod.get_n();
|
||||
|
||||
// add new ngrams in chunks
|
||||
if (i_last + 32 < cur_len) {
|
||||
for (size_t i = i_last; i < cur_len - n; ++i) {
|
||||
mod.add(prompt_tgt.data() + i);
|
||||
}
|
||||
|
||||
i_last = cur_len - n;
|
||||
}
|
||||
|
||||
result.resize(n + params.n_max);
|
||||
for (size_t i = 0; i < n - 1; ++i) {
|
||||
result[i] = prompt_tgt[cur_len - n + 1 + i];
|
||||
}
|
||||
result[n - 1] = id_last;
|
||||
|
||||
for (int i = 0; i < params.n_max; ++i) {
|
||||
const llama_token token = mod.get(result.data() + i);
|
||||
if (token == common_ngram_mod::EMPTY) {
|
||||
if (i < params.n_min) {
|
||||
result.clear();
|
||||
return;
|
||||
}
|
||||
|
||||
result.resize(n + i);
|
||||
break;
|
||||
}
|
||||
result[n + i] = token;
|
||||
}
|
||||
|
||||
// only return the m tokens that were drafted
|
||||
for (size_t i = 0; n + i < result.size(); ++i) {
|
||||
result[i] = result[n + i];
|
||||
}
|
||||
result.resize(result.size() - n);
|
||||
|
||||
// store length of drafted n‑gram for later acceptance analysis
|
||||
n_draft_last = result.size();
|
||||
}
|
||||
|
||||
void accept(uint16_t n_accepted) override {
|
||||
if (verbose) {
|
||||
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
|
||||
}
|
||||
|
||||
// compute acceptance fraction if we have a recorded draft length
|
||||
if (n_draft_last > 0) {
|
||||
const double f_acc = (double)n_accepted / (double)n_draft_last;
|
||||
if (f_acc < 0.5) {
|
||||
n_low++;
|
||||
if (n_low >= 3) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
|
||||
|
||||
mod.reset();
|
||||
n_low = 0;
|
||||
}
|
||||
} else {
|
||||
n_low = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
uint16_t n_draft;
|
||||
bool save_dynamic;
|
||||
|
|
@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
|
|||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
|
||||
default: return "unknown";
|
||||
}
|
||||
|
|
@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
|||
// initialization of the speculative decoding system
|
||||
//
|
||||
common_speculative * common_speculative_init(
|
||||
const common_params_speculative & params,
|
||||
llama_context * ctx_tgt) {
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt) {
|
||||
llama_context * ctx_dft = nullptr;
|
||||
if (params.model_dft) {
|
||||
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
|
||||
|
|
@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
|
|||
bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
|
||||
bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
|
||||
bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
|
||||
bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
|
||||
|
||||
// In a more complex implementation we could use the same implementation but with different parameters.
|
||||
// This was initially used in PR-18471 but removed to simplify the code.
|
||||
|
|
@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
|
|||
// This implementation can guess tokens with high acceptance rate but is more expensive.
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
|
||||
}
|
||||
if (has_ngram_mod) {
|
||||
// shared instance for all speculative decoding contexts
|
||||
if (!params.ngram_mod) {
|
||||
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
|
||||
|
||||
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
params.ngram_size_n, params.ngram_mod->size(),
|
||||
(float)(params.ngram_mod->size_bytes())/1024/1024);
|
||||
|
||||
if (params.ngram_size_n < 16) {
|
||||
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
|
||||
}
|
||||
}
|
||||
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
|
||||
}
|
||||
if (has_ngram_cache) {
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
|
||||
}
|
||||
|
|
@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
|
|||
));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
GGML_ASSERT(config.params.ngram_mod);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
|
||||
auto state = create_state_ngram_cache(
|
||||
params.lookup_cache_static, params.lookup_cache_dynamic, config);
|
||||
|
|
@ -822,8 +978,7 @@ llama_tokens common_speculative_draft(
|
|||
|
||||
if (!result.empty()) {
|
||||
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
|
||||
common_speculative_type_to_str(impl.get()->type).c_str(),
|
||||
prompt_tgt.size(),
|
||||
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
|
||||
impl.get()->drafts_call_count, result.size());
|
||||
|
||||
spec->curr_impl = impl.get(); // set current implementation for stats
|
||||
|
|
@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
|
|||
str_perf = "";
|
||||
}
|
||||
|
||||
// TODO: report time for begin() and accept()
|
||||
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
|
||||
common_speculative_type_to_str(impl->type).c_str(),
|
||||
impl->drafts_call_count,
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
|
|||
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
||||
|
||||
common_speculative * common_speculative_init(
|
||||
const common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt);
|
||||
|
||||
void common_speculative_free(common_speculative * spec);
|
||||
|
||||
|
|
|
|||
|
|
@ -705,6 +705,11 @@ private:
|
|||
params_base.n_cache_reuse = 0;
|
||||
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
|
||||
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
|
||||
}
|
||||
}
|
||||
|
||||
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
|
||||
|
|
@ -754,9 +759,9 @@ private:
|
|||
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
|
||||
return false;
|
||||
}
|
||||
SRV_WRN("%s", "speculative decoding context initialized\n");
|
||||
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
|
||||
} else {
|
||||
SRV_WRN("%s", "speculative decoding context not initialized\n");
|
||||
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue