Merge fc36eb7700 into 418dea39ce
commit 15e4b7ded1
@@ -168,6 +168,7 @@ enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
    COMMON_SPECULATIVE_TYPE_NEXTN,         // MTP model with NextN layer (deepseek, GLM)
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values

@@ -21,6 +21,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
    COMMON_SPECULATIVE_TYPE_NONE,
    COMMON_SPECULATIVE_TYPE_DRAFT,
    COMMON_SPECULATIVE_TYPE_EAGLE3,
    COMMON_SPECULATIVE_TYPE_NEXTN,
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,

@@ -32,6 +33,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
    {"none",          COMMON_SPECULATIVE_TYPE_NONE},
    {"draft",         COMMON_SPECULATIVE_TYPE_DRAFT},
    {"eagle3",        COMMON_SPECULATIVE_TYPE_EAGLE3},
    {"nextn",         COMMON_SPECULATIVE_TYPE_NEXTN},
    {"ngram_simple",  COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
    {"ngram_map_k",   COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
    {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},

@@ -462,6 +464,127 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
    }
};

struct common_speculative_state_nextn : public common_speculative_state {
    llama_context * ctx_tgt; // used for copying state from tgt --> dft
    llama_context * ctx_dft;

    common_sampler * smpl;

    llama_batch  batch;
    llama_tokens prompt_dft;

    bool vocab_cmpt = true; // whether the draft vocab is compatible with the target (i.e. no retokenization needed)
    std::unordered_map<std::string, std::string> vocab_map;

    common_speculative_state_nextn(
            enum common_speculative_type type,
            llama_context * ctx_tgt,
            llama_context * ctx_dft)
        : common_speculative_state(type)
        , ctx_tgt(ctx_tgt)
        , ctx_dft(ctx_dft)
    {
        batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);

        {
            common_params_sampling params;
            params.no_perf = false;
            params.top_k   = 10;
            params.samplers = {
                COMMON_SAMPLER_TYPE_TOP_K,
            };

            smpl = common_sampler_init(llama_get_model(ctx_dft), params);
        }
    }

    ~common_speculative_state_nextn() override {
        llama_perf_context_print(ctx_dft);

        common_sampler_free(smpl);

        llama_batch_free(batch);
    }

    void begin(const llama_tokens & prompt) override {
        GGML_UNUSED(prompt);
    }

    void draft(
            const common_params_speculative & params,
            const llama_tokens & prompt_tgt,
            llama_token id_last,
            llama_tokens & result) override {
        auto * spec = this;

        auto & batch      = spec->batch;
        auto & ctx_tgt    = spec->ctx_tgt;
        auto & ctx_dft    = spec->ctx_dft;
        auto & smpl       = spec->smpl;
        auto & prompt_dft = spec->prompt_dft;

        auto * mem_dft = llama_get_memory(ctx_dft);

        result.clear();
        result.reserve(params.n_max);

        llama_memory_clear(mem_dft, false);
        common_sampler_reset(smpl);

        llama_mtp_start(ctx_tgt, ctx_dft); // copy state from main LLM to draft

        // decode first token
        int n_past = 0;
        common_batch_clear(batch);
        common_batch_add(batch, id_last, n_past++, { 0 }, true);
        llama_decode(ctx_dft, batch);
        common_sampler_accept(smpl, id_last, true);

        // sample n_draft tokens from the draft model
        for (int i = 0; i < params.n_max; ++i) {
            // printf("drafting token %d\n", i);
            common_batch_clear(batch);
            common_sampler_sample(smpl, ctx_dft, 0, true);

            const auto * cur_p = common_sampler_get_candidates(smpl, true);

            for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
                LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
                        k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
            }

            // add drafted token for each sequence
            const llama_token id = cur_p->data[0].id;

            common_sampler_accept(smpl, id, true);

            result.push_back(id);

            if (params.n_max <= (int) result.size()) {
                break;
            }

            // only collect very high-confidence draft tokens
            if (cur_p->data[0].p < params.p_min) {
                break;
            }

            common_batch_add(batch, id, n_past++, { 0 }, true);

            // evaluate the drafted tokens on the draft model
            llama_decode(ctx_dft, batch);

            prompt_dft.push_back(id);
        }
    }

    void accept(uint16_t n_accepted) override {
        // noop
        GGML_UNUSED(n_accepted);
        // printf("\n\n%s: accepted %d tokens\n\n", __func__, n_accepted);
    }
};

// state of self-speculation (simple implementation, not ngram-map)
struct common_speculative_state_ngram_simple : public common_speculative_state {
    common_ngram_simple_config config;

@@ -853,6 +976,7 @@ common_speculative * common_speculative_init(
    {
        bool has_draft        = !params.mparams_dft.path.empty();
        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
        bool has_draft_nextn  = (params.type == COMMON_SPECULATIVE_TYPE_NEXTN);

        bool has_ngram_cache  = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
        bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);

@@ -898,6 +1022,9 @@ common_speculative * common_speculative_init(
        if (has_draft_eagle3) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params));
        }
        if (has_draft_nextn) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NEXTN, params));
        }
    }

    std::vector<std::unique_ptr<common_speculative_state>> impls = {};

@@ -919,6 +1046,13 @@ common_speculative * common_speculative_init(
            impls.push_back(std::make_unique<common_speculative_state_eagle3>(config.type));
            break;
        }
        case COMMON_SPECULATIVE_TYPE_NEXTN: {
            impls.push_back(std::make_unique<common_speculative_state_nextn>(config.type,
                /* .ctx_tgt = */ ctx_tgt,
                /* .ctx_dft = */ ctx_dft
            ));
            break;
        }
        case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
            common_ngram_map ngram_map = get_common_ngram_map(config);

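For orientation, a minimal sketch of how a caller might select the NextN path through this layer. Only the state's draft() is shown in the hunks above, so the init signature and the draft wrapper name here are assumptions, not the PR's actual entry points:

    // sketch only: assumes common_speculative_init(...) takes the params plus the two
    // contexts, and that spec_draft() (hypothetical name) forwards to
    // common_speculative_state_nextn::draft() shown above
    common_params_speculative sparams;
    sparams.type  = COMMON_SPECULATIVE_TYPE_NEXTN; // MTP/NextN drafting on the same weights
    sparams.n_max = 8;                             // upper bound on drafted tokens per step
    sparams.p_min = 0.75f;                         // confidence cutoff checked in draft()

    common_speculative * spec = common_speculative_init(sparams, ctx_tgt, ctx_dft);

    llama_tokens draft;
    spec_draft(spec, sparams, prompt_tgt, id_last, draft); // hypothetical wrapper
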
@@ -194,6 +194,13 @@ extern "C" {
        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
    };

    enum llama_graph_type {
        LLAMA_GRAPH_TYPE_DEFAULT,
        LLAMA_GRAPH_TYPE_ENCODER,
        LLAMA_GRAPH_TYPE_DECODER,
        LLAMA_GRAPH_TYPE_DECODER_MTP,
    };

    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
    typedef struct llama_token_data {
        llama_token id; // token id

@@ -376,6 +383,8 @@ extern "C" {
        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
        struct llama_sampler_seq_config * samplers;
        size_t n_samplers;

        llama_graph_type graph_type; // type of the computation graph to be used
    };

    // model quantization parameters

@@ -999,6 +1008,10 @@ extern "C" {
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);

    // Copy the internal MTP state from ctx_llm to ctx_mtp, ready for MTP decoding.
    // This must be done before calling llama_decode() on ctx_mtp
    LLAMA_API int32_t llama_mtp_start(struct llama_context * ctx_llm, struct llama_context * ctx_mtp);

    //
    // backend sampling API [EXPERIMENTAL]
    // note: use only if the llama_context was created with at least one llama_sampler_seq_config

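Taken together, the new enum, the context parameter, and llama_mtp_start() suggest the call sequence below. This is a sketch under the assumption, consistent with the server change at the end of this diff, that both contexts are created from the same model:

    // main context with the regular decoder graph
    llama_context * ctx_llm = llama_init_from_model(model, llama_context_default_params());

    // second context over the same weights, but building the MTP/NextN graph
    llama_context_params cparams_mtp = llama_context_default_params();
    cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP;
    llama_context * ctx_mtp = llama_init_from_model(model, cparams_mtp);

    llama_decode(ctx_llm, batch);      // regular decode; records the MTP hidden state
    llama_mtp_start(ctx_llm, ctx_mtp); // hand that state over to the MTP context
    llama_decode(ctx_mtp, batch_mtp);  // NextN layer drafts from token + state embeddings
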
@@ -2723,12 +2723,12 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_INDEXER_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    // NextN/MTP tensors are currently ignored (reserved for future MTP support)
    // These tensors only exist in the last layer(s) and are treated as output tensors
    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
    {LLM_TENSOR_NEXTN_EH_PROJ,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_NEXTN_EMBED_TOKENS,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
    {LLM_TENSOR_NEXTN_ENORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
    {LLM_TENSOR_NEXTN_HNORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
    {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};

LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

@@ -189,6 +189,23 @@ llama_context::llama_context(
        }
    }

    switch (params.graph_type) {
        case LLAMA_GRAPH_TYPE_DEFAULT:
            gtype = LLM_GRAPH_TYPE_DEFAULT;
            break;
        case LLAMA_GRAPH_TYPE_ENCODER:
            gtype = LLM_GRAPH_TYPE_ENCODER;
            break;
        case LLAMA_GRAPH_TYPE_DECODER:
            gtype = LLM_GRAPH_TYPE_DECODER;
            break;
        case LLAMA_GRAPH_TYPE_DECODER_MTP:
            gtype = LLM_GRAPH_TYPE_DECODER_MTP;
            break;
        default:
            throw std::runtime_error("invalid graph type");
    }

    LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
    LLAMA_LOG_INFO("%s: n_ctx     = %u\n", __func__, cparams.n_ctx);
    LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);

@@ -771,6 +788,24 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
    return it->second.data();
}

int32_t llama_context::cpy_mtp_state(llama_context & ctx_mtp) {
    if (ctx_mtp.gtype != LLM_GRAPH_TYPE_DECODER_MTP) {
        LLAMA_LOG_ERROR("%s: target context is not MTP\n", __func__);
        return -1;
    }

    if (cross.n_token == 0 || cross.n_embd == 0) {
        LLAMA_LOG_ERROR("%s: no state to copy\n", __func__);
        return -1;
    }

    // TODO: maybe std::move is better?
    LLAMA_LOG_DEBUG("%s: copying MTP state (n_token = %lld, n_embd = %lld)\n", __func__, cross.n_token, cross.n_embd);
    ctx_mtp.cross = cross;

    return 0;
}

llama_token llama_context::get_sampled_token_ith(int32_t idx) {
    output_reorder();

@@ -1421,6 +1456,18 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::map<llama_s
int llama_context::decode(const llama_batch & batch_inp) {
    GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT

    if (gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
        if (model.hparams.nextn_predict_layers == 0) {
            LLAMA_LOG_ERROR("%s: MTP decode called but model does not support MTP\n", __func__);
            return -1;
        }
        if ((uint32_t)batch_inp.n_tokens > n_ubatch()) {
            // TODO @ngxson : n_tokens > ubatch will mess up the llama_cross state, may need to fix it later
            LLAMA_LOG_ERROR("%s: MTP decode requires n_ubatch >= n_tokens\n", __func__);
            return -1;
        }
    }

    if (!memory) {
        LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
        return encode(batch_inp);

@@ -1549,6 +1596,17 @@ int llama_context::decode(const llama_batch & batch_inp) {
            break;
    }

    const bool update_mtp_state = gtype == LLM_GRAPH_TYPE_DECODER_MTP              // this is MTP layer
        || (hparams.nextn_predict_layers > 0 && n_outputs_all > 0); // or, this is the main LLM, we need to forward state to MTP layer

    // set MTP state if needed
    if (update_mtp_state) {
        // printf("\n\nupdate MTP state: gtype = %d, n_outputs_all = %d\n", (int) gtype, n_outputs_all);
        cross.n_embd  = hparams.get_n_embd_mtp();
        cross.n_token = n_outputs_all;
        cross.mtp_embd.resize(cross.n_embd*cross.n_token);
    }

    // reserve output buffer
    if (output_reserve(n_outputs_all) < n_outputs_all) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all);

@@ -1577,7 +1635,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        }

        ggml_status status;
        const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
        const auto * res = process_ubatch(ubatch, gtype, mctx.get(), status);

        if (!res) {
            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the memory module

@@ -1642,6 +1700,14 @@ int llama_context::decode(const llama_batch & batch_inp) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
            GGML_ASSERT(backend_embd != nullptr);

            // set MTP state if needed
            if (update_mtp_state) {
                const int64_t n_embd_mtp = cross.n_embd;
                GGML_ASSERT( n_outputs_prev + n_outputs            <= n_outputs_all);
                GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd_mtp <= (int64_t)cross.mtp_embd.size());
                ggml_backend_tensor_get_async(backend_embd, t_embd, cross.mtp_embd.data(), 0, n_outputs*n_embd_mtp*sizeof(float));
            }

            switch (cparams.pooling_type) {
                case LLAMA_POOLING_TYPE_NONE:
                    {

@@ -2801,6 +2867,7 @@ llama_context_params llama_context_default_params() {
        /*.kv_unified =*/ false,
        /*.sampler    =*/ nullptr,
        /*.n_sampler  =*/ 0,
        /*.graph_type =*/ LLAMA_GRAPH_TYPE_DEFAULT,
    };

    return result;

@@ -2989,6 +3056,12 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->get_embeddings_seq(seq_id);
}

int32_t llama_mtp_start(llama_context * ctx_llm, llama_context * ctx_mtp) {
    ctx_llm->synchronize();

    return ctx_llm->cpy_mtp_state(*ctx_mtp);
}

bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
    return ctx->set_sampler(seq_id, smpl);
}

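Note the constraint enforced at the top of decode(): an MTP context must fit each batch in a single ubatch, or the llama_cross state gets clobbered (see the TODO above). A sketch of context creation honoring this; n_draft_max is an assumed caller-side constant, not part of the PR:

    llama_context_params cparams_mtp = llama_context_default_params();
    cparams_mtp.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP;
    // keep every MTP decode within one ubatch, per the check in llama_context::decode()
    cparams_mtp.n_batch  = n_draft_max;
    cparams_mtp.n_ubatch = n_draft_max;
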
@@ -79,6 +79,8 @@ struct llama_context {
    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);

    int32_t cpy_mtp_state(llama_context & ctx_mtp);

    llama_token * get_sampled_tokens() const;
    llama_token   get_sampled_token_ith(int32_t idx);

@@ -338,6 +340,8 @@ private:
    // host buffer for the model output (logits and embeddings)
    ggml_backend_buffer_ptr buf_output;

    llm_graph_type gtype;

    bool has_evaluated_once = false;

    // env: LLAMA_GRAPH_REUSE_DISABLE

@@ -323,6 +323,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
    }
}

void llm_graph_input_cross_mtp::set_input(const llama_ubatch * ubatch) {
    GGML_UNUSED(ubatch);

    if (cross_mtp && !cross->mtp_embd.empty()) {
        assert(cross_mtp->type == GGML_TYPE_F32);
        assert(ggml_nelements(cross_mtp) == (int64_t)cross->mtp_embd.size());

        ggml_backend_tensor_set(cross_mtp, cross->mtp_embd.data(), 0, ggml_nbytes(cross_mtp));
    }
}

static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
    const char * swa_type_str = "unknown";

@@ -1656,6 +1667,20 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
    return cur;
}

ggml_tensor * llm_graph_context::build_inp_cross_mtp() const {
    auto inp = std::make_unique<llm_graph_input_cross_mtp>(cross);

    auto & cur = inp->cross_mtp;

    GGML_ASSERT(cross != nullptr);
    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, cross->n_embd, cross->n_token);
    ggml_set_input(cur);

    res->add_input(std::move(inp));

    return cur;
}

ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
    auto inp = std::make_unique<llm_graph_input_pos_bucket>(hparams);

@@ -31,6 +31,7 @@ enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT,
    LLM_GRAPH_TYPE_ENCODER,
    LLM_GRAPH_TYPE_DECODER,
    LLM_GRAPH_TYPE_DECODER_MTP,
};

enum llm_ffn_op_type {

@@ -56,18 +57,24 @@ enum llm_norm_type {
};

// TODO: tmp - need something better to pass the data from the encoder to the decoder
// currently also used for passing embeddings from the main model to MTP layers
struct llama_cross {
    // the output embeddings from the encoder as a ggml tensor
    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
    // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
    //ggml_tensor * t_embd = nullptr;

    int64_t n_embd = 0;
    int64_t n_enc = 0;
    int64_t n_embd  = 0;
    int64_t n_enc   = 0;
    int64_t n_token = 0; // used by mtp

    // embeddings data copied to host memory (tmp)
    std::vector<float> v_embd;

    // embeddings data to be passed to MTP layers
    // TODO: optimize by using ggml_tensor here
    std::vector<float> mtp_embd;

    // needed to construct the cross-attention mask in the decoder
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
};

@@ -258,6 +265,18 @@ public:
    const llama_cross * cross;
};

class llm_graph_input_cross_mtp : public llm_graph_input_i {
public:
    llm_graph_input_cross_mtp(
            const llama_cross * cross) : cross(cross) {}
    virtual ~llm_graph_input_cross_mtp() = default;
    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * cross_mtp; // F32 [n_embd, n_token]

    const llama_cross * cross;
};

class llm_graph_input_attn_no_cache : public llm_graph_input_i {
public:
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :

@@ -849,6 +868,7 @@ struct llm_graph_context {
    ggml_tensor * build_inp_cls() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_cross_mtp() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

@@ -76,6 +76,10 @@ uint32_t llama_hparams::n_embd_out() const {
    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
}

uint32_t llama_hparams::get_n_embd_mtp() const {
    return n_embd;
}

uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

@@ -253,6 +253,9 @@ struct llama_hparams {
    // dimension of output embeddings
    uint32_t n_embd_out() const;

    // dimension of cross embeddings between main LLM and MTP
    uint32_t get_n_embd_mtp() const;

    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

@@ -1789,9 +1789,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // NextN/MTP parameters (GLM-OCR)
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

                // TODO: when MTP is implemented, this should probably be updated if needed
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;

                switch (hparams.n_layer) {
                    case 17: type = LLM_TYPE_1B; break; // GLM-OCR
                    case 40: type = LLM_TYPE_9B; break;

@@ -1822,9 +1819,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // NextN/MTP parameters
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

                // TODO: when MTP is implemented, this should probably be updated if needed
                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;

                switch (hparams.n_layer) {
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
                    case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open

@@ -5484,10 +5478,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                for (int i = 0; i < n_layer; ++i) {
                    int flags = 0;
                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                        // skip all tensors in the NextN layers
                        flags |= TENSOR_SKIP;
                    }

                    auto & layer = layers[i];

@@ -5514,7 +5504,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                    layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);

                    // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
                    // NextN/MTP tensors
                    if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
                        layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
                        layer.nextn.enorm   = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), { n_embd }, flags);

@@ -8587,11 +8577,19 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_GLM4:
            {
                llm = std::make_unique<llm_build_glm4>(*this, params);
                if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
                    llm = std::make_unique<llm_build_glm4<LLM_GRAPH_TYPE_DECODER_MTP>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_glm4<LLM_GRAPH_TYPE_DECODER>>(*this, params);
                }
            } break;
        case LLM_ARCH_GLM4_MOE:
            {
                llm = std::make_unique<llm_build_glm4_moe>(*this, params);
                if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
                    llm = std::make_unique<llm_build_glm4_moe<LLM_GRAPH_TYPE_DECODER_MTP>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_glm4_moe<LLM_GRAPH_TYPE_DECODER>>(*this, params);
                }
            } break;
        case LLM_ARCH_BITNET:
            {

@@ -1,7 +1,9 @@
#include "models.h"

llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
template <>
llm_build_glm4_moe<LLM_GRAPH_TYPE_DECODER>::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const bool use_mrope = hparams.use_mrope();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -13,7 +15,6 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap

    inpL = build_inp_embd(model.tok_embd);

    bool use_mrope = hparams.use_mrope();
    if (ubatch.embd && !use_mrope) {
        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");

@@ -30,129 +31,9 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
    // Final layer tensors are loaded but not processed in forward pass
    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
    for (int il = 0; il < n_transformer_layers; ++il) {
        ggml_tensor * inpSA = inpL;

        // Pre-attention norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
            if (model.layers[il].bq) {
                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
            }
            cb(Qcur, "Qcur", il);

            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
            if (model.layers[il].bk) {
                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
            }
            cb(Kcur, "Kcur", il);

            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
            if (model.layers[il].bv) {
                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
            }
            cb(Vcur, "Vcur", il);

            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

            // Apply Q/K norm if available (GLM-4.5 355B variant)
            if (model.layers[il].attn_q_norm) {
                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
                cb(Qcur, "Qcur_normed", il);
            }
            if (model.layers[il].attn_k_norm) {
                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                cb(Kcur, "Kcur_normed", il);
            }

            if (use_mrope) {
                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            } else {
                // Normal RoPE
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
                        rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
                        rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
        }
        if (il == n_transformer_layers - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // Post-attention norm
        cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "post_attn_norm", il);

        // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
        if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
            // Dense FFN layer
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    model.layers[il].ffn_gate, NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(cur, "ffn_out", il);
        } else {
            // Process routed experts using existing MoE infrastructure
            ggml_tensor * routed_out = build_moe_ffn(cur,
                    model.layers[il].ffn_gate_inp,
                    model.layers[il].ffn_up_exps,
                    model.layers[il].ffn_gate_exps,
                    model.layers[il].ffn_down_exps,
                    model.layers[il].ffn_exp_probs_b,
                    n_expert, n_expert_used,
                    LLM_FFN_SILU, hparams.expert_weights_norm,
                    true, hparams.expert_weights_scale,
                    (llama_expert_gating_func_type) hparams.expert_gating_func,
                    il);
            cb(routed_out, "ffn_moe_out", il);

            // Process shared expert on original input
            ggml_tensor * shared_out = build_ffn(cur,
                    model.layers[il].ffn_up_shexp,   NULL, NULL,
                    model.layers[il].ffn_gate_shexp, NULL, NULL,
                    model.layers[il].ffn_down_shexp, NULL, NULL,
                    NULL,
                    LLM_FFN_SILU, LLM_FFN_PAR, il);
            cb(shared_out, "ffn_shexp_out", il);

            // Final output: routed_output + shared_output
            cur = ggml_add(ctx0, routed_out, shared_out);
            cb(cur, "ffn_out", il);
        }
        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
        bool is_output_layer = (il == n_transformer_layers - 1);
        inpL = build_layer(model, inp_attn, inpL, inp_pos, inp_out_ids, sections, is_output_layer, il);
    }
    cur = inpL;
    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

@@ -168,3 +49,200 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap

    ggml_build_forward_expand(gf, cur);
}

// MTP model
template <>
llm_build_glm4_moe<LLM_GRAPH_TYPE_DECODER_MTP>::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // for now, we only support one single NextN layer for simplicity
    GGML_ASSERT(hparams.nextn_predict_layers == 1);
    const int il = n_layer - hparams.nextn_predict_layers;
    auto & mtp_layer = model.layers[il];

    ggml_tensor * inp_token_embd = build_inp_embd(mtp_layer.nextn.embed_tokens // can be nullptr on GLM-4.6
                                        ? mtp_layer.nextn.embed_tokens : model.tok_embd);
    ggml_tensor * inp_state_embd = build_inp_cross_mtp();

    // check number of input tokens
    GGML_ASSERT(inp_state_embd->ne[1] == inp_token_embd->ne[1]);

    inp_token_embd = build_norm(inp_token_embd, mtp_layer.nextn.enorm, NULL, LLM_NORM_RMS, il);
    inp_state_embd = build_norm(inp_state_embd, mtp_layer.nextn.hnorm, NULL, LLM_NORM_RMS, il);

    inpL = ggml_concat(ctx0, inp_token_embd, inp_state_embd, 0);
    cb(inpL, "inp_mtp", il);

    inpL = build_lora_mm(mtp_layer.nextn.eh_proj, inpL);
    cb(inpL, "inp_mtp_projected", il);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();
    {
        bool is_output_layer = true; // TODO: we only have one single nextn layer for now, may need to change in the future
        inpL = build_layer(model, inp_attn, inpL, inp_pos, inp_out_ids, sections, is_output_layer, il);
    }
    cur = inpL;
    cur = build_norm(cur, mtp_layer.nextn.shared_head_norm // can be nullptr on GLM-4.6
            ? mtp_layer.nextn.shared_head_norm : model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(mtp_layer.nextn.shared_head_head // can be nullptr on GLM-4.6
            ? mtp_layer.nextn.shared_head_head : model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

template <llm_graph_type graph_type>
ggml_tensor * llm_build_glm4_moe<graph_type>::build_layer(const llama_model & model,
        llm_graph_input_attn_kv * inp_attn,
        ggml_tensor * inpL,
        ggml_tensor * inp_pos,
        ggml_tensor * inp_out_ids,
        int sections[4],
        bool is_output_layer,
        int il) {
    bool use_mrope = hparams.use_mrope();
    const int64_t n_embd_head = hparams.n_embd_head_v;

    ggml_tensor * inpSA = inpL;

    // Pre-attention norm
    ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
    cb(cur, "attn_norm", il);

    // self-attention
    {
        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
        if (model.layers[il].bq) {
            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
        }
        cb(Qcur, "Qcur", il);

        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
        if (model.layers[il].bk) {
            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
        }
        cb(Kcur, "Kcur", il);

        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
        if (model.layers[il].bv) {
            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
        }
        cb(Vcur, "Vcur", il);

        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

        // Apply Q/K norm if available (GLM-4.5 355B variant)
        if (model.layers[il].attn_q_norm) {
            Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
            cb(Qcur, "Qcur_normed", il);
        }
        if (model.layers[il].attn_k_norm) {
            Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
            cb(Kcur, "Kcur_normed", il);
        }

        if (use_mrope) {
            Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);
        } else {
            // Normal RoPE
            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
                    rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
                    rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);
        }

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        cur = build_attn(inp_attn,
                model.layers[il].wo, NULL,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
    }
    if (is_output_layer && inp_out_ids) {
        cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
        inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
    }
    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
    cb(ffn_inp, "ffn_inp", il);

    // Post-attention norm
    cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
    cb(cur, "post_attn_norm", il);

    // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
    if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
        // Dense FFN layer
        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                model.layers[il].ffn_gate, NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(cur, "ffn_out", il);
    } else {
        // Process routed experts using existing MoE infrastructure
        ggml_tensor * routed_out = build_moe_ffn(cur,
                model.layers[il].ffn_gate_inp,
                model.layers[il].ffn_up_exps,
                model.layers[il].ffn_gate_exps,
                model.layers[il].ffn_down_exps,
                model.layers[il].ffn_exp_probs_b,
                n_expert, n_expert_used,
                LLM_FFN_SILU, hparams.expert_weights_norm,
                true, hparams.expert_weights_scale,
                (llama_expert_gating_func_type) hparams.expert_gating_func,
                il);
        cb(routed_out, "ffn_moe_out", il);

        // Process shared expert on original input
        ggml_tensor * shared_out = build_ffn(cur,
                model.layers[il].ffn_up_shexp,   NULL, NULL,
                model.layers[il].ffn_gate_shexp, NULL, NULL,
                model.layers[il].ffn_down_shexp, NULL, NULL,
                NULL,
                LLM_FFN_SILU, LLM_FFN_PAR, il);
        cb(shared_out, "ffn_shexp_out", il);

        // Final output: routed_output + shared_output
        cur = ggml_add(ctx0, routed_out, shared_out);
        cb(cur, "ffn_out", il);
    }
    cur = ggml_add(ctx0, cur, ffn_inp);

    cur = build_cvec(cur, il);
    cb(cur, "l_out", il);

    return cur;
}

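In equation form, the input that both MTP builders feed into the NextN transformer block is the eh_proj projection of two normalized embeddings, matching the ggml_concat/build_lora_mm sequence above (e_tok is the drafted token's embedding, h_llm the hidden state received through llama_cross::mtp_embd):

    h_0 = W_{eh\_proj} \cdot \big[\, \mathrm{RMSNorm}(e_{tok}) \,;\, \mathrm{RMSNorm}(h_{llm}) \,\big]

Since eh_proj is created with shape { 2 * n_embd, n_embd }, the concatenated 2*n_embd vector is projected back to the model width before the attention block runs.
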
@@ -1,10 +1,9 @@
#include "models.h"


llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
template <>
llm_build_glm4<LLM_GRAPH_TYPE_DECODER>::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
    const bool use_mrope = hparams.use_mrope();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -16,7 +15,6 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params

    inpL = build_inp_embd(model.tok_embd);

    bool use_mrope = hparams.use_mrope();
    if (ubatch.embd && !use_mrope) {
        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");

@@ -33,116 +31,12 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
    // Final layer tensors are loaded but not processed in forward pass
    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
    for (int il = 0; il < n_transformer_layers; ++il) {
        ggml_tensor * inpSA = inpL;

        // Pre-attention norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // self-attention
        {
            ggml_tensor * Qcur = nullptr;
            ggml_tensor * Kcur = nullptr;
            ggml_tensor * Vcur = nullptr;

            if (model.layers[il].wqkv == nullptr) {
                Qcur = build_lora_mm(model.layers[il].wq, cur);
                if (model.layers[il].bq) {
                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                }
                Kcur = build_lora_mm(model.layers[il].wk, cur);
                if (model.layers[il].bk) {
                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                }
                Vcur = build_lora_mm(model.layers[il].wv, cur);
                if (model.layers[il].bv) {
                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                }
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
            } else {
                cur = build_lora_mm(model.layers[il].wqkv, cur);
                cb(cur, "wqkv", il);
                if (model.layers[il].bqkv) {
                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                    cb(cur, "bqkv", il);
                }
                Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
                        0 * sizeof(float) * (n_embd));
                Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                        cur->nb[1], 1 * sizeof(float) * (n_embd));
                Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                        cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
            }

            if (use_mrope) {
                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            } else {
                // Normal RoPE
                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
                        rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);

                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
                        rope_type, n_ctx_orig, freq_base, freq_scale,
                        ext_factor, attn_factor, beta_fast, beta_slow);
            }

            cb(Qcur, "Qcur", il);
            cb(Kcur, "Kcur", il);
            cb(Vcur, "Vcur", il);

            cur = build_attn(inp_attn,
                    model.layers[il].wo, NULL,
                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
        }
        if (il == n_transformer_layers - 1 && inp_out_ids) {
            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
        }
        // Post-attention norm (new!)
        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "post_attn_norm", il);

        // Add the input (residual connection after post-attention norm)
        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

        // FF
        {
            // Pre-MLP norm
            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "ffn_norm", il);

            // MLP
            cur = build_ffn(cur,
                    model.layers[il].ffn_up,   NULL, NULL,
                    NULL,                      NULL, NULL,
                    model.layers[il].ffn_down, NULL, NULL,
                    NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
            cb(cur, "ffn_out", il);

            // Post-MLP norm
            cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
            cb(cur, "post_mlp_norm", il);
        }
        cur = ggml_add(ctx0, cur, ffn_inp);

        cur = build_cvec(cur, il);
        cb(cur, "l_out", il);

        // input for next layer
        inpL = cur;
        bool is_output_layer = (il == n_transformer_layers - 1);
        inpL = build_layer(model, inp_attn, inpL, inp_pos, inp_out_ids, sections, is_output_layer, il);
    }
    // Final norm
    cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
    cur = inpL;
    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

@@ -155,3 +49,185 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params

    ggml_build_forward_expand(gf, cur);
}

// MTP model
template <>
llm_build_glm4<LLM_GRAPH_TYPE_DECODER_MTP>::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);

    ggml_tensor * cur;
    ggml_tensor * inpL;

    // for now, we only support one single NextN layer for simplicity
    GGML_ASSERT(hparams.nextn_predict_layers == 1);
    const int il = n_layer - hparams.nextn_predict_layers;
    auto & mtp_layer = model.layers[il];

    ggml_tensor * inp_token_embd = build_inp_embd(mtp_layer.nextn.embed_tokens // can be nullptr on GLM-4.6
                                        ? mtp_layer.nextn.embed_tokens : model.tok_embd);
    ggml_tensor * inp_state_embd = build_inp_cross_mtp();

    // check number of input tokens
    GGML_ASSERT(inp_state_embd->ne[1] == inp_token_embd->ne[1]);

    inp_token_embd = build_norm(inp_token_embd, mtp_layer.nextn.enorm, NULL, LLM_NORM_RMS, il);
    inp_state_embd = build_norm(inp_state_embd, mtp_layer.nextn.hnorm, NULL, LLM_NORM_RMS, il);

    inpL = ggml_concat(ctx0, inp_token_embd, inp_state_embd, 0);
    cb(inpL, "inp_mtp", il);

    inpL = build_lora_mm(mtp_layer.nextn.eh_proj, inpL);
    cb(inpL, "inp_mtp_projected", il);

    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    ggml_tensor * inp_out_ids = build_inp_out_ids();
    {
        bool is_output_layer = true; // TODO: we only have one single nextn layer for now, may need to change in the future
        inpL = build_layer(model, inp_attn, inpL, inp_pos, inp_out_ids, sections, is_output_layer, il);
    }
    cur = inpL;
    cur = build_norm(cur, mtp_layer.nextn.shared_head_norm // can be nullptr on GLM-4.6
            ? mtp_layer.nextn.shared_head_norm : model.output_norm, NULL, LLM_NORM_RMS, -1);

    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(mtp_layer.nextn.shared_head_head // can be nullptr on GLM-4.6
            ? mtp_layer.nextn.shared_head_head : model.output, cur);

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

template <llm_graph_type graph_type>
ggml_tensor * llm_build_glm4<graph_type>::build_layer(const llama_model & model,
        llm_graph_input_attn_kv * inp_attn,
        ggml_tensor * inpL,
        ggml_tensor * inp_pos,
        ggml_tensor * inp_out_ids,
        int sections[4],
        bool is_output_layer,
        int il) {
    bool use_mrope = hparams.use_mrope();
    const int64_t n_embd_head = hparams.n_embd_head_v;
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

    ggml_tensor * inpSA = inpL;

    // Pre-attention norm
    ggml_tensor * cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
    cb(cur, "attn_norm", il);

    // self-attention
    {
        ggml_tensor * Qcur = nullptr;
        ggml_tensor * Kcur = nullptr;
        ggml_tensor * Vcur = nullptr;

        if (model.layers[il].wqkv == nullptr) {
            Qcur = build_lora_mm(model.layers[il].wq, cur);
            if (model.layers[il].bq) {
                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
            }
            Kcur = build_lora_mm(model.layers[il].wk, cur);
            if (model.layers[il].bk) {
                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
            }
            Vcur = build_lora_mm(model.layers[il].wv, cur);
            if (model.layers[il].bv) {
                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
            }
            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
        } else {
            cur = build_lora_mm(model.layers[il].wqkv, cur);
            cb(cur, "wqkv", il);
            if (model.layers[il].bqkv) {
                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                cb(cur, "bqkv", il);
            }
            Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1],
                    0 * sizeof(float) * (n_embd));
            Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                    cur->nb[1], 1 * sizeof(float) * (n_embd));
            Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                    cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
        }

        if (use_mrope) {
            Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);
        } else {
            // Normal RoPE
            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
                    rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);

            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
                    rope_type, n_ctx_orig, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow);
        }

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        cur = build_attn(inp_attn,
                model.layers[il].wo, NULL,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
    }
    if (is_output_layer && inp_out_ids) {
        cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
        inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
    }
    // Post-attention norm (new!)
    cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
    cb(cur, "post_attn_norm", il);

    // Add the input (residual connection after post-attention norm)
    ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
    cb(ffn_inp, "ffn_inp", il);

    // FF
    {
        // Pre-MLP norm
        cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // MLP
        cur = build_ffn(cur,
                model.layers[il].ffn_up,   NULL, NULL,
                NULL,                      NULL, NULL,
                model.layers[il].ffn_down, NULL, NULL,
                NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
        cb(cur, "ffn_out", il);

        // Post-MLP norm
        cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "post_mlp_norm", il);
    }
    cur = ggml_add(ctx0, cur, ffn_inp);

    cur = build_cvec(cur, il);
    cb(cur, "l_out", il);

    return cur;
}

@@ -254,12 +254,30 @@ struct llm_build_gemma : public llm_graph_context {
    llm_build_gemma(const llama_model & model, const llm_graph_params & params);
};

template <llm_graph_type graph_type>
struct llm_build_glm4 : public llm_graph_context {
    llm_build_glm4(const llama_model & model, const llm_graph_params & params);
    ggml_tensor * build_layer(const llama_model & model,
            llm_graph_input_attn_kv * inp_attn,
            ggml_tensor * inpL,
            ggml_tensor * inp_pos,
            ggml_tensor * inp_out_ids,
            int sections[4],
            bool is_output_layer,
            int il);
};

template <llm_graph_type graph_type>
struct llm_build_glm4_moe : public llm_graph_context {
    llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params);
    ggml_tensor * build_layer(const llama_model & model,
            llm_graph_input_attn_kv * inp_attn,
            ggml_tensor * inpL,
            ggml_tensor * inp_pos,
            ggml_tensor * inp_out_ids,
            int sections[4],
            bool is_output_layer,
            int il);
};

struct llm_build_gpt2 : public llm_graph_context {

@@ -639,7 +639,8 @@ private:

    add_bos_token = llama_vocab_get_add_bos(vocab);

    if (params_base.speculative.has_dft()) {
    //if (params_base.speculative.has_dft()) {
    {
        SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());

        const auto & params_spec = params_base.speculative;

@@ -662,6 +663,7 @@ private:

        params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides;

        /*
        auto mparams_dft = common_model_params_to_llama(params_dft);

        model_dft.reset(llama_model_load_from_file(params_dft.model.path.c_str(), mparams_dft));

@@ -672,6 +674,13 @@ private:

        params_base.speculative.model_dft   = model_dft.get();
        params_base.speculative.cparams_dft = common_context_params_to_llama(params_dft);
        */

        // FOR TESTING ONLY!!!!!!
        params_base.speculative.type        = COMMON_SPECULATIVE_TYPE_NEXTN;
        params_base.speculative.model_dft   = model;
        params_base.speculative.cparams_dft = common_context_params_to_llama(params_dft);
        params_base.speculative.cparams_dft.graph_type = LLAMA_GRAPH_TYPE_DECODER_MTP;
    }

    std::string & mmproj_path = params_base.mmproj.path;