llama: dynamic head_dim and n_rot for SWA (#20301)
* llama: dynamic head_dim and n_rot for SWA * also add gguf_writer wrappers * fix build * build_rope_shift arg reorder
This commit is contained in:
parent
23fbfcb1ad
commit
59db9a357d
|
|
@ -177,6 +177,8 @@ class Keys:
|
|||
TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
|
||||
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
|
||||
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
|
||||
KEY_LENGTH_SWA = "{arch}.attention.key_length_swa"
|
||||
VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa"
|
||||
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
|
||||
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
||||
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
|
||||
|
|
@ -188,6 +190,7 @@ class Keys:
|
|||
|
||||
class Rope:
|
||||
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
||||
DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
|
||||
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
||||
FREQ_BASE = "{arch}.rope.freq_base"
|
||||
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
|
||||
|
|
|
|||
|
|
@ -773,6 +773,12 @@ class GGUFWriter:
|
|||
def add_value_length_mla(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
|
||||
|
||||
def add_key_length_swa(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.KEY_LENGTH_SWA.format(arch=self.arch), length)
|
||||
|
||||
def add_value_length_swa(self, length: int) -> None:
|
||||
self.add_uint32(Keys.Attention.VALUE_LENGTH_SWA.format(arch=self.arch), length)
|
||||
|
||||
def add_indexer_head_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count)
|
||||
|
||||
|
|
@ -946,6 +952,9 @@ class GGUFWriter:
|
|||
def add_rope_dimension_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
|
||||
|
||||
def add_rope_dimension_count_swa(self, count: int) -> None:
|
||||
self.add_uint32(Keys.Rope.DIMENSION_COUNT_SWA.format(arch=self.arch), count)
|
||||
|
||||
def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
|
||||
self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
|
||||
|
||||
|
|
|
|||
|
|
@ -230,11 +230,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
|
||||
{ LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" },
|
||||
{ LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
|
||||
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
|
||||
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
|
||||
|
|
|
|||
|
|
@ -234,11 +234,14 @@ enum llm_kv {
|
|||
LLM_KV_ATTENTION_TEMPERATURE_SCALE,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
|
||||
LLM_KV_ATTENTION_KEY_LENGTH_SWA,
|
||||
LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
|
||||
LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
|
||||
LLM_KV_ATTENTION_INDEXER_TOP_K,
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
|
||||
LLM_KV_ROPE_DIMENSION_SECTIONS,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
LLM_KV_ROPE_FREQ_BASE_SWA,
|
||||
|
|
|
|||
|
|
@ -2876,19 +2876,23 @@ llama_context * llama_init_from_model(
|
|||
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_k);
|
||||
if (model->hparams.n_embd_head_k % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
|
||||
return nullptr;
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
|
||||
const uint32_t blck_size = ggml_blck_size(params.type_v);
|
||||
if (model->hparams.n_embd_head_v % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
|
||||
return nullptr;
|
||||
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
|
||||
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
|
||||
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
|
||||
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -849,13 +849,13 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
|||
ubatch (params.ubatch),
|
||||
n_embd (hparams.n_embd),
|
||||
n_layer (hparams.n_layer),
|
||||
n_rot (hparams.n_rot),
|
||||
n_rot (hparams.n_rot()),
|
||||
n_ctx (cparams.n_ctx),
|
||||
n_head (hparams.n_head()),
|
||||
n_head_kv (hparams.n_head_kv()),
|
||||
n_embd_head_k (hparams.n_embd_head_k),
|
||||
n_embd_head_k (hparams.n_embd_head_k()),
|
||||
n_embd_k_gqa (hparams.n_embd_k_gqa()),
|
||||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_head_v (hparams.n_embd_head_v()),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
|
||||
|
|
|
|||
|
|
@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
|
|||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_rot(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_rot_swa : n_rot_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_inp() const {
|
||||
uint32_t n_embd_inp = n_embd;
|
||||
|
||||
|
|
@ -76,16 +84,32 @@ uint32_t llama_hparams::n_embd_out() const {
|
|||
return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
|
||||
if (il < n_layer) {
|
||||
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
|
||||
}
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_k * n_head_kv;
|
||||
return n_embd_head_k(il) * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
const uint32_t n_head_kv = this->n_head_kv(il);
|
||||
|
||||
return n_embd_head_v * n_head_kv;
|
||||
return n_embd_head_v(il) * n_head_kv;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
||||
|
|
@ -197,11 +221,11 @@ bool llama_hparams::is_mla() const {
|
|||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_k_mla() const {
|
||||
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
|
||||
return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_head_v_mla() const {
|
||||
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
|
||||
return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
|
||||
}
|
||||
|
||||
bool llama_hparams::has_kv(uint32_t il) const {
|
||||
|
|
|
|||
|
|
@ -44,13 +44,20 @@ struct llama_hparams {
|
|||
uint32_t n_embd;
|
||||
uint32_t n_layer;
|
||||
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
|
||||
uint32_t n_rot;
|
||||
uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_expert = 0;
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_rel_attn_bkts = 0;
|
||||
|
||||
// different head size for full_attention and SWA layers
|
||||
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
|
||||
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
|
||||
uint32_t n_embd_head_k_swa;
|
||||
uint32_t n_embd_head_v_swa;
|
||||
|
||||
// different RoPE dimensions for full_attention and SWA layers
|
||||
uint32_t n_rot_full;
|
||||
uint32_t n_rot_swa;
|
||||
|
||||
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
|
||||
uint32_t n_embd_head_k_mla_impl = 0;
|
||||
uint32_t n_embd_head_v_mla_impl = 0;
|
||||
|
|
@ -247,12 +254,18 @@ struct llama_hparams {
|
|||
|
||||
uint32_t n_gqa(uint32_t il = 0) const;
|
||||
|
||||
uint32_t n_rot(uint32_t il = 0) const;
|
||||
|
||||
// dimension of main + auxiliary input embeddings
|
||||
uint32_t n_embd_inp() const;
|
||||
|
||||
// dimension of output embeddings
|
||||
uint32_t n_embd_out() const;
|
||||
|
||||
// dimension of key/value embeddings for each head (per layer)
|
||||
uint32_t n_embd_head_k(uint32_t il = 0) const;
|
||||
uint32_t n_embd_head_v(uint32_t il = 0) const;
|
||||
|
||||
// dimension of key embeddings across all k-v heads
|
||||
uint32_t n_embd_k_gqa(uint32_t il = 0) const;
|
||||
|
||||
|
|
|
|||
|
|
@ -1033,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
||||
|
||||
return ggml_view_4d(ctx, k,
|
||||
hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, hparams.n_embd_head_k),
|
||||
hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(k->type, hparams.n_embd_head_k(il)),
|
||||
ggml_row_size(k->type, n_embd_k_gqa),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size),
|
||||
ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
|
||||
|
|
@ -1056,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
if (!v_trans) {
|
||||
// note: v->nb[1] <= v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
|
||||
hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
|
||||
ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3]
|
||||
ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
|
||||
|
|
@ -1065,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
|||
|
||||
// note: v->nb[1] > v->nb[2]
|
||||
return ggml_view_4d(ctx, v,
|
||||
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
|
||||
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
|
||||
n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
|
||||
ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1]
|
||||
ggml_row_size(v->type, kv_size), // v->nb[2]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3]
|
||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
|
||||
|
|
@ -1544,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const {
|
||||
float freq_scale,
|
||||
uint32_t il) const {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
|
|
@ -1552,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
|||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & n_rot = hparams.n_rot(il);
|
||||
const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
|
||||
// @ngxson : this is a workaround
|
||||
// for M-RoPE, we want to rotate the whole vector when doing KV shift
|
||||
|
|
@ -1606,13 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
auto * ctx = res->get_ctx();
|
||||
auto * gf = res->get_gf();
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
|
||||
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_k_shift>(this);
|
||||
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
|
||||
|
|
@ -1626,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
const auto n_rot = hparams.n_rot(il);
|
||||
const auto n_embd_head_k = hparams.n_embd_head_k(il);
|
||||
const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
|
||||
|
||||
const float freq_base_l = model.get_rope_freq_base (cparams, il);
|
||||
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
||||
|
||||
|
|
@ -1638,7 +1636,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|||
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
||||
ggml_row_size(layer.k->type, n_embd_nope));
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -264,7 +264,8 @@ private:
|
|||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
float freq_scale,
|
||||
uint32_t il) const;
|
||||
|
||||
ggml_cgraph * build_graph_shift(
|
||||
llm_graph_result * res,
|
||||
|
|
|
|||
|
|
@ -918,7 +918,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|||
} break;
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
const int n_embd_head = hparams.n_embd_head_v;
|
||||
const int n_embd_head = hparams.n_embd_head_v();
|
||||
const int n_head = hparams.n_head();
|
||||
ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
|
||||
|
|
|
|||
|
|
@ -186,8 +186,10 @@ void llama_model_saver::add_kv_from_model() {
|
|||
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full);
|
||||
add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa);
|
||||
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||
|
|
@ -199,7 +201,8 @@ void llama_model_saver::add_kv_from_model() {
|
|||
|
||||
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full);
|
||||
add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa);
|
||||
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||
|
|
|
|||
|
|
@ -459,26 +459,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
|
||||
// gpt-j n_rot = rotary_dim
|
||||
|
||||
hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
|
||||
hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);
|
||||
|
||||
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
|
||||
hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);
|
||||
|
||||
// sanity check for n_rot (optional)
|
||||
hparams.n_rot = hparams.n_embd_head_k;
|
||||
hparams.n_rot_full = hparams.n_embd_head_k_full;
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);
|
||||
|
||||
if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
hparams.n_rot = 0;
|
||||
hparams.n_embd_head_k = 0;
|
||||
hparams.n_embd_head_v = 0;
|
||||
hparams.n_rot_full = 0;
|
||||
hparams.n_embd_head_k_full = 0;
|
||||
hparams.n_embd_head_v_full = 0;
|
||||
}
|
||||
|
||||
// head size and n_rot for SWA layers
|
||||
{
|
||||
hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
|
||||
hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
|
||||
|
||||
hparams.n_rot_swa = hparams.n_rot_full;
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
|
||||
}
|
||||
|
||||
// for differentiating model types
|
||||
|
|
@ -1114,10 +1125,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
// Load attention parameters
|
||||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
|
||||
} break;
|
||||
case LLM_ARCH_PLAMO3:
|
||||
{
|
||||
|
|
@ -1212,7 +1219,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
|
||||
hparams.f_attention_scale = type == LLM_TYPE_27B
|
||||
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
||||
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
||||
: 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3:
|
||||
{
|
||||
|
|
@ -1245,7 +1252,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
// ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
|
||||
hparams.f_attention_scale = type == LLM_TYPE_27B
|
||||
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
||||
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
||||
: 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
|
||||
} break;
|
||||
case LLM_ARCH_GEMMA3N:
|
||||
{
|
||||
|
|
@ -1294,7 +1301,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
case 24: type = LLM_TYPE_0_3B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
||||
hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
|
||||
|
||||
} break;
|
||||
case LLM_ARCH_STARCODER2:
|
||||
|
|
@ -2487,7 +2494,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
|
||||
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
|
||||
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
|
||||
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||
ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
|
||||
|
||||
|
|
@ -2518,6 +2524,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|||
|
||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||
|
||||
// full_attention layer only use half of the RoPE dimensions
|
||||
hparams.n_rot_full = hparams.n_rot_full / 2;
|
||||
|
||||
// MoE + SWA parameters
|
||||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
||||
|
|
@ -2661,13 +2670,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v();
|
||||
const int64_t n_ff = hparams.n_ff();
|
||||
const int64_t n_embd_gqa = n_embd_v_gqa;
|
||||
const int64_t n_vocab = vocab.n_tokens();
|
||||
const int64_t n_token_types = vocab.n_token_types();
|
||||
const int64_t n_rot = hparams.n_rot;
|
||||
const int64_t n_rot = hparams.n_rot();
|
||||
const int64_t n_expert = hparams.n_expert;
|
||||
const int64_t n_expert_used = hparams.n_expert_used;
|
||||
const int64_t n_ctx_train = hparams.n_ctx_train;
|
||||
|
|
@ -2967,8 +2976,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_MINICPM3:
|
||||
{
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
|
||||
|
||||
const int64_t q_lora_rank = hparams.n_lora_q;
|
||||
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
|
@ -3840,8 +3849,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
|
||||
|
||||
// attention parameters
|
||||
const uint32_t qk_dim = hparams.n_embd_head_k;
|
||||
const uint32_t v_dim = hparams.n_embd_head_v;
|
||||
const uint32_t qk_dim = hparams.n_embd_head_k();
|
||||
const uint32_t v_dim = hparams.n_embd_head_v();
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
|
|
@ -3901,8 +3910,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_PLAMO3:
|
||||
{
|
||||
const int64_t head_dim_q = hparams.n_embd_head_k;
|
||||
const int64_t head_dim_v = hparams.n_embd_head_v;
|
||||
const int64_t head_dim_q = hparams.n_embd_head_k();
|
||||
const int64_t head_dim_v = hparams.n_embd_head_v();
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
|
|
@ -4649,7 +4658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_SEED_OSS:
|
||||
{
|
||||
const uint32_t head_dim = hparams.n_embd_head_k;
|
||||
const uint32_t head_dim = hparams.n_embd_head_k();
|
||||
const int64_t n_qo_dim = n_head * head_dim;
|
||||
const int64_t n_kv_dim = n_head_kv * head_dim;
|
||||
|
||||
|
|
@ -4878,7 +4887,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
|
||||
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
|
||||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
||||
GGML_ASSERT(n_embd_head_qk_nope >= 1);
|
||||
|
||||
|
|
@ -4957,8 +4966,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
} break;
|
||||
case LLM_ARCH_PLM:
|
||||
{
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
|
||||
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
|
@ -5396,7 +5405,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
|
||||
const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
|
||||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
||||
|
||||
const int64_t q_lora_rank = hparams.n_lora_q;
|
||||
|
|
@ -5680,7 +5689,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
const int64_t n_expert = hparams.n_expert;
|
||||
const int64_t n_expert_used = hparams.n_expert_used;
|
||||
const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
|
||||
const int64_t head_dim = hparams.n_embd_head_k;
|
||||
const int64_t head_dim = hparams.n_embd_head_k();
|
||||
const int64_t n_qo_dim = n_head * head_dim;
|
||||
const int64_t n_kv_dim = n_head_kv * head_dim;
|
||||
|
||||
|
|
@ -6968,7 +6977,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
// Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
|
||||
// Note: hparams.n_rot may be 72 (from conversion) but actual is 64
|
||||
const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
|
||||
const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
|
||||
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
|
||||
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
|
||||
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
|
||||
|
|
@ -7339,7 +7348,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
// ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
|
||||
uint32_t n_rot_max = 0;
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
n_rot_max = std::max(n_rot_max, hparams.n_rot);
|
||||
n_rot_max = std::max(n_rot_max, hparams.n_rot());
|
||||
}
|
||||
if (n_rot_max == 0) {
|
||||
n_rot_max = n_rot;
|
||||
|
|
@ -7674,11 +7683,11 @@ void llama_model::print_info() const {
|
|||
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
||||
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
|
||||
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
|
||||
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
|
||||
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
|
||||
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
|
||||
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
|
||||
|
|
@ -7702,6 +7711,9 @@ void llama_model::print_info() const {
|
|||
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
|
||||
LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
|
||||
LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
|
||||
LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
|
||||
llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
#include <float.h>
|
||||
|
||||
llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * inpL;
|
||||
ggml_tensor * cur;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|||
const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();
|
||||
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@
|
|||
llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
//copied from qwen2
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@
|
|||
|
||||
llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@
|
|||
template <bool iswa>
|
||||
llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
|
||||
llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
|
||||
llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
template <bool iswa>
|
||||
llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params),
|
||||
model(model),
|
||||
n_embd_head(model.hparams.n_embd_head_k),
|
||||
n_embd_head(model.hparams.n_embd_head_k()),
|
||||
n_embd_altup(model.hparams.n_embd_altup),
|
||||
n_altup(model.hparams.n_altup),
|
||||
i_altup_act(model.hparams.i_altup_act) {
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * pos;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ llm_build_granite::llm_build_granite(
|
|||
const llm_graph_params & params)
|
||||
: llm_graph_context(params) {
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,11 +2,11 @@
|
|||
|
||||
llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
// JAIS-2 model graph builder
|
||||
// Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
|
||||
llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -102,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
|||
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
||||
// qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
|
||||
// Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim
|
||||
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128
|
||||
// Attention scale for MLA
|
||||
const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
|||
inp_attn_type * inp_attn,
|
||||
int il) -> ggml_tensor * {
|
||||
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
|
||||
const auto n_embd_head = hparams.n_embd_head_v;
|
||||
const auto n_embd_head = hparams.n_embd_head_v();
|
||||
const auto n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
auto * q = build_lora_mm(model.layers[il].wq, cur);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
// LLaDA is similar to LLaMA but uses non-causal attention for diffusion
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
template <bool embed>
|
||||
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
const int64_t n_embd_base = 256;
|
||||
const float scale_embd = 12.0f;
|
||||
const float scale_depth = 1.4f;
|
||||
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
|
||||
const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));
|
||||
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
|
|
@ -51,21 +51,21 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
LLM_NORM_RMS, il);
|
||||
cb(q, "q", il);
|
||||
|
||||
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
||||
// {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
||||
cb(q, "q", il);
|
||||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
0);
|
||||
cb(q_nope, "q_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
||||
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
cb(q_pe, "q_pe", il);
|
||||
|
||||
|
|
@ -97,15 +97,15 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
|
|||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
0);
|
||||
cb(k_nope, "k_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_v, n_tokens}
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
||||
cb(v_states, "v_states", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
// GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
// GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@
|
|||
|
||||
|
||||
llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * pos;
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_mamba_base(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
//GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
//GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
template <bool iswa>
|
||||
llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -5,10 +5,10 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr
|
|||
|
||||
// NOTE: same with qwen2vl.cpp, but bias tensors are optional
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
|
||||
llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * attn_norm_output;
|
||||
|
|
|
|||
|
|
@ -2,10 +2,10 @@
|
|||
|
||||
template<bool iswa>
|
||||
llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -106,9 +106,9 @@ ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv
|
|||
cb(qkv, "wqkv", il);
|
||||
|
||||
// split QKV tensor into Q, K, V
|
||||
const int64_t n_embd_head_q = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head_q = hparams.n_embd_head_k();
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v();
|
||||
int32_t n_head = hparams.n_head(il);
|
||||
int32_t n_head_kv = hparams.n_head_kv(il);
|
||||
|
||||
|
|
|
|||
|
|
@ -3,8 +3,8 @@
|
|||
template <bool iswa>
|
||||
llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_graph_context(params) {
|
||||
const int64_t head_dim_q = hparams.n_embd_head_k;
|
||||
const int64_t head_dim_v = hparams.n_embd_head_v;
|
||||
const int64_t head_dim_q = hparams.n_embd_head_k();
|
||||
const int64_t head_dim_v = hparams.n_embd_head_v();
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
|
||||
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));
|
||||
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot();
|
||||
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
|
||||
|
||||
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
||||
|
||||
|
|
@ -38,15 +38,15 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &
|
|||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
0);
|
||||
cb(q_nope, "q_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
||||
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k()),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
|
||||
ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
cb(q_pe, "q_pe", il);
|
||||
|
||||
|
|
@ -78,23 +78,23 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &
|
|||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
|
||||
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
0);
|
||||
cb(k_nope, "k_nope", il);
|
||||
|
||||
// and {n_head * n_embd_head_v, n_tokens}
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
||||
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
|
||||
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
||||
cb(v_states, "v_states", il);
|
||||
|
||||
v_states = ggml_cont(ctx0, v_states);
|
||||
cb(v_states, "v_states", il);
|
||||
|
||||
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
||||
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
||||
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens,
|
||||
ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head),
|
||||
0);
|
||||
cb(v_states, "v_states", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -2,9 +2,9 @@
|
|||
|
||||
|
||||
llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_delta_net_base(params), model(model) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
|
|||
ggml_tensor * inp_pos,
|
||||
int * sections,
|
||||
int il) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
|
||||
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@
|
|||
|
||||
llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
|
||||
llm_build_delta_net_base(params), model(model) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
int sections[4];
|
||||
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
|
||||
|
|
@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
|
|||
ggml_tensor * inp_pos,
|
||||
int * sections,
|
||||
int il) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
#include "models.h"
|
||||
|
||||
llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -100,8 +100,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
|
|||
ggml_tensor * cur,
|
||||
ggml_tensor * inp_pos,
|
||||
int il) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
|
||||
// Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
|
||||
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
|
|||
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
|
|
@ -4,10 +4,10 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
|
|||
const size_t n_deepstack_layers = hparams.n_deepstack_layers;
|
||||
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v();
|
||||
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
|
||||
GGML_ASSERT(n_embd_head == n_rot);
|
||||
|
||||
ggml_tensor * cur;
|
||||
ggml_tensor * inpL;
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue