fixed tensor mappings and working on building the graph
This commit is contained in:
parent 4ceb828112
commit 18c0c23ed8
@@ -3015,9 +3015,6 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * b) {
 
-    printf("Up: {%lld, %lld}\n", a->ne[0], a->ne[1]);
-    printf("Cur: {%lld, %lld}\n", b->ne[0], b->ne[1]);
-
     GGML_ASSERT(ggml_can_mul_mat(a, b));
     GGML_ASSERT(!ggml_is_transposed(a));
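
The prints removed above were checking the operand shapes that ggml_can_mul_mat() asserts. A minimal sketch of that convention, not part of the commit, assuming F32 tensors and an existing context; n_in, n_out and n_tokens are illustrative names:

    // ggml_mul_mat(ctx, w, x) treats ne[0] as the shared (input) dimension:
    // w->ne = {n_in, n_out}, x->ne = {n_in, n_tokens}  ->  result ne = {n_out, n_tokens}.
    // ggml_can_mul_mat() asserts w->ne[0] == x->ne[0].
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_out);
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_tokens);
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x); // y->ne = {n_out, n_tokens}
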
@@ -367,7 +367,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.up_proj",                 # llama-hf refact nemotron olmo2
             "layers.{bid}.feed_forward.w3",                   # llama-pth
             "encoder.layer.{bid}.intermediate.dense",         # bert
-            "layers.{bid}.mlp.Wo",                            # modern bert
+            "layers.{bid}.mlp.Wi",                            # modern bert
             "transformer.layer.{bid}.ffn.lin1",               # distillbert
             "transformer.h.{bid}.mlp.fc_in",                  # gpt-j
             "transformer.h.{bid}.mlp.linear_3",               # refact
@@ -467,7 +467,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.down_proj",                         # llama-hf nemotron olmo2
             "layers.{bid}.feed_forward.w2",                             # llama-pth
             "encoder.layer.{bid}.output.dense",                         # bert
-            "layers.{bid}.mlp.Wi",                                      # modern bert
+            "layers.{bid}.mlp.Wo",                                      # modern bert
             "transformer.layer.{bid}.ffn.lin2",                         # distillbert
             "transformer.h.{bid}.mlp.fc_out",                           # gpt-j
             "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",    # persimmon
@@ -2708,8 +2708,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
                     layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_ff, n_embd}, 0); // [3072, 384]
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, 2 * n_ff}, 0);
+                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                 }
             } break;
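
The swapped shapes follow the same convention: a weight's ne[0] is the dimension it consumes in ggml_mul_mat. ModernBERT's fused Wi projects n_embd to 2*n_ff (up and gate halves concatenated for the GeGLU), and Wo projects n_ff back to n_embd. A small sanity-check sketch, not part of the commit, using the names already in scope in this block:

    // Expected extents after loading (hypothetical asserts):
    // ffn_up   (fused Wi): ne = {n_embd, 2*n_ff}
    // ffn_down (Wo)      : ne = {n_ff,   n_embd}
    GGML_ASSERT(layer.ffn_up->ne[0]   == n_embd && layer.ffn_up->ne[1]   == 2 * n_ff);
    GGML_ASSERT(layer.ffn_down->ne[0] == n_ff   && layer.ffn_down->ne[1] == n_embd);
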
@@ -7548,6 +7548,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa  = hparams.n_embd_v_gqa(); // == n_head_kv * n_embd_head
         const int64_t n_tokens    = ubatch.n_tokens;
+        const int64_t n_ff        = hparams.n_ff();
 
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -7667,30 +7668,63 @@ struct llm_build_modern_bert : public llm_graph_context {
 
         // MLP (prefer GEGLU if gate exists or up has 2*n_ff rows)
         ggml_tensor * mlp_out = nullptr;
         const bool has_gate_tensor = (model.layers[il].ffn_gate != nullptr);
         const bool up_is_2x = (model.layers[il].ffn_up && model.layers[il].ffn_up->ne[0] == 2*hparams.n_ff());
+        ggml_tensor * ffn_gate_view = model.layers[il].ffn_gate;
+        ggml_tensor * ffn_up_view   = model.layers[il].ffn_up;
+
+        if (ffn_gate_view == nullptr && ffn_up_view) {
+            // Case A: weight stored as (2*ffn, hidden) -> split rows into two (ffn x hidden)
+            if (ffn_up_view->ne[0] == 2 * n_ff && ffn_up_view->ne[1] == n_embd) {
+                // top half (ffn up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    /*ne0*/ n_ff, /*ne1*/ n_embd,
+                    /*nb1*/ model.layers[il].ffn_up->nb[1],
+                    /*offset_bytes*/ (size_t) 0);
+                // bottom half (gate)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    /*ne0*/ n_ff, /*ne1*/ n_embd,
+                    /*nb1*/ model.layers[il].ffn_up->nb[1],
+                    /*offset_bytes*/ (size_t) n_ff * model.layers[il].ffn_up->nb[0]);
+            }
+            // Case B: weight stored as (hidden, 2*ffn) -> split rows of dim 1 into two (hidden x ffn)
+            else if (ffn_up_view->ne[0] == n_embd && ffn_up_view->ne[1] == 2 * n_ff) {
+                // top half (ffn up)
+                ffn_up_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    n_embd, n_ff,
+                    model.layers[il].ffn_up->nb[1],
+                    0);
+                ffn_up_view = ggml_cont(ctx0, ffn_up_view);
+                // bottom half (gate)
+                ffn_gate_view = ggml_view_2d(ctx0, model.layers[il].ffn_up,
+                    n_embd, n_ff,
+                    model.layers[il].ffn_up->nb[1],
+                    (size_t) n_ff * model.layers[il].ffn_up->nb[1]);
+                ffn_gate_view = ggml_cont(ctx0, ffn_gate_view);
+            }
+        }
+
+        ggml_tensor * ffn_down_view = model.layers[il].ffn_down;
+        LLAMA_LOG_INFO("ffn shapes: Up: {%lld, %lld}, Gate: {%lld, %lld}, Down: {%lld, %lld}\n",
+            ffn_up_view->ne[0], ffn_up_view->ne[1], ffn_gate_view->ne[0], ffn_gate_view->ne[1], ffn_down_view->ne[0], ffn_down_view->ne[1]);
 
         if (has_gate_tensor || up_is_2x) {
             mlp_out = build_ffn(
                 h,
                 model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
-                model.layers[il].ffn_gate, /*gate_b*/ NULL, /*gate_shexp*/ NULL,
+                ffn_gate_view,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
                 model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
                 /*expert_scores*/ NULL,
-                LLM_FFN_GEGLU, LLM_FFN_PAR, il);
-            cb(mlp_out, "ffn_out_geglu", il);
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         } else {
-            LLAMA_LOG_INFO("Ffn_up : {%lld, %lld}, ffn_down : {%lld, %lld}\n", model.layers[il].ffn_up->ne[0], model.layers[il].ffn_up->ne[1],
-                model.layers[il].ffn_down->ne[0], model.layers[il].ffn_down->ne[0]);
             mlp_out = build_ffn(
                 h,
-                model.layers[il].ffn_up,   /*up_b*/   NULL, /*up_shexp*/   NULL,
-                /*gate*/ NULL,             /*gate_b*/ NULL, /*gate_shexp*/ NULL,
-                model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
-                /*expert_scores*/ NULL,
-                LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(mlp_out, "ffn_out_gelu", il);
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_GEGLU, LLM_FFN_PAR, il
+            );
+            cb(mlp_out, "ffn_out_geglu", il);
         }
// Residual after MLP
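
Putting the pieces of this hunk together: the fused up weight is split into an up view and a gate view, which feed a GEGLU-style FFN. A standalone sketch of that path, not the committed code, assuming the {n_embd, 2*n_ff} layout loaded above, F32 weights, and that the first n_ff rows of the fused tensor are the up branch and the last n_ff rows the gate (the packing order in the checkpoint is an assumption here); h is the layer input as in the code above:

    // Split the fused weight along dim 1: rows [0, n_ff) and [n_ff, 2*n_ff).
    // Offsets are in bytes, so they are derived from nb[1] (the per-row stride).
    ggml_tensor * w_fused = model.layers[il].ffn_up;                        // ne = {n_embd, 2*n_ff}
    ggml_tensor * w_up    = ggml_view_2d(ctx0, w_fused, n_embd, n_ff,
                                         w_fused->nb[1], 0);
    ggml_tensor * w_gate  = ggml_view_2d(ctx0, w_fused, n_embd, n_ff,
                                         w_fused->nb[1], (size_t) n_ff * w_fused->nb[1]);
    // Some backends want contiguous weights; ggml_cont() copies the views if needed.
    w_up   = ggml_cont(ctx0, w_up);
    w_gate = ggml_cont(ctx0, w_gate);

    // GEGLU: ffn_out = W_down * (gelu(W_gate * h) .* (W_up * h)), elementwise product.
    ggml_tensor * g = ggml_mul_mat(ctx0, w_gate, h);                        // {n_ff, n_tokens}
    ggml_tensor * u = ggml_mul_mat(ctx0, w_up,   h);                        // {n_ff, n_tokens}
    ggml_tensor * ffn_out = ggml_mul_mat(ctx0, model.layers[il].ffn_down,
                                         ggml_mul(ctx0, ggml_gelu(ctx0, g), u)); // {n_embd, n_tokens}
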