models : make deepstack graphs (e.g. Qwen3 VL) have constant topology

This commit is contained in:
Georgi Gerganov 2026-01-03 19:10:17 +02:00
parent eec840331e
commit fc2dd1524e
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 67 additions and 47 deletions

View File

@ -2899,7 +2899,7 @@ void llama_context::opt_epoch_iter(
}; };
ctx_compute_opt = ggml_init(params); ctx_compute_opt = ggml_init(params);
} }
ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits()); ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_inp_tokens(), res->get_logits());
ggml_opt_alloc(opt_ctx, train); ggml_opt_alloc(opt_ctx, train);
res->set_inputs(&ubatch); res->set_inputs(&ubatch);

View File

@ -33,8 +33,8 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
bool res = true; bool res = true;
res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens);
return res; return res;
} }
@ -576,7 +576,8 @@ int64_t llm_graph_result::get_max_nodes() const {
} }
void llm_graph_result::reset() { void llm_graph_result::reset() {
t_tokens = nullptr; t_inp_tokens = nullptr;
t_inp_embd = nullptr;
t_logits = nullptr; t_logits = nullptr;
t_embd = nullptr; t_embd = nullptr;
t_embd_pooled = nullptr; t_embd_pooled = nullptr;
@ -1280,26 +1281,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
// input embeddings with optional lora // input embeddings with optional lora
ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_embd_inp = hparams.n_embd_inp();
auto inp = std::make_unique<llm_graph_input_embd>(n_embd); auto inp = std::make_unique<llm_graph_input_embd>(n_embd_inp);
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
cb(inp->tokens, "inp_tokens", -1); cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens); ggml_set_input(inp->tokens);
res->t_inp_tokens = inp->tokens;
if (ubatch.token) { inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_inp, ubatch.n_tokens);
res->t_tokens = inp->tokens;
}
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
cb(inp->embd, "inp_embd", -1); cb(inp->embd, "inp_embd", -1);
ggml_set_input(inp->embd); ggml_set_input(inp->embd);
res->t_inp_embd = inp->embd;
ggml_tensor * cur; // select one of the 2 inputs, based on the batch contents
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
std::array<ggml_tensor *, 2> inps;
// token embeddings // token embeddings path (ubatch.token != nullptr)
{ {
auto & cur = inps[0];
cur = ggml_get_rows(ctx0, tok_embd, inp->tokens); cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
// apply lora for embedding tokens if needed // apply lora for embedding tokens if needed
@ -1319,11 +1322,33 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
cur = ggml_add(ctx0, cur, inpL_delta); cur = ggml_add(ctx0, cur, inpL_delta);
} }
if (hparams.n_deepstack_layers > 0) {
// note: ensure the selected node is always assigned to the same backend
// if we don't do this, the `ggml_get_rows()` above (inps[0]) can remain on the CPU, while the inps[1]
// below could be performed on the device (if n_deepstack_layers > 0, e.g. Qwen3 VL), which would result
// in different backend ids, depending on which input path is selected
// TODO: is there a better way to do this?
cur = ggml_cont(ctx0, cur);
}
} }
std::array<ggml_tensor *, 2> inps = { cur, inp->embd }; // vector embeddings path (ubatch.embd != nullptr)
{
auto & cur = inps[1];
cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1); cur = inp->embd;
if (hparams.n_deepstack_layers > 0) {
cur = ggml_view_2d(ctx0, cur, hparams.n_embd, n_tokens, cur->nb[1], 0);
cur = ggml_cont (ctx0, cur); // makes the shape of this node the same as the ubatch.token path
}
}
assert(ggml_are_same_shape (inps[0], inps[1]));
assert(ggml_are_same_stride(inps[0], inps[1]));
ggml_tensor * cur = ggml_build_forward_select(gf, inps.data(), inps.size(), ubatch.token ? 0 : 1);
// For Granite architecture // For Granite architecture
if (hparams.f_embedding_scale != 0.0f) { if (hparams.f_embedding_scale != 0.0f) {
@ -1429,7 +1454,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
//} //}
const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp(); const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp();
const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
ggml_set_input(cur); ggml_set_input(cur);

View File

@ -539,7 +539,7 @@ public:
virtual ~llm_graph_result() = default; virtual ~llm_graph_result() = default;
ggml_tensor * get_tokens() const { return t_tokens; } ggml_tensor * get_inp_tokens() const { return t_inp_tokens; }
ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_logits() const { return t_logits; }
ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd() const { return t_embd; }
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
@ -566,7 +566,8 @@ public:
void set_params(const llm_graph_params & params); void set_params(const llm_graph_params & params);
// important graph nodes // important graph nodes
ggml_tensor * t_tokens = nullptr; ggml_tensor * t_inp_tokens = nullptr;
ggml_tensor * t_inp_embd = nullptr;
ggml_tensor * t_logits = nullptr; ggml_tensor * t_logits = nullptr;
ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd = nullptr;
ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_embd_pooled = nullptr;

View File

@ -250,7 +250,7 @@ ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
if (ubatch.token) { if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
ggml_set_input(inp->tokens); ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens; res->t_inp_tokens = inp->tokens;
inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens); inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup)); inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));

View File

@ -2,7 +2,8 @@
llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -16,17 +17,6 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@ -120,9 +110,16 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (ubatch.embd && (size_t)il < n_deepstack_layers) { if (il < (int) n_deepstack_layers) {
cur = ggml_add(ctx0, cur, deepstack_features[il]); std::array<ggml_tensor *, 2> curs = { cur, nullptr };
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
curs[1] = cur;
cur = ggml_build_forward_select(gf, curs.data(), curs.size(), ubatch.embd ? 1 : 0);
} }
// input for next layer // input for next layer

View File

@ -2,7 +2,8 @@
llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const size_t n_deepstack_layers = hparams.n_deepstack_layers; const size_t n_deepstack_layers = hparams.n_deepstack_layers;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -16,17 +17,6 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
int sections[4]; int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
std::vector<ggml_tensor *> deepstack_features(n_deepstack_layers, nullptr);
if (ubatch.embd) {
// Image input: split main embd and deepstack embds
ggml_tensor * inpL_main = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], 0);
for (size_t i = 0; i < n_deepstack_layers; i++) {
deepstack_features[i] = ggml_view_2d(ctx0, inpL, n_embd, n_tokens, inpL->nb[1], (i + 1) * n_embd * sizeof(float));
}
inpL = inpL_main;
}
// inp_pos - contains the positions // inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_pos = build_inp_pos();
@ -113,9 +103,16 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
if (ubatch.embd && (size_t)il < n_deepstack_layers) { if (il < (int) n_deepstack_layers) {
cur = ggml_add(ctx0, cur, deepstack_features[il]); std::array<ggml_tensor *, 2> curs = { cur, nullptr };
ggml_tensor * ds = ggml_view_2d(ctx0, res->t_inp_embd, n_embd, n_tokens, res->t_inp_embd->nb[1], (il + 1) * n_embd * sizeof(float));
cur = ggml_add(ctx0, cur, ds);
cb(cur, "deepstack_out", il); cb(cur, "deepstack_out", il);
curs[1] = cur;
cur = ggml_build_forward_select(gf, curs.data(), curs.size(), ubatch.embd ? 1 : 0);
} }
// input for next layer // input for next layer