From 1eea6a2968b401c8377bc9d4d49a7a64ffbfb3d0 Mon Sep 17 00:00:00 2001
From: Richard Davison
Date: Thu, 12 Mar 2026 00:22:49 +0100
Subject: [PATCH] graph : add optional scale parameter to build_lora_mm [no
 ci] (#20427)

---
 src/llama-graph.cpp     |  7 ++++++-
 src/llama-graph.h       |  5 +++--
 src/models/bitnet.cpp   | 25 +++++--------------------
 src/models/llama.cpp    | 15 +++------------
 src/models/qwen3.cpp    | 15 +++------------
 src/models/qwen3moe.cpp | 15 +++------------
 6 files changed, 23 insertions(+), 59 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 41e804a8f8..9a215bb77a 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -900,7 +900,8 @@ ggml_tensor * llm_graph_context::build_cvec(
 
 ggml_tensor * llm_graph_context::build_lora_mm(
         ggml_tensor * w,
-        ggml_tensor * cur) const {
+        ggml_tensor * cur,
+        ggml_tensor * w_s) const {
     ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
 
     for (const auto & lora : *loras) {
@@ -921,6 +922,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
         res = ggml_add(ctx0, res, ab_cur);
     }
 
+    if (w_s) {
+        res = ggml_mul(ctx0, res, w_s);
+    }
+
     return res;
 }
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index c8817b8f1e..4855685ef7 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -764,10 +764,11 @@ struct llm_graph_context {
             ggml_tensor * cur,
                       int il) const;
 
-    // do mat_mul, while optionally apply lora
+    // do mat_mul, while optionally apply lora and per-tensor scale
    ggml_tensor * build_lora_mm(
             ggml_tensor * w,
-            ggml_tensor * cur) const;
+            ggml_tensor * cur,
+            ggml_tensor * w_s = nullptr) const;
 
     // do mat_mul_id, while optionally apply lora
     ggml_tensor * build_lora_mm_id(
diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp
index af2cc34bea..ccf5bc8e82 100644
--- a/src/models/bitnet.cpp
+++ b/src/models/bitnet.cpp
@@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
         // self-attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
             }
 
             // B1.K
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
             }
 
             // B1.V
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                     LLM_NORM_RMS, il);
             cb(cur, "attn_sub_norm", il);
 
-            cur = build_lora_mm(model.layers[il].wo, cur);
-            if (model.layers[il].wo_s) {
-                cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-            }
+            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
             if (model.layers[il].bo) {
                 cur = ggml_add(ctx0, cur, model.layers[il].bo);
             }
@@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                     LLM_NORM_RMS, il);
             cb(cur, "ffn_sub_norm", il);
 
-            cur = build_lora_mm(model.layers[il].ffn_down, cur);
-            if (model.layers[il].ffn_down_s) {
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_s);
-            }
+            cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
             cb(cur, "ffn_down", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index d2434b63a5..e08ae0c0b0 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -43,28 +43,19 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra
             ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
             }
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
             }
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index c13cb6c4fd..5208166847 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -30,22 +30,13 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
         // self-attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
 
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
 
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index 5e26119278..dba46618ff 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -30,22 +30,13 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
         // self_attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
 
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
 
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
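
--
Usage sketch (illustrative note, not part of the patch): the new optional third
argument folds the per-tensor scale into build_lora_mm() itself, so the repeated
scale-by-hand pattern at every call site goes away. Passing nullptr (the default)
is a no-op, and because the multiply runs after the LoRA loop, the scale is
applied on top of the LoRA deltas, matching the old call-site behavior:

    // before: each caller applied the optional scale by hand
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
    if (model.layers[il].wq_s) {
        Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
    }

    // after: pass the (possibly null) scale tensor directly
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);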