diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index eec0ea14e3..01643fd32f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -117,7 +117,8 @@ class ModelBase: small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, sentence_transformers_dense_modules: bool = False, - fuse_gate_up_exps: bool = False): + fuse_gate_up_exps: bool = False, + fuse_qkv: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -139,6 +140,10 @@ class ModelBase: self.fuse_gate_up_exps = fuse_gate_up_exps self._gate_exp_buffer: dict[int, Tensor] = {} self._up_exp_buffer: dict[int, Tensor] = {} + self.fuse_qkv = fuse_qkv + self._q_buffer: dict[int, Tensor] = {} + self._k_buffer: dict[int, Tensor] = {} + self._v_buffer: dict[int, Tensor] = {} self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -551,6 +556,33 @@ class ModelBase: self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): return [] + # Handle Q/K/V tensor fusion if enabled + if self.fuse_qkv and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_Q, bid): + self._q_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_K, bid): + self._k_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_V, bid): + self._v_buffer[bid] = data_torch + + # Check if all three Q, K, V are buffered for this layer + if bid in self._q_buffer and bid in self._k_buffer and bid in self._v_buffer: + q_data = self._q_buffer.pop(bid) + k_data = self._k_buffer.pop(bid) + v_data = self._v_buffer.pop(bid) + # Q shape: (n_embd_q, n_embd), K shape: (n_embd_k, n_embd), V shape: (n_embd_v, n_embd) + # concatenate to (n_embd_q + n_embd_k + n_embd_v, n_embd) + fused_data = torch.cat([q_data, k_data, v_data], dim=0) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid) + logger.info(f"Fused Q, K, V into QKV for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a Q/K/V tensor, wait for the others + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_Q, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_K, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.ATTN_V, bid): + return [] + return [(new_name, data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: @@ -12293,6 +12325,11 @@ def parse_args() -> argparse.Namespace: help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.", ) + parser.add_argument( + "--fuse-qkv", action="store_true", + help="Fuse separate Q, K, V weight tensors into a single QKV tensor.", + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -12431,7 +12468,8 @@ def main() -> None: small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, - fuse_gate_up_exps=args.fuse_gate_up_exps + fuse_gate_up_exps=args.fuse_gate_up_exps, + fuse_qkv=args.fuse_qkv ) if args.vocab_only: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bf617382d0..34c2fbb342 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1365,6 +1365,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, @@ -1702,6 +1703,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, @@ -1780,6 +1782,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q_NORM, MODEL_TENSOR.ATTN_K, diff --git a/scripts/fuse_qkv_gguf.py b/scripts/fuse_qkv_gguf.py new file mode 100644 index 0000000000..93d56278ec --- /dev/null +++ b/scripts/fuse_qkv_gguf.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Fuse Q/K/V tensors in an existing GGUF file into a single QKV tensor. + +This script operates at the binary level to preserve ALL metadata (including +tokenizer) byte-for-byte from the original file. + +Usage: + python scripts/fuse_qkv_gguf.py input.gguf output.gguf +""" +import sys, struct, os, re +import numpy as np + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'gguf-py')) +from gguf import GGUFReader + + +def align_offset(offset, alignment=32): + return (offset + alignment - 1) // alignment * alignment + + +def write_tensor_info(f, name, n_dims, dims, tensor_type, data_offset): + """Write one tensor info entry in GGUF format.""" + name_bytes = name.encode('utf-8') + f.write(struct.pack(' QKV[{fused_ne0},{fused_ne1}] {fused_data.nbytes} bytes") + + output_tensors.append((fused_name, 2, [fused_ne0, fused_ne1], + int(q.tensor_type), fused_data.tobytes())) + else: + dims = [int(x) for x in t.field.parts[3]] + n_dims = int(t.field.parts[2][0]) + output_tensors.append((t.name, n_dims, dims, + int(t.tensor_type), bytes(t.data))) + + n_tensors_new = len(output_tensors) + print(f"\n {n_tensors_orig} -> {n_tensors_new} tensors") + + with open(input_path, 'rb') as f: + f.seek(kv_data_start) + kv_data_bytes = f.read(kv_data_end - kv_data_start) + + print(f"\nWriting {output_path}...") + alignment = 32 + + with open(output_path, 'wb') as f: + f.write(magic) + f.write(struct.pack(' ti_section_end: + f.write(b'\x00' * (tensor_data_start - ti_section_end)) + + for i, (name, n_dims, dims, ttype, data) in enumerate(output_tensors): + current_pos = f.tell() - tensor_data_start + target_pos = data_offsets[i] + if target_pos > current_pos: + f.write(b'\x00' * (target_pos - current_pos)) + f.write(data) + + final_size = f.tell() + + print(f" Output size: {final_size / 1e9:.2f} GB") + print(" Done!") + + +if __name__ == '__main__': + main() diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 799d16167b..7d4c58bc3e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -552,6 +552,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_OUTPUT, LLM_TENSOR_ROPE_FREQS, LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -759,6 +760,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_OUTPUT_NORM, LLM_TENSOR_OUTPUT, LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_K, LLM_TENSOR_ATTN_V, @@ -960,6 +962,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_OUTPUT, LLM_TENSOR_CLS_OUT, LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, LLM_TENSOR_ATTN_Q, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e8e1bbf1cd..838d220294 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2708,6 +2708,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); } }; + + // helper: try merged QKV first, fall back to separate Q, K, V + auto create_tensor_qkv = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_embd_head_k_, int64_t n_head_, int64_t n_embd_k_gqa_, int64_t n_embd_v_gqa_) { + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", bid), {n_embd_, n_embd_head_k_ * n_head_ + n_embd_k_gqa_ + n_embd_v_gqa_}, TENSOR_NOT_REQUIRED); + if (layer.wqkv == nullptr) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", bid), {n_embd_, n_embd_head_k_ * n_head_}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", bid), {n_embd_, n_embd_k_gqa_}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", bid), {n_embd_, n_embd_v_gqa_}, 0); + } + }; switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -2733,9 +2743,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + // only LLaMA-family archs have fused QKV inference graph support + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_LLAMA_EMBED) { + create_tensor_qkv(layer, i, n_embd, n_embd_head_k, n_head, n_embd_k_gqa, n_embd_v_gqa); + } else { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); // optional bias tensors @@ -3556,12 +3571,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + // only Qwen2 arch has fused QKV inference graph support + if (arch == LLM_ARCH_QWEN2) { + create_tensor_qkv(layer, i, n_embd, n_embd_head_k, n_head, n_embd_k_gqa, n_embd_v_gqa); + } else { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + // optional bias tensors layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); @@ -3645,9 +3665,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + // only Qwen3 arch has fused QKV inference graph support + if (arch == LLM_ARCH_QWEN3) { + create_tensor_qkv(layer, i, n_embd, n_embd_head_k, n_head, n_embd_gqa, n_embd_gqa); + } else { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + } layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); diff --git a/src/models/llama.cpp b/src/models/llama.cpp index e08ae0c0b0..fd3918a0a1 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -43,27 +43,67 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_gra ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + // fused QKV path: one matmul, then split via views + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + const int64_t n_embd_q = n_embd_head * n_head; + const int64_t n_embd_kgq = n_embd_head * n_head_kv; + const size_t es = ggml_element_size(cur); + + if (model.layers[il].bq || model.layers[il].bk || model.layers[il].bv) { + // Models with bias: view_2d -> add bias -> reshape_3d + // (ggml_add produces contiguous output, enabling reshape) + ggml_tensor * Qcur_2d = ggml_view_2d(ctx0, cur, n_embd_q, n_tokens, cur->nb[1], 0); + ggml_tensor * Kcur_2d = ggml_view_2d(ctx0, cur, n_embd_kgq, n_tokens, cur->nb[1], es * n_embd_q); + ggml_tensor * Vcur_2d = ggml_view_2d(ctx0, cur, n_embd_kgq, n_tokens, cur->nb[1], es * (n_embd_q + n_embd_kgq)); + + Qcur_2d = model.layers[il].bq ? ggml_add(ctx0, Qcur_2d, model.layers[il].bq) : ggml_cont(ctx0, Qcur_2d); + Kcur_2d = model.layers[il].bk ? ggml_add(ctx0, Kcur_2d, model.layers[il].bk) : ggml_cont(ctx0, Kcur_2d); + Vcur_2d = model.layers[il].bv ? ggml_add(ctx0, Vcur_2d, model.layers[il].bv) : ggml_cont(ctx0, Vcur_2d); + + Qcur = ggml_reshape_3d(ctx0, Qcur_2d, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur_2d, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur_2d, n_embd_head, n_head_kv, n_tokens); + } else { + // Models without bias: view_3d directly (zero-copy, no ggml_cont needed) + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, es * n_embd_head, cur->nb[1], 0); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, es * n_embd_head, cur->nb[1], es * n_embd_q); + Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, es * n_embd_head, cur->nb[1], es * (n_embd_q + n_embd_kgq)); + } + cb(Qcur, "Qcur", il); - } - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - } - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); + } else { + // separate Q/K/V path + Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index 58c1062250..54da238054 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -30,30 +30,63 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para // self-attention { // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + // fused QKV path: one matmul, then split via views + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + const int64_t n_embd_q = n_embd_head * n_head; + const int64_t n_embd_kgq = n_embd_head * n_head_kv; + const size_t es = ggml_element_size(cur); + + // Models with bias: view_2d -> add bias -> reshape_3d + // (ggml_add produces contiguous output, enabling reshape) + ggml_tensor * Qcur_2d = ggml_view_2d(ctx0, cur, n_embd_q, n_tokens, cur->nb[1], 0); + ggml_tensor * Kcur_2d = ggml_view_2d(ctx0, cur, n_embd_kgq, n_tokens, cur->nb[1], es * n_embd_q); + ggml_tensor * Vcur_2d = ggml_view_2d(ctx0, cur, n_embd_kgq, n_tokens, cur->nb[1], es * (n_embd_q + n_embd_kgq)); + + Qcur_2d = model.layers[il].bq ? ggml_add(ctx0, Qcur_2d, model.layers[il].bq) : ggml_cont(ctx0, Qcur_2d); + Kcur_2d = model.layers[il].bk ? ggml_add(ctx0, Kcur_2d, model.layers[il].bk) : ggml_cont(ctx0, Kcur_2d); + Vcur_2d = model.layers[il].bv ? ggml_add(ctx0, Vcur_2d, model.layers[il].bv) : ggml_cont(ctx0, Vcur_2d); + + Qcur = ggml_reshape_3d(ctx0, Qcur_2d, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur_2d, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur_2d, n_embd_head, n_head_kv, n_tokens); + cb(Qcur, "Qcur", il); - } - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); - } - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - } + } else { + // separate Q/K/V path + Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + } Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 5208166847..74a5ceb1f4 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -30,18 +30,42 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para // self-attention { // compute Q and K and RoPE them - ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); - cb(Qcur, "Qcur", il); + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); - cb(Kcur, "Kcur", il); + if (model.layers[il].wqkv) { + // fused QKV path: one matmul, then split via views + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); - cb(Vcur, "Vcur", il); + const int64_t n_embd_q = n_embd_head * n_head; + const int64_t n_embd_kgq = n_embd_head * n_head_kv; + const size_t es = ggml_element_size(cur); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + // No bias in Qwen3: view_3d directly (zero-copy) + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, es * n_embd_head, cur->nb[1], 0); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, es * n_embd_head, cur->nb[1], es * n_embd_q); + Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, es * n_embd_head, cur->nb[1], es * (n_embd_q + n_embd_kgq)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + } else { + // separate Q/K/V path + Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s); + cb(Qcur, "Qcur", il); + + Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s); + cb(Kcur, "Kcur", il); + + Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + } Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); cb(Qcur, "Qcur_normed", il);