From 0a192937a14c28b17ee2cc81bb4875be40053d48 Mon Sep 17 00:00:00 2001
From: hauhaut
Date: Tue, 16 Dec 2025 02:08:50 +0100
Subject: [PATCH] qwen3next: trim comments

---
 ggml/src/ggml-cuda/delta-net.cu |  6 ++----
 src/models/qwen3next.cpp        | 35 ---------------------------------
 2 files changed, 2 insertions(+), 39 deletions(-)

diff --git a/ggml/src/ggml-cuda/delta-net.cu b/ggml/src/ggml-cuda/delta-net.cu
index 634780d5dc..acac648c75 100644
--- a/ggml/src/ggml-cuda/delta-net.cu
+++ b/ggml/src/ggml-cuda/delta-net.cu
@@ -1418,17 +1418,15 @@ static void delta_net_f32_cuda(
     // - Vectors (Q,K,V,KBeta,VBeta,KCumdecay,VPrime,VNew,Out): 9 × HEAD_DIM × sizeof(float) = 4608 bytes
     // - Warp scratch: 16 × sizeof(float) = 64 bytes
     // Total: 65536 + 4608 + 64 = 70208 bytes (~68.6KB)
-    // Note: __shared__ scalars (decay, beta, etc.) are static, not dynamic
+    // __shared__ scalars (decay, beta, etc.) are static, not dynamic
     constexpr size_t state_bytes        = 128 * 128 * sizeof(float); // 64KB
     constexpr size_t vector_bytes       = 9 * 128 * sizeof(float);   // 4.5KB
     constexpr size_t warp_scratch_bytes = 16 * sizeof(float);        // 64B
     constexpr size_t blackwell_smem_size = state_bytes + vector_bytes + warp_scratch_bytes;
 
-    // Sanity check: ensure we allocated enough
     static_assert(blackwell_smem_size == 70208, "Shared memory size mismatch");
 
-    // Check for A/B comparison mode
-    // Use a function-local static for thread-safe lazy initialization
+    // A/B comparison mode (set GGML_CUDA_DELTA_NET_AB=1)
     static const bool ab_mode = []() {
        const char* env = std::getenv("GGML_CUDA_DELTA_NET_AB");
        if (env != nullptr) {
diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp
index d441a204b5..af43256c6d 100644
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@@ -33,12 +33,9 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
-        // Determine layer type and build appropriate attention mechanism
         if (hparams.is_recurrent(il)) {
-            // Linear attention layer (gated delta net)
             cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
         } else {
-            // Full attention layer
             cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
         }
 
@@ -47,37 +44,28 @@
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
 
-        // Residual connection
         cur = ggml_add(ctx0, cur, inpSA);
         cb(cur, "attn_residual", il);
 
-        // Save the tensor before post-attention norm for residual connection
         ggml_tensor * ffn_residual = cur;
 
-        // Post-attention norm
         ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il);
         cb(attn_post_norm, "attn_post_norm", il);
 
-        // FFN layer (MoE or dense) - without residual connection
         cur = build_layer_ffn(attn_post_norm, il);
         cb(cur, "ffn_out", il);
 
-        // Residual connection for FFN - add to the tensor from before post_attention_layernorm
         cur = ggml_add(ctx0, cur, ffn_residual);
         cb(cur, "post_moe", il);
 
-        // Input for next layer
         inpL = cur;
     }
 
     cur = inpL;
 
-    // Final norm
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
-    cb(cur, "result_norm", -1);
 
     res->t_embd = cur;
 
-    // LM head
     cur = build_lora_mm(model.output, cur);
     cb(cur, "result_output", -1);
@@ -517,16 +505,11 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
     const int64_t n_embd_head = hparams.n_embd_head_v;
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
-    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention
-
-    // Qwen3Next uses a single Q projection that outputs query + gate
     ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur);
     cb(Qcur_full, "Qcur_full", il);
 
     Qcur_full = ggml_reshape_4d(ctx0, Qcur_full, n_embd_head * 2, n_head, n_tokens, 1);
 
-    // Split Q projection into query and gate
-    // The split should be along dimension 0 (the feature dimension)
     ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
                                       Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
     ggml_tensor * gate =
@@ -535,11 +518,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
     cb(Qcur, "Qcur", il);
     cb(gate, "gate", il);
 
-    // Now reshape Qcur to [n_embd_head, n_head, n_tokens] for multi-head attention
     Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
     cb(Qcur, "Qcur_reshaped", il);
 
-    // Apply Q normalization
     Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
     cb(Qcur, "Qcur_normed", il);
 
@@ -549,18 +530,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
     ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
     cb(Kcur, "Kcur", il);
     ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
     cb(Vcur, "Vcur", il);
 
-    // Apply K normalization
     Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
     Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
     cb(Kcur, "Kcur_normed", il);
 
-    // Reshape gate to [n_embd, n_tokens] for the sigmoid gating (flatten the heads)
     gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
     cb(gate, "gate_reshaped", il);
     Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-    // Apply RoPE
     Qcur = ggml_rope_ext(
         ctx0, Qcur, inp_pos, nullptr,
         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -575,7 +553,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
     cb(Kcur, "Kcur", il);
     cb(Vcur, "Vcur", il);
 
-    // Attention computation
     const float kq_scale = hparams.f_attention_scale == 0.0f ?
         1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
     cur = build_attn(inp,
@@ -861,9 +838,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
 }
 
 ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int il) {
-    // Check if this is an MoE layer
     if (model.layers[il].ffn_gate_inp != nullptr) {
-        // MoE branch
         ggml_tensor * moe_out = build_moe_ffn(cur,
                 model.layers[il].ffn_gate_inp,
                 model.layers[il].ffn_up_exps,
@@ -873,7 +848,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
                 true, false, 0.0,
                 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
         cb(moe_out, "ffn_moe_out", il);
 
-        // Add shared experts if present - following Qwen3Next reference implementation
         if (model.layers[il].ffn_up_shexp != nullptr) {
             ggml_tensor * ffn_shexp = build_ffn(cur,
@@ -884,23 +858,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
                 LLM_FFN_SILU, LLM_FFN_PAR, il);
             cb(ffn_shexp, "ffn_shexp", il);
 
-            // Apply shared expert gating as in the reference implementation
-            // The shared expert has its own gate that is sigmoided
-            // Note: ffn_gate_inp_shexp is the shared expert gate (outputs 1 value per token)
             ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
             cb(shared_gate, "shared_expert_gate", il);
 
-            // Apply sigmoid to the gate
             shared_gate = ggml_sigmoid(ctx0, shared_gate);
             cb(shared_gate, "shared_expert_gate_sigmoid", il);
 
-            // The gate needs to be broadcast to match the dimensions of ffn_shexp
-            // ffn_shexp is [n_embd, n_tokens, 1, 1] and shared_gate is [1, n_tokens, 1, 1]
-            // We need to repeat the gate along the feature dimension
             shared_gate = ggml_repeat(ctx0, shared_gate, ffn_shexp);
             cb(shared_gate, "shared_expert_gate_broadcast", il);
 
-            // Apply the gate to the shared expert output
             ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
             cb(ffn_shexp, "ffn_shexp_gated", il);
 
@@ -910,7 +876,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
             cur = moe_out;
         }
     } else {
-        // Dense FFN branch
         cur = build_ffn(cur,
                 model.layers[il].ffn_up, NULL, NULL,
                 model.layers[il].ffn_gate, NULL, NULL,
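
A couple of implementation details behind these hunks are worth spelling out. First, the 70208-byte budget pinned by the static_assert exceeds the 48 KB of dynamic shared memory that CUDA kernels get by default, so a kernel sized like this has to opt in per kernel via cudaFuncAttributeMaxDynamicSharedMemorySize before launch. Below is a minimal, self-contained sketch of that opt-in using the same byte arithmetic; the kernel is a hypothetical stand-in, not the real delta-net.cu kernel:

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical stand-in kernel: demonstrates only the dynamic
// shared-memory sizing, not the delta-net algorithm itself.
__global__ void smem_budget_sketch(float * dst) {
    extern __shared__ float smem[]; // dynamic shared memory window
    smem[threadIdx.x] = (float) threadIdx.x;
    __syncthreads();
    dst[threadIdx.x] = smem[threadIdx.x];
}

int main() {
    // Same arithmetic as the hunk: 65536 + 4608 + 64 = 70208 bytes.
    constexpr size_t state_bytes        = 128 * 128 * sizeof(float); // 64KB
    constexpr size_t vector_bytes       = 9 * 128 * sizeof(float);   // 4.5KB
    constexpr size_t warp_scratch_bytes = 16 * sizeof(float);        // 64B
    constexpr size_t smem_size = state_bytes + vector_bytes + warp_scratch_bytes;
    static_assert(smem_size == 70208, "shared memory size mismatch");

    // 70208 bytes > the 48KB default, so raise the per-kernel cap first.
    cudaFuncSetAttribute(smem_budget_sketch,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int) smem_size);

    float * dst = nullptr;
    cudaMalloc(&dst, 128 * sizeof(float));
    smem_budget_sketch<<<1, 128, smem_size>>>(dst);
    printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaDeviceSynchronize();
    cudaFree(dst);
    return 0;
}

The static_assert only fixes the arithmetic at compile time; whether a given device can actually grant 70208 bytes is a runtime question, which is why checking the launch result still matters.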
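Second, the "function-local static" wording trimmed from the delta-net.cu comment refers to C++11 magic statics: the initializer of a block-scope static runs exactly once even under concurrent callers, so the getenv lookup is both lazy and thread-safe. The lambda body is cut off in the hunk, so the sketch below is an assumption rather than the real code; the function name is invented, and the parsing (enabled when the variable is set to 1, matching the new comment) is guessed:

#include <cstdlib>

// Block-scope static: C++11 guarantees the lambda runs exactly once,
// even if several threads call this concurrently.
static bool delta_net_ab_mode() {
    static const bool ab_mode = []() {
        const char * env = std::getenv("GGML_CUDA_DELTA_NET_AB");
        // Assumed parsing: the hunk truncates the real body.
        return env != nullptr && env[0] == '1';
    }();
    return ab_mode;
}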
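On the qwen3next.cpp side, the shared-expert block that loses its comments computes one gating logit per token (ffn_gate_inp_shexp), squashes it with ggml_sigmoid, broadcasts it across the feature dimension with ggml_repeat, and multiplies it into the shared-expert output with ggml_mul. The same arithmetic in plain C++, with made-up values and independent of ggml:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 4, n_tokens = 2;
    // ffn_shexp: [n_embd, n_tokens] shared-expert output (values invented)
    std::vector<float> ffn_shexp = {
        0.5f, -1.0f,  2.0f, 0.25f, // token 0
        1.5f,  0.0f, -0.5f, 1.0f,  // token 1
    };
    // shared_gate: one logit per token, i.e. shape [1, n_tokens]
    const float gate_logit[n_tokens] = { 0.3f, -1.2f };

    for (int t = 0; t < n_tokens; ++t) {
        const float g = 1.0f / (1.0f + std::exp(-gate_logit[t])); // ggml_sigmoid
        for (int i = 0; i < n_embd; ++i) {
            // ggml_repeat + ggml_mul: the per-token gate scales every feature
            ffn_shexp[t * n_embd + i] *= g;
        }
    }
    printf("token 0, feature 0 after gating: %f\n", ffn_shexp[0]);
    return 0;
}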