From 34ba7b5a2f5cd88f99629a3bd68d003fbd5bc2cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Feb 2026 07:37:15 +0200 Subject: [PATCH 01/17] metal : fix event synchronization in cpy_tensor_async (#19402) --- ggml/src/ggml-metal/ggml-metal-context.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index c7e8ebd3f3..5d3a8ce412 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -394,7 +394,7 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con [encoder endEncoding]; ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src); - ggml_metal_event_record(ctx_src, ev_cpy); + ggml_metal_event_encode_signal(ev_cpy, cmd_buf); [cmd_buf commit]; From 8872ad2125336d209a9911a82101f80095a9831d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Feb 2026 10:35:56 +0200 Subject: [PATCH 02/17] metal : consolidate bin kernels (#19390) * metal : refactor bin kernels * cont * cont : fix cv --- ggml/src/ggml-metal/ggml-metal-device.cpp | 80 ++++- ggml/src/ggml-metal/ggml-metal-device.h | 6 +- ggml/src/ggml-metal/ggml-metal-device.m | 10 +- ggml/src/ggml-metal/ggml-metal-impl.h | 1 + ggml/src/ggml-metal/ggml-metal-ops.cpp | 33 +- ggml/src/ggml-metal/ggml-metal.metal | 389 ++++++++-------------- 6 files changed, 224 insertions(+), 295 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 6af0dd88d5..4c4c3ce36c 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1392,34 +1392,78 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v GGML_UNUSED(op); } -ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin( - ggml_metal_library_t lib, - ggml_op op, - int32_t n_fuse, - bool row) { +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) { char base[256]; char name[256]; - const char * op_str = "undefined"; - switch (op) { - case GGML_OP_ADD: op_str = "add"; break; - case GGML_OP_SUB: op_str = "sub"; break; - case GGML_OP_MUL: op_str = "mul"; break; - case GGML_OP_DIV: op_str = "div"; break; + int op_num = -1; + + switch (op->op) { + case GGML_OP_ADD: op_num = 0; break; + case GGML_OP_SUB: op_num = 1; break; + case GGML_OP_MUL: op_num = 2; break; + case GGML_OP_DIV: op_num = 3; break; default: GGML_ABORT("fatal error"); }; - if (row) { - snprintf(base, 256, "kernel_%s_row_c4_fuse_%d", op_str, n_fuse); - } else { - snprintf(base, 256, "kernel_%s_fuse_%d", op_str, n_fuse); - } + const char * t0_str = ggml_type_name(op->src[0]->type); + const char * t1_str = ggml_type_name(op->src[1]->type); + const char * t_str = ggml_type_name(op->type); - snprintf(name, 256, "%s", base); + const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0); + + const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536; + + snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? 
"_4" : ""); + snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); if (!res.pipeline) { - res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0); + ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1); + ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.c4 = is_c4; + res.cnt = is_rb; + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one(ggml_metal_library_t lib, ggml_op op) { + char base[256]; + char name[256]; + + int op_num = -1; + + switch (op) { + case GGML_OP_ADD: op_num = 0; break; + case GGML_OP_SUB: op_num = 1; break; + case GGML_OP_MUL: op_num = 2; break; + case GGML_OP_DIV: op_num = 3; break; + default: GGML_ABORT("fatal error"); + }; + + snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s", "f32", "f32", "f32"); + snprintf(name, 256, "%s_op=%d_nf=%d", base, op_num, 1); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0); + ggml_metal_cv_set_int16(cv, 1, FC_BIN + 1); + ggml_metal_cv_set_bool (cv, false, FC_BIN + 2); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); } return res; diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 84dcec3083..93d7f6a216 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -53,6 +53,9 @@ struct ggml_metal_pipeline_with_params { int nr1; size_t smem; + + bool c4; + bool cnt; }; int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline); @@ -134,7 +137,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op); -struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse ); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one (ggml_metal_library_t lib, enum ggml_op op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index c8e737d418..891d70c85a 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ 
b/ggml/src/ggml-metal/ggml-metal-device.m @@ -346,10 +346,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta struct ggml_metal_pipeline_with_params res = { /*.pipeline =*/ nil, + /*.nsg =*/ 0, /*.nr0 =*/ 0, /*.nr1 =*/ 0, - /*.nsg =*/ 0, /*.smem =*/ 0, + /*.c4 =*/ false, + /*.cnt =*/ false, }; res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name); @@ -362,10 +364,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) { struct ggml_metal_pipeline_with_params res = { /*.pipeline =*/ nil, + /*.nsg =*/ 0, /*.nr0 =*/ 0, /*.nr1 =*/ 0, - /*.nsg =*/ 0, /*.smem =*/ 0, + /*.c4 =*/ false, + /*.cnt =*/ false, }; [lib->lock lock]; @@ -1054,7 +1058,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_ADD_ID: - return op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ACC: case GGML_OP_REPEAT: case GGML_OP_SCALE: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 7f73cb97bb..77bb403c15 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -80,6 +80,7 @@ #define FC_SSM_CONV 900 #define FC_SOLVE_TRI 1000 #define FC_COUNT_EQUAL 1100 +#define FC_BIN 1200 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPSG 8 diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e0ed6c7805..dbf25433c2 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -707,7 +707,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { /*.o1 =*/ { 0 }, }; - auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false); + auto pipeline = ggml_metal_library_get_pipeline_bin_one(lib, GGML_OP_ADD); ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); @@ -2895,8 +2895,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { GGML_ASSERT(ggml_is_contiguous_rows(op->src[0])); GGML_ASSERT(ggml_is_contiguous_rows(op->src[1])); - bool bcast_row = false; - ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]); ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]); ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op); @@ -2990,18 +2988,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { struct ggml_metal_pipeline_with_params pipeline; - if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(op->src[0])); - - // src1 is a row - GGML_ASSERT(ne11 == 1); - - pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true); - - bcast_row = true; - } else { - pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false); - } + pipeline = ggml_metal_library_get_pipeline_bin(lib, op, n_fuse); if (n_fuse > 1) { bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1)); @@ -3015,20 +3002,28 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { } } + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne10 = ne10/4; + args.ne0 = ne0/4; + } + ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, 
&args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, bid_src0, 1); ggml_metal_encoder_set_buffer (enc, bid_src1, 2); ggml_metal_encoder_set_buffer (enc, bid_dst, 3); - if (bcast_row) { - const int64_t n = ggml_nelements(op)/4; + if (pipeline.cnt) { + const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op); ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { - int nth = 32; + const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + int nth = 1; + + while (2*nth < args.ne0 && nth < nth_max) { nth *= 2; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 612a42a1ea..35cc3bbdfd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -895,11 +895,13 @@ enum ggml_sort_order { GGML_SORT_ORDER_DESC, }; -// general-purpose kernel for addition, subtraction, multiplication and division of two tensors -// pros: works for non-contiguous tensors, supports broadcast across all dims -// cons: not very efficient -template -kernel void kernel_add_fuse_impl( +// OP: 0 - add, 1 - sub, 2 - mul, 3 - div +constant short FC_bin_op [[function_constant(FC_BIN + 0)]]; +constant short FC_bin_f [[function_constant(FC_BIN + 1)]]; +constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]]; + +template +kernel void kernel_bin_fuse_impl( constant ggml_metal_kargs_bin & args, device const char * src0, device const char * src1, @@ -907,138 +909,152 @@ kernel void kernel_add_fuse_impl( uint3 tgpig[[threadgroup_position_in_grid]], ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; +#define FC_OP FC_bin_op +#define FC_F FC_bin_f +#define FC_RB FC_bin_rb - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; + if (FC_RB) { + // row broadcast + const uint i0 = tgpig.x; + const uint i1 = i0%args.ne10; - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); - device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + device const T0 * src0_row = (device const T0 *) (src0); + device T * dst_row = (device T *) (dst); - device const float * src1_ptr[F]; - for (short j = 0; j < F; ++j) { - src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); - } + if (FC_F == 1) { + device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; + if (FC_OP == 0) { + dst_row[i0] = src0_row[i0] + src1_row[i1]; + } - float res = src0_ptr[i0]; + if (FC_OP == 1) { + dst_row[i0] = src0_row[i0] - src1_row[i1]; + } -#pragma unroll - for (short j = 0; j < F; ++j) { - res += src1_ptr[j][i10]; - } + if (FC_OP == 2) { + dst_row[i0] = src0_row[i0] * src1_row[i1]; + } - dst_ptr[i0] = res; - } -} + if (FC_OP == 3) { + dst_row[i0] = src0_row[i0] / src1_row[i1]; + } + } else { + T0 res = src0_row[i0]; -typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t; + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } -template [[host_name("kernel_add_fuse_1")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>; -template 
[[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>; -template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>; -template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>; -template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>; -template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>; -template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>; -template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>; + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } -kernel void kernel_sub_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10)); - } -} - -kernel void kernel_mul_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; - - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; - - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - if (args.ne10 == 1) { - const float x = *((device float *)(src1_ptr)); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; + dst_row[i0] = res; } } else { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10)); + const int i03 = tgpig.z; + const int i02 = tgpig.y; + const int i01 = tgpig.x; + + if (i01 >= args.ne01) { + return; + } + + const int i13 = i03%args.ne13; + const int i12 = i02%args.ne12; + const int i11 = i01%args.ne11; + + device const T0 * src0_ptr = (device 
const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); + device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + + if (FC_F == 1) { + device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + + if (FC_OP == 0) { + dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10]; + } + + if (FC_OP == 1) { + dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10]; + } + + if (FC_OP == 2) { + dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10]; + } + + if (FC_OP == 3) { + dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10]; + } + } + } else { + device const T1 * src1_ptr[8]; + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + } + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + + T res = src0_ptr[i0]; + + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += src1_ptr[j][i10]; + } + } + + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= src1_ptr[j][i10]; + } + } + + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= src1_ptr[j][i10]; + } + } + + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= src1_ptr[j][i10]; + } + } + + dst_ptr[i0] = res; + } } } + +#undef FC_OP +#undef FC_F +#undef FC_RB } -kernel void kernel_div_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; +typedef decltype(kernel_bin_fuse_impl) kernel_bin_fuse_t; - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; - - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - if (args.ne10 == 1) { - const float x = 1.0f / *((device float *)(src1_ptr)); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; - } - } else { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10)); - } - } -} +template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; +template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; kernel void kernel_add_id( constant ggml_metal_kargs_add_id & args, @@ -1057,7 +1073,7 @@ kernel void kernel_add_id( const size_t nb1 = args.ne0 * sizeof(float); const size_t nb2 = args.ne1 * nb1; - device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); + device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02); device const float * src1_row = (device const float *)((device char *)src1 + 
i11*args.nb11); @@ -1098,141 +1114,6 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; -// assumption: src1 is a row -// broadcast src1 into src0 -template -kernel void kernel_add_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res += ((device const float4 *) (src1 + args.o1[j]))[i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t; - -template [[host_name("kernel_add_row_c4_fuse_1")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>; -template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>; -template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>; -template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>; -template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>; -template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>; -template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>; -template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>; - -template -kernel void kernel_sub_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res -= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t; - -template [[host_name("kernel_sub_row_c4_fuse_1")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>; - -template -kernel void kernel_mul_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res *= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t; 
- -template [[host_name("kernel_mul_row_c4_fuse_1")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>; - -template -kernel void kernel_div_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res /= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t; - -template [[host_name("kernel_div_row_c4_fuse_1")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>; - kernel void kernel_scale_f32( constant ggml_metal_kargs_scale & args, device const float * src0, From 96441c955ea45cfa2b6834cad419b849aa144463 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Feb 2026 00:50:47 +0200 Subject: [PATCH 03/17] ci : use -j param correctly when building with sanitizers (#19411) * ci : use less jobs when building with sanitizers * cont : fix nproc * cont : fix the fix * cont : simplify --- .github/workflows/build.yml | 2 ++ .github/workflows/server.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 51a3dc76e9..6c7ab71143 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -295,6 +295,7 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - name: Build (no OpenMP) @@ -307,6 +308,7 @@ jobs: -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=OFF + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - name: Test diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 3d342c35f7..f44a9e739c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -81,7 +81,7 @@ jobs: -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} - cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - name: Python setup id: setup_python From 9a5f57795c01c6e67a53eeedeae67ed63aaf7f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 8 Feb 2026 01:20:00 +0100 Subject: [PATCH 04/17] ci : remove server job from webui and move slow test (#19424) * remove server job from webui and move slow test * use pip-install option --- .github/workflows/server-webui.yml | 120 ----------------------------- .github/workflows/server.yml | 20 ++--- 2 files changed, 10 insertions(+), 130 deletions(-) diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index 6d1b617371..94899c9376 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -8,10 +8,6 @@ on: description: 'Commit SHA1 
to build' required: false type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean push: branches: - master @@ -101,119 +97,3 @@ jobs: if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/server/webui - - server-build: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v6 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - - name: Setup Node.js for WebUI - uses: actions/setup-node@v6 - with: - node-version: "22" - cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" - - - name: Install WebUI dependencies - run: npm ci - working-directory: tools/server/webui - - - name: Build WebUI - run: npm run build - working-directory: tools/server/webui - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd tools/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd tools/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - SLOW_TESTS=1 ./tests.sh diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f44a9e739c..99d05226ba 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -88,11 +88,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r 
tools/server/tests/requirements.txt + pip-install: -r tools/server/tests/requirements.txt - name: Tests id: server_integration_tests @@ -102,6 +98,14 @@ jobs: export ${{ matrix.extra_args }} pytest -v -x -m "not slow" + - name: Slow tests + id: server_integration_tests_slow + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + run: | + cd tools/server/tests + export ${{ matrix.extra_args }} + SLOW_TESTS=1 pytest -v -x + server-windows: runs-on: windows-2022 @@ -124,11 +128,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt + pip-install: -r tools/server/tests/requirements.txt - name: Tests id: server_integration_tests From 5999b50eb00972732b69c519121dda1361f56eb3 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Feb 2026 01:22:38 -0600 Subject: [PATCH 05/17] llama-quantize : cleanup `--help` output (#19317) * cleanup `llama-quantize --help` output some much needed TLC * remove future argument oops, spoiler * cleanup of cleanup --- tools/quantize/quantize.cpp | 59 +++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0709e0bda0..c0f49279ee 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); - printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); - printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); - printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); - printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); - printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); - printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); - printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); - printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); - printf(" --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n"); - printf(" Advanced option to selectively quantize a long list of tensors. 
Format to be tensor_name=ggml_type, separated by spaces/newline.\n"); - printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); - printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --keep-split: will generate quantized model in the same shards as input\n"); + printf(" --allow-requantize\n"); + printf(" allow requantizing tensors that have already been quantized\n"); + printf(" WARNING: this can severely reduce quality compared to quantizing\n"); + printf(" from 16bit or 32bit!\n"); + printf(" --leave-output-tensor\n"); + printf(" leave output.weight un(re)quantized\n"); + printf(" increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure\n"); + printf(" disable k-quant mixtures and quantize all tensors to the same type\n"); + printf(" --imatrix file_name\n"); + printf(" use data in file_name as importance matrix for quant optimizations\n"); + printf(" --include-weights tensor_name\n"); + printf(" use importance matrix for this/these tensor(s)\n"); + printf(" --exclude-weights tensor_name\n"); + printf(" do not use importance matrix for this/these tensor(s)\n"); + printf(" --output-tensor-type ggml_type\n"); + printf(" use this ggml_type for the output.weight tensor\n"); + printf(" --token-embedding-type ggml_type\n"); + printf(" use this ggml_type for the token embeddings tensor\n"); + printf(" --tensor-type tensor_name=ggml_type\n"); + printf(" quantize this tensor to this ggml_type\n"); + printf(" this is an advanced option to selectively quantize tensors. may be specified multiple times.\n"); + printf(" example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type-file tensor_types.txt\n"); + printf(" list of tensors to quantize to a specific ggml_type\n"); + printf(" this is an advanced option to selectively quantize a long list of tensors.\n"); + printf(" the file should use the same format as above, separated by spaces or newlines.\n"); + printf(" --prune-layers L0,L1,L2...\n"); + printf(" comma-separated list of layer numbers to prune from the model\n"); + printf(" WARNING: this is an advanced option, use with care.\n"); + printf(" --keep-split\n"); + printf(" generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); - printf("Note: --include-weights and --exclude-weights cannot be used together\n"); - printf("\nAllowed quantization types:\n"); + printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); + printf(" WARNING: this is an advanced option, use with care.\n\n"); + printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); + printf("-----------------------------------------------------------------------------\n"); + printf(" allowed quantization types\n"); + printf("-----------------------------------------------------------------------------\n\n"); for (const auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { printf(" %2d or ", it.ftype); From eb449cdfa4319d8fd9066e4633b49f4c867dd11c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Feb 2026 09:40:04 +0200 Subject: [PATCH 06/17] server : improve context checkpoint logic (#19408) --- tools/server/server-context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b71d496eeb..8ec8451339 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2507,7 +2507,8 @@ private: slot.n_prompt_tokens_processed++; // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. - if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { + const int n_last = std::min(n_batch, 512); + if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) { break; } } From 5fa1c190d9fc86c02698b730a2cb933195e19d96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sun, 8 Feb 2026 09:06:45 +0100 Subject: [PATCH 07/17] rpc : update from common.cpp (#19400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- tools/rpc/rpc-server.cpp | 64 ++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/tools/rpc/rpc-server.cpp b/tools/rpc/rpc-server.cpp index 58b93c7468..521f79622d 100644 --- a/tools/rpc/rpc-server.cpp +++ b/tools/rpc/rpc-server.cpp @@ -1,12 +1,7 @@ -#if defined(_MSC_VER) -#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING -#endif - #include "ggml-rpc.h" #ifdef _WIN32 # define NOMINMAX # define DIRECTORY_SEPARATOR '\\' -# include # include # include # include @@ -15,23 +10,43 @@ # include # include #endif -#include #include #include #include -#include #include #include #include -namespace fs = std::filesystem; +#if defined(__linux__) +#include +#include +#endif + +// NOTE: this is copied from common.cpp to avoid linking with libcommon +#ifdef _WIN32 +static std::wstring utf8_to_wstring(const std::string & str) { + if (str.empty()) { + return std::wstring(); + } + + int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0); + + if (size <= 0) { + return std::wstring(); + } + + std::wstring wstr(size, 0); + MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size); + + return wstr; +} +#endif // NOTE: this is copied from common.cpp to avoid linking with libcommon // returns true if successful, false otherwise static bool fs_create_directory_with_parents(const std::string & path) { #ifdef _WIN32 - std::wstring_convert> converter; - std::wstring wpath = converter.from_bytes(path); + std::wstring wpath = utf8_to_wstring(path); // if the path already exists, check whether it's a directory const DWORD attributes = GetFileAttributesW(wpath.c_str()); @@ -44,9 +59,16 @@ static bool fs_create_directory_with_parents(const std::string & path) { // process path from front to back, procedurally creating 
directories while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { const std::wstring subpath = wpath.substr(0, pos_slash); - const wchar_t * test = subpath.c_str(); - const bool success = CreateDirectoryW(test, NULL); + pos_slash += 1; + + // skip the drive letter, in some systems it can return an access denied error + if (subpath.length() == 2 && subpath[1] == ':') { + continue; + } + + const bool success = CreateDirectoryW(subpath.c_str(), NULL); + if (!success) { const DWORD error = GetLastError(); @@ -60,8 +82,6 @@ static bool fs_create_directory_with_parents(const std::string & path) { return false; } } - - pos_slash += 1; } return true; @@ -115,13 +135,27 @@ static std::string fs_get_cache_directory() { #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); - } else { + } else if (std::getenv("HOME")) { cache_directory = std::getenv("HOME") + std::string("/.cache/"); + } else { +#if defined(__linux__) + /* no $HOME is defined, fallback to getpwuid */ + struct passwd *pw = getpwuid(getuid()); + if ((!pw) || (!pw->pw_dir)) { + throw std::runtime_error("Failed to find $HOME directory"); + } + + cache_directory = std::string(pw->pw_dir) + std::string("/.cache/"); +#else /* defined(__linux__) */ + throw std::runtime_error("Failed to find $HOME directory"); +#endif /* defined(__linux__) */ } #elif defined(__APPLE__) cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); #elif defined(_WIN32) cache_directory = std::getenv("LOCALAPPDATA"); +#elif defined(__EMSCRIPTEN__) + GGML_ABORT("not implemented on this platform"); #else # error Unknown architecture #endif From e06088da0fa86aa444409f38dff274904931c507 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Sun, 8 Feb 2026 14:12:51 +0100 Subject: [PATCH 08/17] CUDA: Fix non-contig rope (#19338) * Rename variables + fix rope_neox Seems memory layout is shared with Vulkan so we can port fix from https://github.com/ggml-org/llama.cpp/pull/19299 * Fix rope_multi * Fix rope_vision * Fix rope_norm * Rename ne* to ne0* for consistent variable naming * cont : consistent stride names --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cuda/rope.cu | 364 +++++++++++++++++++++++-------------- 1 file changed, 232 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 88ed79111a..45a49a5dc2 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -43,10 +43,15 @@ static __device__ void rope_yarn( template static __global__ void rope_norm(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int32_t * pos, const float freq_scale, @@ -59,23 +64,23 @@ static __global__ void rope_norm(const T * x, const int set_rows_stride) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; - - int idst = row_dst * ne0 + i0; - const int ix = channel_x*s2 + row_x*s1 + i0; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; + int idst = i0 + i1 * s1 + i2 * s2 + i3 * s3; + const 
int ix = i0 + i1 * s01 + i2 * s02 + i3 * s03; // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. if (set_rows_stride != 0) { - idst = row_x * ne0 + i0; - idst += row_indices[channel_x] * set_rows_stride; + idst = i1 * s1 + i0; + idst += row_indices[i2] * set_rows_stride; } const auto & store_coaelsced = [&](float x0, float x1) { @@ -92,7 +97,7 @@ static __global__ void rope_norm(const T * x, return; } - const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -110,10 +115,15 @@ static __global__ void rope_norm(const T * x, template static __global__ void rope_neox(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int32_t * pos, const float freq_scale, @@ -126,23 +136,24 @@ static __global__ void rope_neox(const T * x, const int set_rows_stride) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - int idst = row_dst * ne0 + i0 / 2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. if (set_rows_stride != 0) { - idst = row_x * ne0 + i0 / 2; - idst += row_indices[channel_x] * set_rows_stride; + idst = i1 * s1 + i0 / 2; + idst += row_indices[i2] * set_rows_stride; } if (i0 >= n_dims) { @@ -152,7 +163,7 @@ static __global__ void rope_neox(const T * x, return; } - const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; @@ -168,24 +179,42 @@ static __global__ void rope_neox(const T * x, dst[idst + n_dims / 2] = ggml_cuda_cast(x0 * sin_theta + x1 * cos_theta); } -template -static __global__ void rope_multi( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, - const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) { - const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); +template +static __global__ void rope_multi(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int32_t * pos, + const float freq_scale, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float theta_scale, + const float * freq_factors, + const mrope_sections sections, + const bool is_imrope) { + const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - const int idst = row_dst*ne0 + i0/2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; if (i0 >= n_dims) { dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; @@ -200,27 +229,24 @@ static __global__ void rope_multi( float theta_base = 0.0; if (is_imrope) { - if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h - theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); - } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w - theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); - } else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h + theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f); + } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w + theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f); + } else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t + theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f); } else { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); + theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f); } } else { if (sector < sections.v[0]) { - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sections.v[0] && sector < sec_w) { - theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sec_w && sector < sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); + theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sections.v[0] && sector < sec_w) { + theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + theta_base = pos[i2 + ne02 
* 2] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sec_w + sections.v[2]) { + theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f); } } @@ -238,37 +264,53 @@ static __global__ void rope_multi( dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; } -template -static __global__ void rope_vision( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, - const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, - const float theta_scale, const float * freq_factors, const mrope_sections sections) { +template +static __global__ void rope_vision(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int32_t * pos, + const float freq_scale, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float theta_scale, + const float * freq_factors, + const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - const int idst = row_dst*ne0 + i0/2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; const int sect_dims = sections.v[0] + sections.v[1]; - const int sec_w = sections.v[1] + sections.v[0]; - const int sector = (i0 / 2) % sect_dims; + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; float theta_base = 0.0; if (sector < sections.v[0]) { const int p = sector; - theta_base = pos[channel_x]*powf(theta_scale, p); - } - else if (sector >= sections.v[0] && sector < sec_w) { + theta_base = pos[i2] * powf(theta_scale, p); + } else if (sector >= sections.v[0] && sector < sec_w) { const int p = sector - sections.v[0]; - theta_base = pos[channel_x + ne2]*powf(theta_scale, p); + theta_base = pos[i2 + ne02] * powf(theta_scale, p); } const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; @@ -288,10 +330,15 @@ static __global__ void rope_vision( template static void rope_norm_cuda(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int nr, const int32_t * pos, @@ -304,31 +351,36 @@ static void rope_norm_cuda(const T * x, const int64_t * row_indices, const int set_rows_stride, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_norm<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { rope_norm<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } } template static void rope_neox_cuda(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int nr, const int32_t * pos, @@ -341,55 +393,92 @@ static void rope_neox_cuda(const T * x, const int64_t * row_indices, const int set_rows_stride, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_neox<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { rope_neox<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } } -template -static void rope_multi_cuda( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * pos, const float freq_scale, const float freq_base, const 
float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); +template +static void rope_multi_cuda(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int nr, + const int32_t * pos, + const float freq_scale, + const float freq_base, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float * freq_factors, + const mrope_sections sections, + const bool is_imrope, + cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_multi<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } else { rope_multi<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } } -template -static void rope_vision_cuda( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); +template +static void rope_vision_cuda(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int nr, + const int32_t * pos, + const float freq_scale, + const float freq_base, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float * freq_factors, + const mrope_sections sections, + cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq) // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE); @@ -398,11 +487,11 @@ static void rope_vision_cuda( if (freq_factors == nullptr) { rope_vision<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections); } else { rope_vision<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, 
n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections); } } @@ -445,6 +534,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, const size_t s01 = src0->nb[1] / ggml_type_size(src0->type); const size_t s02 = src0->nb[2] / ggml_type_size(src0->type); + const size_t s03 = src0->nb[3] / ggml_type_size(src0->type); + + const size_t s1 = dst->nb[1] / ggml_type_size(dst->type); + const size_t s2 = dst->nb[2] / ggml_type_size(dst->type); + const size_t s3 = dst->nb[3] / ggml_type_size(dst->type); //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -495,57 +589,63 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, // compute if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { - rope_neox_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) { - rope_neox_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { - rope_neox_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, - pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else { GGML_ABORT("fatal error"); } } else if (is_mrope && !is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_multi_cuda( - (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream); + rope_multi_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, is_imrope, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_multi_cuda( - (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream); + rope_multi_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, is_imrope, stream); } else { GGML_ABORT("fatal error"); } } else if (is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_vision_cuda( - (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, 
freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); + rope_vision_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_vision_cuda( - (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); + rope_vision_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, stream); } else { GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { - rope_norm_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) { - rope_norm_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { - rope_norm_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, - pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else { GGML_ABORT("fatal error"); } From 39bf692af1cba2a1072e4a42425611bf1ec2807d Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Mon, 9 Feb 2026 00:24:08 +0100 Subject: [PATCH 09/17] [Model] Qwen3.5 dense and MoE support (no vision) (#19435) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Unified delta net handling * Remove old methods. * Refactor and optimize * Adapt autoregressive version from @ymcki * Change to decay mask approach * Fix bad permute * Qwen 3.5 support * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * Further fixes * Use inheritance, remove unneeded conts * Not like this! * Remove ggml.h explicit import * Remove transformers, fix the views * ACTUALLY fix views, make super calls explicit in conversion. 
* Fix conversion again * Remove extra ggml.h imports --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 78 +++-- gguf-py/gguf/constants.py | 59 ++++ gguf-py/gguf/tensor_mapping.py | 6 +- src/CMakeLists.txt | 3 + src/llama-arch.cpp | 61 ++++ src/llama-arch.h | 2 + src/llama-context.cpp | 2 +- src/llama-model.cpp | 154 ++++++++ src/models/delta.cpp | 618 +++++++++++++++++++++++++++++++++ src/models/kimi-linear.cpp | 1 - src/models/models.h | 102 +++++- src/models/qwen3-5.cpp | 421 ++++++++++++++++++++++ src/models/qwen3-5moe.cpp | 52 +++ src/models/qwen3next.cpp | 372 +------------------- 14 files changed, 1532 insertions(+), 399 deletions(-) create mode 100644 src/models/delta.cpp create mode 100644 src/models/qwen3-5.cpp create mode 100644 src/models/qwen3-5moe.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 843c00a896..e64756a74a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4102,39 +4102,27 @@ class Qwen2MoeModel(TextModel): # process the experts separately name = name.replace("language_model.", "") # InternVL - # handle aggregated expert tensors - # GGUF stores dimensions reversed from PyTorch, so: - # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} - # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) - # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down + # handle pre-packed expert tensors (e.g. Qwen3.5 MoE, Qwen3Next) + # HF stores these using nn.Linear convention: [n_expert, out_features, in_features] + # This matches the individual expert stacking path below (which stacks + # per-expert [out, in] weights into [n_expert, out, in]), so no permute is needed. if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): mapped = f"{name}.weight" if not name.endswith(".weight") else name - # Input: (n_expert=128, n_ff_exp=768, n_embd=2048) - # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128} - # Need PyTorch: (128, 2048, 768) [reversed of GGML] - # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) - permuted = data_torch.permute(0, 2, 1).contiguous() - yield from super().modify_tensors(permuted, mapped, bid) + # HF: [n_expert, n_embd, n_ff] → GGML: {n_ff, n_embd, n_expert} ✓ + yield from super().modify_tensors(data_torch, mapped, bid) return if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - split_dim = data_torch.shape[-1] // 2 - gate = data_torch[..., :split_dim].contiguous() - up = data_torch[..., split_dim:].contiguous() - # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) - # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} - # Need PyTorch: (128, 768, 2048) [reversed of GGML] - # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) - base_name = name.removesuffix(".weight") - base = base_name.rsplit('.', 1)[0] - mapped_gate = f"{base}.gate_proj.weight" - mapped_up = f"{base}.up_proj.weight" - perm_gate = gate.permute(0, 2, 1).contiguous() - perm_up = up.permute(0, 2, 1).contiguous() - yield from super().modify_tensors(perm_gate, mapped_gate, bid) - yield from super().modify_tensors(perm_up, mapped_up, bid) + # HF: [n_expert, 2*n_ff, n_embd] → split on dim=1 + n_ff = data_torch.shape[1] // 2 + gate = data_torch[:, :n_ff, :].contiguous() + up = 
data_torch[:, n_ff:, :].contiguous() + # gate/up: [n_expert, n_ff, n_embd] → GGML: {n_embd, n_ff, n_expert} ✓ + base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") + mapped_gate = f"{base_name}.gate_proj.weight" + mapped_up = f"{base_name}.up_proj.weight" + yield from super().modify_tensors(gate, mapped_gate, bid) + yield from super().modify_tensors(up, mapped_up, bid) return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): @@ -4344,6 +4332,40 @@ class Qwen3NextModel(Qwen2MoeModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Qwen3_5ForCausalLM", "Qwen3_5TextForCausalLM") +class Qwen3_5Model(Qwen3NextModel): + model_arch = gguf.MODEL_ARCH.QWEN3_5 + + # Stores whichever of in_proj_a/in_proj_b is seen first, keyed by layer + _pending_ba: dict[int | None, tuple[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle split in_proj_b + in_proj_a → concatenated SSM_BETA_ALPHA + # safetensors sorts alphabetically so in_proj_a arrives before in_proj_b + if "in_proj_a.weight" in name or "in_proj_b.weight" in name: + which = "a" if "in_proj_a" in name else "b" + if bid not in self._pending_ba: + self._pending_ba[bid] = (which, data_torch) + return + prev_which, prev_tensor = self._pending_ba.pop(bid) + assert prev_which != which, f"duplicate in_proj_{which} for layer {bid}" + b_tensor = prev_tensor if prev_which == "b" else data_torch + a_tensor = prev_tensor if prev_which == "a" else data_torch + ba_combined = torch.cat([b_tensor, a_tensor], dim=0) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.SSM_BETA_ALPHA, bid, ".weight"), ba_combined) + return + else: + # Qwen3Next uses .qkvz tensor, so we use the super to get the other functionalities + # (norm correction, A_log to A etc.) 
for free + # Qwen2Moe already does the gate_up conversion properly, just use that + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3_5MoeForCausalLM", "Qwen3_5MoeTextForCausalLM") +class Qwen3_5MoeModel(Qwen3_5Model): + model_arch = gguf.MODEL_ARCH.QWEN3_5_MOE + + @ModelBase.register("RND1") class RND1Model(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.RND1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3af4fffe95..8a3fab1e1c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -382,6 +382,8 @@ class MODEL_ARCH(IntEnum): QWEN3 = auto() QWEN3MOE = auto() QWEN3NEXT = auto() + QWEN3_5 = auto() + QWEN3_5_MOE = auto() QWEN3VL = auto() QWEN3VLMOE = auto() PHI2 = auto() @@ -812,6 +814,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN3: "qwen3", MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.QWEN3NEXT: "qwen3next", + MODEL_ARCH.QWEN3_5: "qwen3_5", + MODEL_ARCH.QWEN3_5_MOE: "qwen3_5moe", MODEL_ARCH.QWEN3VL: "qwen3vl", MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", MODEL_ARCH.PHI2: "phi2", @@ -1784,6 +1788,61 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_BETA_ALPHA, MODEL_TENSOR.SSM_OUT ], + MODEL_ARCH.QWEN3_5: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_GATE, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_BETA_ALPHA, + MODEL_TENSOR.SSM_OUT, + ], + MODEL_ARCH.QWEN3_5_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_GATE, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_INP_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_BETA_ALPHA, + MODEL_TENSOR.SSM_OUT, + ], MODEL_ARCH.QWEN3VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 167ade7803..43f32c7b52 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -228,6 +228,7 @@ class TensorNameMap: "transformer_encoder.{bid}.qkv", # neobert "layers.{bid}.attn.Wqkv", # modern-bert "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm + "model.layers.{bid}.linear_attn.in_proj_qkv", # qwen3.5 ), # Attention query @@ -358,8 +359,9 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_GATE: ( - "model.layers.{bid}.self_attn.gate_proj", # afmoe - "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate + "model.layers.{bid}.self_attn.gate_proj", # afmoe + "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate + "model.layers.{bid}.linear_attn.in_proj_z", # qwen3.5 ), # Feed-forward norm 
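
A minimal sketch of the layout reasoning behind the gate_up_proj split in the convert_hf_to_gguf.py hunk above (toy shapes, illustrative only, not part of the patch):

    import torch

    n_expert, n_ff, n_embd = 4, 8, 16
    gate_up = torch.randn(n_expert, 2 * n_ff, n_embd)   # HF packed layout [n_expert, 2*n_ff, n_embd]
    gate = gate_up[:, :n_ff, :].contiguous()             # [n_expert, n_ff, n_embd]
    up   = gate_up[:, n_ff:, :].contiguous()             # [n_expert, n_ff, n_embd]
    # gguf writes PyTorch dims in reverse, so GGML reads ne = {n_embd, n_ff, n_expert},
    # which already matches FFN_GATE_EXPS / FFN_UP_EXPS, so no permute is needed.
    assert gate.shape == up.shape == (n_expert, n_ff, n_embd)
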
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2115fc4255..0c164617a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,7 @@ add_library(llama models/deci.cpp models/deepseek.cpp models/deepseek2.cpp + models/delta.cpp models/dots1.cpp models/dream.cpp models/ernie4-5-moe.cpp @@ -122,6 +123,8 @@ add_library(llama models/qwen3vl-moe.cpp models/qwen3moe.cpp models/qwen3next.cpp + models/qwen3-5.cpp + models/qwen3-5moe.cpp models/refact.cpp models/rnd1.cpp models/rwkv6-base.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd78f1e556..fce46772d7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -35,6 +35,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, + { LLM_ARCH_QWEN3_5, "qwen3_5" }, + { LLM_ARCH_QWEN3_5_MOE, "qwen3_5moe" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_PHI2, "phi2" }, @@ -985,6 +987,63 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, }; + case LLM_ARCH_QWEN3_5: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_SSM_A_NOSCAN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_BETA_ALPHA, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; + case LLM_ARCH_QWEN3_5_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_SSM_A_NOSCAN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_BETA_ALPHA, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; case LLM_ARCH_QWEN3VL: case LLM_ARCH_CHAMELEON: case LLM_ARCH_HUNYUAN_DENSE: @@ -2674,6 +2733,8 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_KIMI_LINEAR: return true; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index e8263369b8..a392ecce2b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -39,6 +39,8 @@ enum llm_arch { LLM_ARCH_QWEN3, LLM_ARCH_QWEN3MOE, LLM_ARCH_QWEN3NEXT, + LLM_ARCH_QWEN3_5, + LLM_ARCH_QWEN3_5_MOE, LLM_ARCH_QWEN3VL, LLM_ARCH_QWEN3VLMOE, LLM_ARCH_PHI2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a6df893a31..80b9a7d46a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2013,7 +2013,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN3_5 || model.arch == 
LLM_ARCH_QWEN3_5_MOE || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 674d06c891..8fc61aee37 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2412,6 +2412,25 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // Load linear attention (gated delta net) parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + // Mark recurrent layers (linear attention layers) + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); + } + } break; case LLM_ARCH_MISTRAL3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -7094,6 +7113,129 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); + } + } break; + case LLM_ARCH_QWEN3_5: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + + const int64_t ba_dim = n_v_heads * 2; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + if (!hparams.is_recurrent(i)) { + // Full attention layers + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { 
n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + } else { + // Linear attention (gated delta net) specific tensors + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + } + + // Dense FFN for all layers + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + } + } break; + case LLM_ARCH_QWEN3_5_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; + + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + + const int64_t ba_dim = n_v_heads * 2; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + if (!hparams.is_recurrent(i)) { + // Full attention layers + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + } else { + // Linear attention (gated delta net) specific tensors + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + } + + // MoE FFN + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); @@ -7545,6 +7687,8 @@ void llama_model::print_info() const { arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_QWEN3NEXT || + arch == LLM_ARCH_QWEN3_5 || + arch == LLM_ARCH_QWEN3_5_MOE || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); @@ -8343,6 +8487,14 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_QWEN3_5: + { + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_QWEN3_5_MOE: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_MISTRAL3: { llm = std::make_unique(*this, params); @@ -8603,6 +8755,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: return LLAMA_ROPE_TYPE_NEOX; diff --git a/src/models/delta.cpp b/src/models/delta.cpp new file mode 100644 index 0000000000..d1d9837d09 --- /dev/null +++ b/src/models/delta.cpp @@ -0,0 +1,618 @@ +#include "models.h" +#include "ggml.h" +#include +#include +#include + +llm_graph_context_delta::llm_graph_context_delta(const llm_graph_params & params) : llm_graph_context_mamba(params) {} + +/** + * Unified Delta Net implementation supporting both GDA and KDA modes. + * + * GDA (Gated Delta Attention): g has shape [H, T, B] in GGML (PyTorch: [B, T, H]) + * - Per-head gating, broadcasts over K dimension + * + * KDA (Key-wise Delta Attention): g has shape [K, H, T, B] in GGML (PyTorch: [B, T, H, K]) + * - Per-key gating + * + * The mode is auto-detected based on g's dimensionality. + * + * Tensor dimension convention: + * GGML: ne[0] is innermost (fastest varying), ne[3] is outermost + * PyTorch: dim 0 is outermost, dim -1 is innermost + * So GGML [A, B, C, D] corresponds to PyTorch [D, C, B, A] + */ + +// Helper to get a slice along dimension 2 (n_chunks dimension) +static ggml_tensor * get_slice_2d(ggml_context * ctx, ggml_tensor * t, int64_t chunk) { + return ggml_view_4d(ctx, t, + t->ne[0], t->ne[1], 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], + chunk * t->nb[2]); +} + +/** + * Unified chunked Delta Net implementation. 
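+ *
+ * Informal sketch of the per-token recurrence being evaluated here (see the
+ * autoregressive variant below for the direct single-token form):
+ *
+ *   S_t = exp(g_t) * S_{t-1} + beta_t * (v_t - S_{t-1} k_t) k_t^T
+ *   o_t = S_t q_t
+ *
+ * where exp(g_t) is a per-head scalar for GDA and a per-key vector for KDA.
+ * The chunked path computes the same recurrence blockwise via the WY
+ * representation and a triangular solve instead of looping token by token.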
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] + * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] + * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] + * @param g Gate tensor: + * GDA: [H_v, n_tokens, n_seqs] + * KDA: [S_k, H_v, n_tokens, n_seqs] + * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + * @param causal_mask Lower triangular mask [chunk_size, chunk_size] + * @param identity Identity matrix [chunk_size, chunk_size] + * @param diag_mask Diagonal mask [chunk_size, chunk_size] + * @param il Layer index (for debugging callbacks) + * @param chunk_size Chunk size for chunked processing + * @param eps_norm Epsilon for L2 normalization + * + * @return Pair of (output_tokens, new_state) + */ +std::pair llm_graph_context_delta::build_delta_net_unified_chunking( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state_reshaped, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + // Detect KDA vs GDA based on g's shape + // GDA: g has shape [H_v, n_tokens, n_seqs] + // KDA: g has shape [S_k, H_v, n_tokens, n_seqs] (4D with ne[0]=S_k) + const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); + + // Validate tensor shapes + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(state_reshaped->ne[0] == S_v && state_reshaped->ne[1] == S_v && state_reshaped->ne[2] == H_v && state_reshaped->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(H_k == H_v); + + if (is_kda) { + // KDA: g shape [S_k, H_v, n_tokens, n_seqs] + GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs); + } else { + // GDA: g shape [H_v, n_tokens, n_seqs] + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + } + + // L2 normalize q and k + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf((float)S_v); + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + // Permute tensors to working format [S, n_tokens, H, n_seqs] + // Input: [S, H, n_tokens, n_seqs] -> permute(0, 2, 1, 3) -> [S, n_tokens, H, n_seqs] + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + if (is_kda) { + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + } else { + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); + } + beta = ggml_cont(ctx0, 
ggml_permute(ctx0, beta, 2, 0, 1, 3)); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(g, "g_perm", il); + cb(state_reshaped, "state_in", il); + + // Padding for chunk processing + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + g = ggml_pad(ctx0, g, pad, 0, 0, 0); + + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(g, "g_pad", il); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); + + // Reshape to chunks + q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); + k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); + k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); + v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); + v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); + + // Reshape g for chunks + ggml_tensor * g_cumsum; + ggml_tensor * g_cumsum_t; + if (is_kda) { + // KDA: g [S_k, n_tokens+pad, H_k, n_seqs] -> [S_k, chunk_size, n_chunks, H_k * n_seqs] + g = ggml_reshape_4d(ctx0, g, S_k, chunk_size, n_chunks, H_k * n_seqs); + // Cumsum along chunk_size dimension (ne[1]) + // GGML cumsum operates on ne[0], so we need to transpose, cumsum, transpose back + g = ggml_cont(ctx0, ggml_transpose(ctx0, g)); // [chunk_size, S_k, n_chunks, H_k * n_seqs] + g_cumsum_t = ggml_cumsum(ctx0, g); + g_cumsum = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum_t)); // [S_k, chunk_size, n_chunks, H_k * n_seqs] + } else { + // GDA: g [n_tokens+pad, 1, H_k, n_seqs] -> [chunk_size, 1, n_chunks, H_k * n_seqs] + g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); + g_cumsum = ggml_cumsum(ctx0, g); + g_cumsum_t = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_k * n_seqs); + } + + cb(g_cumsum, "g_cumsum", il); + + // Build attention matrix A for the WY representation solve + // For GDA: A[j,i] = sum_k(k[j,k] * exp(g[j] - g[i]) * k[i,k]) = (k @ k^T) * exp(g[j] - g[i]) + // For KDA: A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + // KDA uses decay mask with S_k packed into batch to compute exp(g[j,k] - g[i,k]) per-key + + ggml_tensor * k_decay; + ggml_tensor * decay_mask = nullptr; + ggml_tensor * g_exp_pos = nullptr; + + if (is_kda) { + // KDA: Use decay mask with S_k in leading dimension for efficient mul_mat reduction + // A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + // By putting S_k in dim 0, mul_mat implicitly sums over it + + const int64_t CHB = n_chunks * H_k * n_seqs; + + // g_cumsum_t is [chunk_size, S_k, n_chunks, H_k * n_seqs] + // Reshape to [chunk_size, S_k, CHB] then build decay mask + ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); + ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, gcs, 1, chunk_size, S_k, CHB); + + // Build decay mask: [chunk_size, chunk_size, S_k, CHB] + ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); + decay_mask = ggml_sub(ctx0, gcs_j_bc, 
gcs_i); + + cb(decay_mask, "decay_mask_kda", il); + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] for mul_mat reduction over S_k + decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + // Reshape k and k_beta for broadcasting with decay_mask + // k_i: indexed at position i (dim 2 of decay_mask) + // k_beta_j: indexed at position j (dim 1 of decay_mask) + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + ggml_tensor * k_beta_j = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, 1, CHB); + + // decay_k_beta_j[s,j,i,b] = decay[s,j,i,b] * k_beta[s,j,b] + ggml_tensor * decay_k_beta_j = ggml_mul(ctx0, decay_mask, k_beta_j); + + // mul_mat sums over S_k: result[j,1,i,CHB] = sum_s decay_k_beta_j[s,j,i,b] * k_i[s,1,i,b] + k_decay = ggml_mul_mat(ctx0, decay_k_beta_j, k_i); + k_decay = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, k_decay, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); + + // g_exp_pos is still needed for later (kbeta_gexp, etc.) + g_exp_pos = ggml_exp(ctx0, g_cumsum); + } else { + // GDA: Use decay mask approach (g broadcasts over K dimension) + // g_cumsum [chunk_size, 1, n_chunks, H_v * n_seqs] + ggml_tensor * gcs_i = g_cumsum; + ggml_tensor * gcs_j = g_cumsum_t; + g_exp_pos = ggml_exp(ctx0, g_cumsum_t); + ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); + decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); + + cb(decay_mask, "decay_mask", il); + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); + k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); + } + + ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); + + cb(attn, "attn_pre_solve", il); + + // Solve triangular system: (I + L) @ X = I, where L is strictly lower triangular + ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); + attn = ggml_mul(ctx0, lin_solve, causal_mask); + attn = ggml_add(ctx0, attn, identity); + + cb(attn, "attn_solved", il); + + // Compute u = A @ v and w = A @ (g.exp() * k) + v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); + + ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, g_exp_pos); + cb(kbeta_gexp, "kbeta_gexp", il); + + ggml_tensor * k_cumdecay = ggml_cont(ctx0, ggml_transpose(ctx0, + ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); + cb(k_cumdecay, "k_cumdecay", il); + + // Attention scores q @ k^T with decay + // For GDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j] - g[i]) * k[i,k]) + // For KDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + ggml_tensor * attn_kq; + if (is_kda) { + // KDA: Same approach as k_decay - use decay_mask with S_k in leading dim + const int64_t CHB = n_chunks * H_k * n_seqs; + + // Rebuild decay mask (same structure as k_decay) + ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); + ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, 
gcs, 1, chunk_size, S_k, CHB); + ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); + ggml_tensor * decay_mask_kq = ggml_sub(ctx0, gcs_j_bc, gcs_i); + + decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); + decay_mask_kq = ggml_exp(ctx0, decay_mask_kq); + decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); + + // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] + decay_mask_kq = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask_kq, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + // q_j: indexed at position j, k_i: indexed at position i + ggml_tensor * q_j = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + + // decay_q_j[s,j,i,b] = decay[s,j,i,b] * q[s,j,b] + ggml_tensor * decay_q_j = ggml_mul(ctx0, decay_mask_kq, q_j); + + // mul_mat sums over S_k + attn_kq = ggml_mul_mat(ctx0, decay_q_j, k_i); + attn_kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, attn_kq, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); + } else { + // GDA: Use decay mask + attn_kq = ggml_mul_mat(ctx0, k, q); + attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); + attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); + } + cb(attn_kq, "attn_kq", il); + + // Compute g_last and g_diff for state updates + ggml_tensor * g_last; + ggml_tensor * g_diff_exp; + ggml_tensor * g_last_exp; + + if (is_kda) { + // KDA: g_cumsum [S_k, chunk_size, n_chunks, H_k * n_seqs] + // Get last element along chunk_size dimension (ne[1]) + g_last = ggml_view_4d(ctx0, g_cumsum, + g_cumsum->ne[0], 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[1] - 1) * g_cumsum->nb[1]); + g_last = ggml_cont(ctx0, g_last); + g_last_exp = ggml_exp(ctx0, g_last); + + // g_diff = g_last - g_cumsum + ggml_tensor * g_last_broadcast = ggml_repeat_4d(ctx0, g_last, + g_cumsum->ne[0], g_cumsum->ne[1], g_cumsum->ne[2], g_cumsum->ne[3]); + ggml_tensor * g_diff = ggml_sub(ctx0, g_last_broadcast, g_cumsum); + g_diff_exp = ggml_exp(ctx0, g_diff); + } else { + // GDA: g_cumsum [chunk_size, 1, n_chunks, H_k * n_seqs] + g_last = ggml_view_4d(ctx0, g_cumsum, + 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); + g_last = ggml_cont(ctx0, g_last); + g_last_exp = ggml_exp(ctx0, g_last); + + ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); + g_diff_exp = ggml_exp(ctx0, g_diff); + } + + cb(g_last, "g_last", il); + cb(g_last_exp, "g_last_exp", il); + + ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp); + cb(key_gdiff, "key_gdiff", il); + + // Process chunks + ggml_tensor * new_state = state_reshaped; + ggml_tensor * core_attn_out = nullptr; + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); + ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); + ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); + ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); + ggml_tensor * gexp_chunk = get_slice_2d(ctx0, g_exp_pos, chunk); + + cb(attn_chunk, "attn_chunk", il); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), + S_v, S_v, 1, H_v * n_seqs); + + // v_prime = k_cumdecay @ state + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + cb(v_prime, "v_prime_chunk", il); + + // v_new = v - v_prime + ggml_tensor * v_new = ggml_sub(ctx0, 
ggml_repeat(ctx0, v_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + cb(v_new, "v_new_chunk", il); + + // attn_inter = (q * g.exp()) @ state + ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); + cb(attn_inter, "attn_inter_chunk", il); + + // output = attn_inter + attn @ v_new + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); + cb(v_attn, "v_attn_chunk", il); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + cb(core_attn_out_chunk, "core_attn_out_chunk", il); + + core_attn_out = core_attn_out == nullptr + ? core_attn_out_chunk + : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); + + // State update: state = state * g_last_exp + key_gdiff^T @ v_new + ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk)); + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff))); + + ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); + + if (is_kda) { + // KDA: g_last_exp [S_k, 1, n_chunks, H_k * n_seqs] + // State: [S_v, S_v, H_v, n_seqs] + // Need to reshape g_last_exp to broadcast correctly over V dimension only + gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, + 1, gexp_last_chunk->ne[0], H_v, n_seqs); // [1, S_k, H_v, n_seqs] + // Transpose to [S_k, 1, H_v, n_seqs] then broadcast + gexp_last_chunk = ggml_cont(ctx0, ggml_permute(ctx0, gexp_last_chunk, 1, 0, 2, 3)); + } else { + // GDA: g_last_exp [1, 1, n_chunks, H_k * n_seqs] + // Broadcasts over both K and V dimensions + gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, + gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs); + } + + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, gexp_last_chunk), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + // Truncate padding and permute back + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + + cb(output_tokens, "output_tokens", il); + + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); + + return {output_tokens, new_state}; +} + + +/** + * Unified autoregressive Delta Net implementation (single token processing). + * + * This implementation uses matrix multiplication instead of elementwise operations + summation, + * which is more efficient and mathematically equivalent. See inline comments for equivalences. 
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, 1, n_seqs] + * @param k Key tensor [S_k, H_k, 1, n_seqs] + * @param v Value tensor [S_v, H_v, 1, n_seqs] + * @param g Gate tensor: + * GDA: [H_v, 1, n_seqs] + * KDA: [S_k, H_v, 1, n_seqs] + * @param beta Beta tensor [H_v, 1, 1, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + * @param il Layer index (for debugging callbacks) + * @param eps_norm Epsilon for L2 normalization + * + * @return Pair of (output_tokens, new_state) + */ +std::pair llm_graph_context_delta::build_delta_net_unified_autoregressive( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(n_tokens == 1); // Autoregressive mode is for single token + + // Detect KDA vs GDA based on g's shape + // GDA: g has shape [H_v, 1, n_seqs] or [H_v, n_tokens, n_seqs] + // KDA: g has shape [S_k, H_v, 1, n_seqs] or [S_k, H_v, n_tokens, n_seqs] + const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); + + // Validate shapes + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(H_k == H_v); + + if (is_kda) { + GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v); + } else { + GGML_ASSERT(g->ne[0] == H_v); + } + + // L2 normalize q and k + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf((float)S_v); + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + // Reshape g and beta for broadcasting + ggml_tensor * g_t; + ggml_tensor * beta_t; + + if (is_kda) { + // KDA: g [S_k, H_v, 1, n_seqs] -> [S_k, 1, H_k, n_seqs] + // For state multiplication, need [1, S_k, H_v, n_seqs] to broadcast over V only + g_t = ggml_reshape_4d(ctx0, g, S_k, 1, H_k, n_seqs); + } else { + // GDA: g [H_v, 1, n_seqs] -> [1, 1, H_k, n_seqs] + // For state multiplication, broadcasts over both K and V + g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); + } + + beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); + + // Apply exponential to g_t + g_t = ggml_exp(ctx0, g_t); + + // State decay: state = state * exp(g) + if (is_kda) { + // KDA: g_t [S_k, 1, H_k, n_seqs], state [S_v, S_v, H_v, n_seqs] + // Need to broadcast g_t over V dimension (ne[0] of state) + // Permute g_t to [1, S_k, H_k, n_seqs] for correct broadcasting + ggml_tensor * g_broadcast = ggml_cont(ctx0, ggml_permute(ctx0, g_t, 1, 0, 2, 3)); + state = ggml_mul(ctx0, state, g_broadcast); + } else { + // GDA: g_t [1, 1, H_k, n_seqs] broadcasts over both dimensions + state = ggml_mul(ctx0, state, g_t); + } + + // Equivalence to previous 
version: + // Previous: kv_mem = sum_k(state * k) using elementwise mult + sum_rows + // Current: k_state = state_t @ k_t using matrix multiplication + // These are equivalent because: sum_k(A * B) = A @ B when dimensions align + ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * k_t = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); + ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k_t); + + // v_diff = v - k_state (equivalent to v - kv_mem in previous version) + ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + ggml_tensor * v_diff = ggml_sub(ctx0, v_t, k_state); + ggml_tensor * k_beta = ggml_mul(ctx0, k_t, beta_t); + + // Equivalence to previous version: + // Previous: state += k.unsqueeze(-1) * delta where delta = (v - kv_mem) * beta + // Current: state += v_diff^T @ k_beta^T using matrix multiplication + // These are equivalent because: outer_product(k, v_diff * beta) = v_diff^T @ k^T + state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); + + // Equivalence to previous version: + // Previous: core_attn_out = sum_k(state * q) using elementwise mult + sum_rows + // Current: core_attn_out = state_t @ q using matrix multiplication + // These are equivalent because: sum_k(A * B) = A @ B when dimensions align + q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); + state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); + cb(state, "new_state", il); + + return {core_attn_out, state}; +} + + +/** + * Main entry point that dispatches to chunked or autoregressive based on n_tokens. 
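+ *
+ * Callers build causal_mask / identity / diag_mask once per graph (see
+ * llm_build_qwen3_5::build_graph, which fills CHUNK_SIZE x CHUNK_SIZE tensors
+ * with ggml_tri / ggml_diag) and reuse them across layers; they are only
+ * consumed by the chunked path.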
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] + * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] + * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] + * @param g Gate tensor (GDA: [H_v, n_tokens, n_seqs], KDA: [S_k, H_v, n_tokens, n_seqs]) + * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + */ +std::pair llm_graph_context_delta::build_delta_net_unified( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t n_tokens = q->ne[2]; + + if (n_tokens == 1) { + return build_delta_net_unified_autoregressive( + ctx0, q, k, v, g, beta, state, il, eps_norm); + } + return build_delta_net_unified_chunking( + ctx0, q, k, v, g, beta, state, causal_mask, identity, diag_mask, + il, chunk_size, eps_norm); +} diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 0f037d1a39..d9ee698075 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,5 +1,4 @@ #include "models.h" -#include "ggml.h" #define CHUNK_SIZE 64 diff --git a/src/models/models.h b/src/models/models.h index cfcbb9aaa5..2a750c168e 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -17,6 +17,53 @@ struct llm_graph_context_mamba : public llm_graph_context { }; +struct llm_graph_context_delta : public llm_graph_context_mamba { + llm_graph_context_delta(const llm_graph_params & params); + + virtual ~llm_graph_context_delta() = default; + + std::pair build_delta_net_unified_chunking( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm); + + std::pair build_delta_net_unified_autoregressive( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il, + float eps_norm); + + std::pair build_delta_net_unified( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm); +}; + // Base class for RWKV-related models struct llm_build_rwkv6_base : public llm_graph_context { const llama_model & model; @@ -476,7 +523,7 @@ struct llm_build_qwen3vl : public llm_graph_context { struct llm_build_qwen3vlmoe : public llm_graph_context { llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params); }; -struct llm_build_qwen3next : public llm_graph_context_mamba { +struct llm_build_qwen3next : public llm_graph_context_delta { llm_build_qwen3next(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( @@ -534,6 +581,59 @@ private: const llama_model & model; }; +struct llm_build_qwen3_5 : public llm_graph_context_delta { + llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params); + +protected: + // Tag type for subclass constructors that need to call build_graph() 
themselves + // (to ensure virtual dispatch works correctly) + struct defer_graph_build_t {}; + + llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t); + + void build_graph(); + + virtual ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + int il); + + const llama_model & model; + +private: + ggml_tensor * build_layer_attn( + llm_graph_input_attn_kv * inp_attn, + ggml_tensor * cur, + ggml_tensor * inp_pos, + int il); + + ggml_tensor * build_layer_attn_linear( + llm_graph_input_rs * inp, + ggml_tensor * cur, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il); + + ggml_tensor * build_norm_gated( + ggml_tensor * input, + ggml_tensor * weights, + ggml_tensor * gate, + int layer); + + std::pair build_qkvz( + ggml_tensor * input, + int il); +}; + +struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 { + llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params); + +protected: + ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + int il) override; +}; + struct llm_build_qwen : public llm_graph_context { llm_build_qwen(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/qwen3-5.cpp b/src/models/qwen3-5.cpp new file mode 100644 index 0000000000..0947299d73 --- /dev/null +++ b/src/models/qwen3-5.cpp @@ -0,0 +1,421 @@ +#include "models.h" + +#define CHUNK_SIZE 64 + +llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params) : + llm_graph_context_delta(params), model(model) { + build_graph(); +} + +// virtual call in constructor fix +llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t /*tag*/) : + llm_graph_context_delta(params), model(model) { +} + +void llm_build_qwen3_5::build_graph() { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "model.embed_tokens", -1); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * causal_mask = + ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), + GGML_TRI_TYPE_LOWER); + + ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); + + ggml_build_forward_expand(gf, causal_mask); + ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, diag_mask); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + if (hparams.is_recurrent(il)) { + cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il); + } else { + cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "attn_residual", il); + + ggml_tensor * ffn_residual = cur; + + ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(attn_post_norm, "attn_post_norm", il); + + cur = build_layer_ffn(attn_post_norm, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "post_ffn", il); + + 
inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +ggml_tensor * llm_build_qwen3_5::build_norm_gated( + ggml_tensor * input, + ggml_tensor * weights, + ggml_tensor * gate, + int layer) { + ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); + ggml_tensor * gated_silu = ggml_silu(ctx0, gate); + + return ggml_mul(ctx0, normalized, gated_silu); +} + +ggml_tensor * llm_build_qwen3_5::build_layer_attn( + llm_graph_input_attn_kv * inp, + ggml_tensor * cur, + ggml_tensor * inp_pos, + int il) { + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] + cb(Qcur_full, "Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); + cb(Qcur, "Qcur_reshaped", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "gate_reshaped", il); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp, + nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_pregate", il); + + ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); + cb(gate_sigmoid, "gate_sigmoid", il); + + cur = ggml_mul(ctx0, cur, gate_sigmoid); + cb(cur, "attn_gated", il); + + cur = build_lora_mm(model.layers[il].wo, cur); + cb(cur, "attn_output", il); + + return cur; +} + +std::pair llm_build_qwen3_5::build_qkvz( + ggml_tensor * input, + int il) { + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t num_k_heads = hparams.ssm_n_group; + const int64_t num_v_heads = hparams.ssm_dt_rank; + const int64_t head_v_dim = d_inner / num_v_heads; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + if (model.layers[il].wqkv) { + ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); + qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); + cb(qkv_mixed, "linear_attn_qkv_mixed", il); + + ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); + cb(z, "z", il); + + return { qkv_mixed, z }; + + } + // legacy path for combined in_proj_qkvz + ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input); + cb(mixed_qkvz, "linear_attn_mixed_qkvz", il); + + int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); + ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); + + int64_t split_sizes_qkvz[4] = { + head_k_dim, + head_k_dim, + head_v_dim * num_v_heads / num_k_heads, + head_v_dim * num_v_heads / num_k_heads + }; + + ggml_tensor * query = + ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0); + cb(query, "q", il); + + ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped)); + cb(key, "k", il); + + ggml_tensor * value = + ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped)); + cb(value, "v", il); + + ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped)); + z = ggml_cont(ctx0, z); + cb(z, "z", il); + + ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); + cb(query_flat, "query_flat", il); + + ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); + cb(key_flat, "key_flat", il); + + ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); + cb(value_flat, "value_flat", il); + + ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0); + qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 
0); + cb(qkv_mixed, "qkv_mixed", il); + + return { qkv_mixed, z }; +} + +ggml_tensor * llm_build_qwen3_5::build_layer_attn_linear( + llm_graph_input_rs * inp, + ggml_tensor * cur, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il) { + const auto * mctx_cur = inp->mctx; + + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t num_k_heads = hparams.ssm_n_group; + const int64_t num_v_heads = hparams.ssm_dt_rank; + const int64_t head_v_dim = d_inner / num_v_heads; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + const auto kv_head = mctx_cur->get_head(); + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + auto qkvz = build_qkvz(cur, il); + ggml_tensor * qkv_mixed = qkvz.first; + ggml_tensor * z = qkvz.second; + + ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); + cb(mixed_ba, "linear_attn_mixed_ba", il); + + int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; + ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); + + int64_t split_sizes_ba[2] = { + num_v_heads / num_k_heads, + num_v_heads / num_k_heads + }; + + ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs, + mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0); + cb(b, "b", il); + + ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs, + mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], + split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); + cb(a, "a", il); + + ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); + + ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); + + ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); + ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); + cb(alpha_softplus, "a_softplus", il); + ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); + cb(gate, "gate", il); + + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); + cb(conv_states, "conv_states", il); + + ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; + const int64_t conv_kernel_size = conv_kernel->ne[0]; + const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; + conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); + cb(conv_states, "conv_states_reshaped", il); + + qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3); + cb(qkv_mixed, "qkv_mixed_permuted", il); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); + cb(conv_input, "conv_input", il); + + ggml_tensor * last_conv_states = + ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], + conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); + cb(last_conv_states, "last_conv_states", il); + + ggml_tensor * state_update_target = + ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs, + kv_head * (conv_kernel_size - 1) * conv_channels * 
ggml_element_size(conv_states_all)); + cb(state_update_target, "state_update_target", il); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + cb(conv_states_all, "conv_states_updated", il); + + ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); + cb(conv_output_proper, "conv_output_raw", il); + + ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); + cb(conv_output_silu, "conv_output_silu", il); + + ggml_tensor * conv_qkv_mix = conv_output_silu; + + int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; + int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); + + ggml_tensor * q_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); + cb(q_conv, "q_conv", il); + ggml_tensor * k_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, + head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); + cb(k_conv, "k_conv", il); + ggml_tensor * v_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, + 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); + cb(v_conv, "v_conv", il); + + q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); + k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); + v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); + + ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); + cb(state, "state_predelta", il); + + if (num_k_heads != num_v_heads) { + GGML_ASSERT(num_v_heads % num_k_heads == 0); + int64_t repeat_factor = num_v_heads / num_k_heads; + + ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); + ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); + + ggml_tensor * q_repeated = + ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); + ggml_tensor * k_repeated = + ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); + + q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); + k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); + } + + cb(q_conv, "q_conv_predelta", il); + cb(k_conv, "k_conv_predelta", il); + cb(v_conv, "v_conv_predelta", il); + + std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, + gate, beta, state, causal_mask, identity, diag_mask, + il, CHUNK_SIZE, hparams.f_norm_rms_eps); + + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, + ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); + + ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); + + ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); + + 
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); + cb(final_output, "final_output", il); + + cur = build_lora_mm(model.layers[il].ssm_out, final_output); + cb(cur, "linear_attn_out", il); + + cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); + return cur; +} + +ggml_tensor * llm_build_qwen3_5::build_layer_ffn(ggml_tensor * cur, const int il) { + // Qwen3.5 Dense always uses dense FFN + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + return cur; +} diff --git a/src/models/qwen3-5moe.cpp b/src/models/qwen3-5moe.cpp new file mode 100644 index 0000000000..a488443218 --- /dev/null +++ b/src/models/qwen3-5moe.cpp @@ -0,0 +1,52 @@ +#include "models.h" + +llm_build_qwen3_5_moe::llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params) : + llm_build_qwen3_5(model, params, defer_graph_build_t{}) { + build_graph(); +} + +ggml_tensor * llm_build_qwen3_5_moe::build_layer_ffn(ggml_tensor * cur, const int il) { + // Check if this is an MoE layer + if (model.layers[il].ffn_gate_inp != nullptr) { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, LLM_FFN_SILU, + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(moe_out, "ffn_moe_out", il); + + // Add shared experts if present + if (model.layers[il].ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = + build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + // Apply shared expert gating (sigmoid) + ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); + cb(shared_gate, "shared_expert_gate", il); + + shared_gate = ggml_sigmoid(ctx0, shared_gate); + cb(shared_gate, "shared_expert_gate_sigmoid", il); + + ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); + cb(ffn_shexp, "ffn_shexp_gated", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } else { + // Dense FFN branch (fallback) + cur = llm_build_qwen3_5::build_layer_ffn(cur, il); + } + return cur; +} diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 99b1a76a48..0335f5ab76 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -1,10 +1,9 @@ -#include "ggml.h" #include "models.h" #define CHUNK_SIZE 64 llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_mamba(params), model(model) { + llm_graph_context_delta(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; @@ -86,362 +85,6 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_build_forward_expand(gf, cur); } -// utility to get one slice from the third dimension -// input dim: [x, y, c, b] -// output dim: [x, y, 1, b] -static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) { - return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], - t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c); -} - -std::pair llm_build_qwen3next::build_delta_net_chunking( - ggml_tensor 
* q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il) { - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); - - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf(S_v); - - q = ggml_scale(ctx0, q, scale); - - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); - - beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(g, "g_perm", il); - cb(state, "state_in", il); - - GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); - GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); - - // Do padding - const int64_t chunk_size = CHUNK_SIZE; - - const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; - const int64_t n_chunks = (n_tokens + pad) / chunk_size; - - q = ggml_pad(ctx0, q, 0, pad, 0, 0); - k = ggml_pad(ctx0, k, 0, pad, 0, 0); - v = ggml_pad(ctx0, v, 0, pad, 0, 0); - g = ggml_pad(ctx0, g, pad, 0, 0, 0); - beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); - - cb(q, "q_pad", il); - cb(k, "k_pad", il); - cb(v, "v_pad", il); - cb(beta, "beta_pad", il); - cb(g, "g_pad", il); - - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - - cb(v_beta, "v_beta", il); - cb(k_beta, "k_beta", il); - - q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); - k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); - k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); - v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); - v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - - g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * 
n_seqs); - beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); - - ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); - cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); - - ggml_tensor * gcs_j_broadcast = - ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); - - ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); - cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); - - ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); - cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, causal_mask); - attn = ggml_add(ctx0, attn, identity); - cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); - - ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum)); - ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); - - ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); - cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * k_cumdecay = - ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); - cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); - attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); - attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); - cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - - // vectorized calculation of key_gdiff - // improved from the chunked version: - // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1) - // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp() - // key_gdiff = key * g_diff.unsqueeze(-1) - // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew - - // get last element in g_cumsum along chunk_size dimension (ne0) - // example: [[x, y, z, ..., last], ...] -> [[last], ...] 
- ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); - g_last = ggml_cont(ctx0, g_last); - cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last); - cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); - cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); - ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, - 1, chunk_size, n_chunks, g_diff_exp->ne[3]); - - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); - cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); - cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) - - - // state to be updated per chunk - ggml_tensor * new_state = state; // ggml_dup(ctx0, state); - cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs) - - // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs) - ggml_tensor * core_attn_out = nullptr; - - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { - // shape: (S_k, chunk_size, 1, H_k * n_seqs) - ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul - - // shape: (S_v, chunk_size, 1, H_v * n_seqs) - ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat - - // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul - - // shape: (chunk_size, 1, H_v * n_seqs) - ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat - - // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) - // replaced by precomputed attn_kq - ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); - cb(attn_chunk, "attn_chunk", il); - - ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); - - // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); - cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs) - - // v_new = v_i - v_prime - ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); - cb(v_new, "v_new_chunk", il); - - // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); - cb(attn_inter, "attn_inter_chunk", il); - - // core_attn_out[:, :, i] = attn_inter + attn @ v_new - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); - cb(v_attn, "v_attn_chunk", il); - - ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); - cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs) - - core_attn_out = core_attn_out == nullptr - ? 
core_attn_out_chunk - : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); - - // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk); - //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why? - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t); - - // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew - ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); - new_state = ggml_add(ctx0, - ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), - ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); - } - - // truncate padded tokens - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, - S_v, n_tokens, H_v, n_seqs, - ggml_row_size(core_attn_out->type, S_v), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); - output_tokens = ggml_cont(ctx0, output_tokens); - cb(output_tokens, "output_tokens", il); - - // permute back to (S_v, H_v, n_tokens, n_seqs) - output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); - output_tokens = ggml_cont(ctx0, output_tokens); - - return {output_tokens, new_state}; -} - -std::pair llm_build_qwen3next::build_delta_net_autoregressive( - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il) { - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); - - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf(S_v); - - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); - ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - - // Apply exponential to g_t - g_t = ggml_exp(ctx0, g_t); - - // Apply the gated delta rule for the single timestep - // last_recurrent_state = last_recurrent_state * g_t - state = ggml_mul(ctx0, state, g_t); - - // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) - ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs); - ggml_tensor * kv_mem = ggml_mul(ctx0, state, 
k_t_unsqueezed); - // we need to sum over dim=-2, so we transpose, sum, then transpose again - kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem)))); - - // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v) - ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); - // delta = (v_t - kv_mem) * beta_t - ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs] - ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t); - - // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta - ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta); - state = ggml_add(ctx0, state, k_t_delta); - - // Compute the attention output - // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) - ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t - ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed); - // again, since it's over dim = -2, transpose, sum, transpose back - ggml_tensor * core_attn_out = - ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q)))); - - // core_attn_out should be [S_v, 1, H_v, n_seqs] after this - cb(core_attn_out, "output_tokens", il); - cb(state, "new_state", il); - - return {core_attn_out, state}; -} - ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * input, ggml_tensor * weights, @@ -752,7 +395,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); cb(state, "state_predelta", il); // if head keys and value keys are different, repeat to force tensors into matching shapes @@ -781,13 +424,10 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens - std::pair attn_out; // pair of (output, new_state) - if (n_seq_tokens == 1) { - attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); - } else { - attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); - } + std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, + gate, beta, state, causal_mask, identity, diag_mask, + il, CHUNK_SIZE, hparams.f_norm_rms_eps); + ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; cb(output, "attn_output", il); From 1e8924fd65ad349d1d838412a2172292618f3bbf Mon Sep 17 00:00:00 2001 From: Hugo Date: Mon, 9 Feb 2026 06:12:02 +0000 Subject: [PATCH 10/17] cmake : add variable to skip installing tests (#19370) When packaging downstream, there's usually little point in installing test. The default behaviour remains the same. 
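
A minimal packaging sketch using the new option (directory names are illustrative; the default `LLAMA_TESTS_INSTALL=ON` keeps the current behaviour):

```bash
# Configure with test installation disabled; tests can still be built and run
# from the build tree, they are simply skipped by `cmake --install`.
cmake -S . -B build -DLLAMA_TESTS_INSTALL=OFF
cmake --build build
cmake --install build
```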
--- CMakeLists.txt | 1 + tests/CMakeLists.txt | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d4ed67020..55f3d594db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) +option(LLAMA_TESTS_INSTALL "llama: install tests" ON) # 3rd party libs option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c9436c5995..350bffc315 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,7 +11,9 @@ function(llama_build source) add_executable(${TEST_TARGET} ${TEST_SOURCES}) target_link_libraries(${TEST_TARGET} PRIVATE common) - install(TARGETS ${TEST_TARGET} RUNTIME) + if (LLAMA_TESTS_INSTALL) + install(TARGETS ${TEST_TARGET} RUNTIME) + endif() endfunction() function(llama_test target) @@ -100,7 +102,9 @@ function(llama_build_and_test source) endif() add_executable(${TEST_TARGET} ${TEST_SOURCES}) - install(TARGETS ${TEST_TARGET} RUNTIME) + if (LLAMA_TESTS_INSTALL) + install(TARGETS ${TEST_TARGET} RUNTIME) + endif() target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( From f5e7734ff2e1d2e22015f4a9da9a52c70240a064 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 9 Feb 2026 13:15:42 +0100 Subject: [PATCH 11/17] ggml-virtgpu: add backend documentation (#19354) * ggml-virtgpu: add backend documentation Assisted-by-AI: Claude Code * CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget * README: add the link to docs/backend/GGML-VirtGPU/ggml-virt.md * docs/ggml-virt: add link to testing + configuration * Revert "CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget" This reverts commit 8ece8e72e24d305f308505c08ebb75804546374e. * drop the ggml- prefix * s/ggerganov/ggml-org * Relocate VirtGPU.md * reorganize the text * turn turn the ascii diagram into a mermaid * README.md: update the link to the main doc --- README.md | 1 + docs/backend/VirtGPU.md | 180 +++++++++++++++++++++ docs/backend/VirtGPU/configuration.md | 174 ++++++++++++++++++++ docs/backend/VirtGPU/development.md | 220 ++++++++++++++++++++++++++ 4 files changed, 575 insertions(+) create mode 100644 docs/backend/VirtGPU.md create mode 100644 docs/backend/VirtGPU/configuration.md create mode 100644 docs/backend/VirtGPU/development.md diff --git a/README.md b/README.md index dac020ad37..5c11f38048 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | | [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon | +| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | ## Obtaining and quantizing models diff --git a/docs/backend/VirtGPU.md b/docs/backend/VirtGPU.md new file mode 100644 index 0000000000..c81468da13 --- /dev/null +++ b/docs/backend/VirtGPU.md @@ -0,0 +1,180 @@ +# GGML-VirtGPU Backend + +The GGML-VirtGPU backend enables GGML applications to run machine +learning computations on host hardware while the application itself +runs inside a virtual machine. 
It uses host-guest shared memory to
+efficiently share data buffers between the two sides.
+
+This backend relies on virtio-gpu and the VirglRenderer API Remoting
+(APIR) component. The backend is split into two libraries:
+- a GGML implementation (the "remoting frontend"), running in the
+  guest and interacting with the virtgpu device
+- a VirglRenderer APIR-compatible library (the "remoting backend"),
+  running in the host and interacting with Virglrenderer and an actual
+  GGML device backend.
+
+## OS support
+
+| OS | Status | Backend | CI testing | Notes
+| -------- | ----------------- | ----------- | ----------- | -----
+| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14
+| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15
+| MacOS 26 | Not tested | | |
+| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks
+
+
+## Architecture Overview
+
+The GGML-VirtGPU backend consists of three main components:
+
+```mermaid
+graph TD
+    %% Nodes
+
+    subgraph GuestVM ["Guest VM - Frontend"]
+        App([GGML Application 
llama.cpp, etc.]) + + direction TB + Interface[GGML Backend Interface] + Comm["GGML-VirtGPU
(hypercalls + shared mem)"] + + App --> Interface + Interface --> Comm + end + + API[virtio-gpu / virglrenderer API] + + subgraph HostSystem [Host System - Backend] + direction TB + Dispatcher[GGML-VirtGPU-Backend] + BackendLib[GGML Backend library
Metal / Vulkan / CPU / ...] + + Dispatcher --> BackendLib + end + + %% Connections + Comm --> API + API --> HostSystem +``` + +### Key Components + +1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host +2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends +3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer + +## Features + +- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.) +- **Zero-copy data transfer** via host-guest shared memory pages + +## Communication Protocol + +### Hypercalls and Shared Memory + +The backend uses two primary communication mechanisms: + +1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host +2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters + +#### Shared Memory Layout + +Each connection uses two shared memory buffers: + +- **Data Buffer** (24 MiB): For command/response data and tensor transfers +- **Reply Buffer** (16 KiB): For command replies and status information +- **Data Buffers**: Dynamically allocated host-guest shared buffers + served as GGML buffers. + +### APIR Protocol + +The Virglrender API Remoting protocol defines three command types: + +- `HANDSHAKE`: Protocol version negotiation and capability discovery +- `LOADLIBRARY`: Dynamic loading of backend libraries on the host +- `FORWARD`: API function call forwarding + +### Binary Serialization + +Commands and data are serialized using a custom binary protocol with: + +- Fixed-size encoding for basic types +- Variable-length arrays with size prefixes +- Buffer bounds checking +- Error recovery mechanisms + +## Supported Operations + +### Device Operations +- Device enumeration and capability queries +- Memory information (total/free) +- Backend type detection + +### Buffer Operations +- Buffer allocation and deallocation +- Tensor data transfer (host ↔ guest) +- Memory copying and clearing + +### Computation Operations +- Graph execution forwarding + +## Build Requirements + +### Guest-side Dependencies +- `libdrm` for DRM/virtio-gpu communication +- C++20 compatible compiler +- CMake 3.14+ + +### Host-side Dependencies +- virglrenderer with APIR support (pending upstream review) +- Target backend libraries (libggml-metal, libggml-vulkan, etc.) + +## Configuration + +### Environment Variables + +- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library +- `GGML_VIRTGPU_DEBUG`: Enable debug logging + +### Build Options + +- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`) +- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`) + +### System Requirements + +- VM with virtio-gpu support +- VirglRenderer with APIR patches +- Compatible backend libraries on host + +## Limitations + +- **VM-specific**: Only works in virtual machines with virtio-gpu support +- **Host dependency**: Requires properly configured host-side backend +- **Latency**: Small overhead from VM escaping for each operation + + +* This work is pending upstream changes in the VirglRenderer + project. 
+ * The backend can be tested with Virglrenderer compiled from source + using this PR: + https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 +* This work is pending changes in the VMM/hypervisor running the + virtual machine, which need to know how to route the newly + introduced APIR capset. + * The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows + using the Venus capset, until the relevant hypervisors have been + patched. However, setting this flag breaks the Vulkan/Venus normal + behavior. + * The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the + `ggml-virtgpu` backend to use the APIR capset. This will become + the default when the relevant hypervisors have been patched. + +* This work focused on improving the performance of llama.cpp running + on MacOS containers, and is mainly tested on this platform. The + linux support (via `krun`) is in progress. + +## See Also + +- [Development and Testing](VirtGPU/development.md) +- [Backend configuration](VirtGPU/configuration.md) diff --git a/docs/backend/VirtGPU/configuration.md b/docs/backend/VirtGPU/configuration.md new file mode 100644 index 0000000000..597862d5c8 --- /dev/null +++ b/docs/backend/VirtGPU/configuration.md @@ -0,0 +1,174 @@ +# GGML-VirtGPU Backend Configuration + +This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components. + +## Environment Variables Overview + +The ggml-virtgpu backend uses environment variables for configuration across three main components: +- **Frontend (Guest)**: GGML applications running in VMs +- **Hypervisor**: Virglrenderer/APIR system +- **Backend (Host)**: Host-side GGML backend integration + +## Frontend (Guest-side) Configuration + +### GGML_REMOTING_USE_APIR_CAPSET +- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp` +- **Type**: Boolean flag (presence-based) +- **Purpose**: Controls which virtio-gpu capability set to use for communication +- **Values**: + - Set (any value): Use the APIR capset (long-term setup) + - Unset: Use the Venus capset (easier for testing with an unmodified hypervisor) +- **Default**: Unset (Venus capset) +- **Usage**: + ```bash + export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset + # or leave unset for Venus capset + ``` + +## Hypervisor (Virglrenderer/APIR) Configuration + +These environment variables are used during the transition phase for +running with an unmodified hypervisor (not supporting the +VirglRenderer APIR component). They will be removed in the future, and +the hypervisor will instead configure VirglRenderer with the APIR +_Configuration Key_. 
+ +### VIRGL_APIR_BACKEND_LIBRARY +- **Location**: `virglrenderer/src/apir/apir-context.c` +- **Configuration Key**: `apir.load_library.path` +- **Type**: File path string +- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load +- **Required**: Yes +- **Example**: + ```bash + export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so" + ``` + +### VIRGL_ROUTE_VENUS_TO_APIR +- **Location**: `virglrenderer/src/apir/apir-renderer.h` +- **Type**: Boolean flag (presence-based) +- **Purpose**: Temporary workaround to route Venus capset calls to APIR during hypervisor transition period +- **Status**: will be removed once hypervisors support APIR natively +- **Warning**: Breaks normal Vulkan/Venus functionality +- **Usage**: + ```bash + export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor + ``` + +### VIRGL_APIR_LOG_TO_FILE +- **Location**: `virglrenderer/src/apir/apir-renderer.c` +- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE` +- **Type**: File path string +- **Purpose**: Enable debug logging from the VirglRenderer APIR component to specified file +- **Required**: No (optional debugging) +- **Default**: Logging to `stderr` +- **Usage**: + ```bash + export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log" + ``` + +## Backend (Host-side) Configuration + +These environment variables are used during the transition phase for +running with an unmodified hypervisor (not supporting the +VirglRenderer APIR component). They will be removed in the future, and +the hypervisor will instead configure VirglRenderer with the APIR +_Configuration Key_. + +### APIR_LLAMA_CPP_GGML_LIBRARY_PATH +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp` +- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH` +- **Configuration Key**: `ggml.library.path` +- **Type**: File path string +- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.) 
+- **Required**: **Yes** - backend initialization fails without this +- **Examples**: + ```bash + # macOS with Metal backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib" + + # Linux with CUDA backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so" + + # macOS or Linux with Vulkan backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so" + ``` + +### APIR_LLAMA_CPP_GGML_LIBRARY_REG +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp` +- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG` +- **Configuration Key**: `ggml.library.reg` +- **Type**: Function symbol name string +- **Purpose**: Name of the backend registration function to call after loading the library +- **Required**: No (defaults to `ggml_backend_init`) +- **Default**: `ggml_backend_init` +- **Examples**: + ```bash + # Metal backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg" + + # CUDA backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg" + + # Vulkan backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg" + + # Generic fallback (default) + # export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init" + ``` + +### APIR_LLAMA_CPP_LOG_TO_FILE +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62` +- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE` +- **Type**: File path string +- **Purpose**: Enable debug logging from the GGML backend to specified file +- **Required**: No (optional debugging) +- **Usage**: + ```bash + export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log" + ``` + +## Configuration Flow + +The configuration system works as follows: + +1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY` + +2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables: + - `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY` + - `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH` + - `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG` + - this step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables. + +3. **Backend Initialization**: The backend queries the configuration via callbacks: + - `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path + - `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function + +4. 
**Library Loading**: The backend dynamically loads and initializes the specified GGML library + +## Error Messages + +Common error scenarios and their messages: + +- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"` +- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"` + +## Example Complete Configuration + +Here's an example configuration for a macOS host with Metal backend: + +```bash +# Hypervisor environment +export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib" + +# Backend configuration +export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib" +export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg" + +# Optional logging +export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log" +export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log" + +# Guest configuration +export GGML_REMOTING_USE_APIR_CAPSET=1 +``` diff --git a/docs/backend/VirtGPU/development.md b/docs/backend/VirtGPU/development.md new file mode 100644 index 0000000000..ca2e47772a --- /dev/null +++ b/docs/backend/VirtGPU/development.md @@ -0,0 +1,220 @@ +# Development and Testing + +## Development + +### Code Generation + +The backend uses code generation from YAML configuration: + +```bash +# Regenerate protocol code +cd ggml-virtgpu/ +python regenerate_remoting.py +``` + +### Adding New Operations + +1. Add function definition to `ggmlremoting_functions.yaml` +2. Regenerate code with `regenerate_remoting.py` +3. Implement guest-side forwarding in `virtgpu-forward-*.cpp` +4. Implement host-side handling in `backend-dispatched-*.cpp` + +## Testing + +This document provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers. + +### Prerequisites + +The testing setup requires: + +- macOS host system +- Container runtime with `libkrun` provider (podman machine) +- Access to development patchset for VirglRenderer + +### Required Patchsets + +The backend requires patches that are currently under review: + +- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference) +- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos +- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux + +### Build Instructions + +#### 1. Build ggml-virtgpu-backend (Host-side, macOS) + +```bash +# Build the backend that runs natively on macOS +mkdir llama.cpp +cd llama.cpp +git clone https://github.com/ggml-org/llama.cpp.git src +cd src + +LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend + +cmake -S . -B $LLAMA_MAC_BUILD \ + -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=ON \ + -DGGML_REMOTINGBACKEND=ONLY \ + -DGGML_METAL=ON + +TARGETS="ggml-metal" +cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS + +# Build additional tools for native benchmarking +EXTRA_TARGETS="llama-run llama-bench" +cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS +``` + +#### 2. 
Build virglrenderer (Host-side, macOS) + +```bash +# Build virglrenderer with APIR support +mkdir virglrenderer +git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src +cd src + +VIRGL_BUILD_DIR=$PWD/build + +# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor + +meson setup $VIRGL_BUILD_DIR \ + -Dvenus=true \ + -Dapir=true + +ninja -C $VIRGL_BUILD_DIR +``` + +#### 3. Build ggml-virtgpu (Guest-side, Linux) + +Option A: Build from a script: + +```bash +# Inside a Linux container +mkdir llama.cpp +git clone https://github.com/ggml-org/llama.cpp.git src +cd src + +LLAMA_LINUX_BUILD=$PWD//build-virtgpu + +cmake -S . -B $LLAMA_LINUX_BUILD \ + -DGGML_VIRTGPU=ON + +ninja -C $LLAMA_LINUX_BUILD +``` + +Option B: Build container image with frontend: + +```bash +cat << EOF > remoting.containerfile +FROM quay.io/fedora/fedora:43 +USER 0 + +WORKDIR /app/remoting + +ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git" +ARG LLAMA_CPP_VERSION="master" +ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON" +ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4" + +RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel + +RUN git clone "\${LLAMA_CPP_REPO}" src \\ + && git -C src fetch origin \${LLAMA_CPP_VERSION} \\ + && git -C src reset --hard FETCH_HEAD + +RUN mkdir -p build \\ + && cd src \\ + && set -o pipefail \\ + && cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\ + && cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS} + +ENTRYPOINT ["/app/remoting/src/build/bin/llama-server"] +EOF + +mkdir -p empty_dir +podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu +``` + +### Environment Setup + +#### Set krunkit Environment Variables + +```bash +# Define the base directories (adapt these paths to your system) +VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build +LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend + +# For krunkit to load the custom virglrenderer library +export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src + +# For Virglrenderer to load the ggml-remotingbackend library +export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib" + +# For llama.cpp remotingbackend to load the ggml-metal backend +export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib" +export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg +``` + +#### Launch Container Environment + +```bash +# Set container provider to libkrun +export CONTAINERS_MACHINE_PROVIDER=libkrun +podman machine start +``` + +#### Verify Environment + +Confirm that krunkit is using the correct virglrenderer library: + +```bash +lsof -c krunkit | grep virglrenderer +# Expected output: +# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib +``` + +### Running Tests + +#### Launch Test Container + +```bash +# Optional model caching +mkdir -p models +PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models" + +podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri localhost/llama-cpp.virtgpu +``` + +#### Test llama.cpp in Container + +```bash + +# Run performance benchmark +/app/remoting/build/bin/llama-bench -m ./llama3.2 +``` + +Expected output (performance may vary): +``` +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | -------------------: 
| +| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | pp512 | 991.30 ± 0.66 | +| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | tg128 | 85.71 ± 0.11 | +``` + +### Troubleshooting + +#### SSH Environment Variable Issues + +⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from SSH doesn't work on macOS. Here is a workaround: + +**Workaround 1: Replace system library** +```bash +VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system +BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib +VIRGL_LIB=libvirglrenderer.1.dylib + +cd $BREW_VIRGL_DIR +mv $VIRGL_LIB ${VIRGL_LIB}.orig +ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB +``` From 972f323e73bf0b28358ccaa3b9aa02779421f260 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Feb 2026 14:57:51 +0200 Subject: [PATCH 12/17] revert : "[Model] Qwen3.5 dense and MoE support (no vision) (#19435)" (#19453) This reverts commit 39bf692af1cba2a1072e4a42425611bf1ec2807d. --- convert_hf_to_gguf.py | 78 ++--- gguf-py/gguf/constants.py | 59 ---- gguf-py/gguf/tensor_mapping.py | 6 +- src/CMakeLists.txt | 3 - src/llama-arch.cpp | 61 ---- src/llama-arch.h | 2 - src/llama-context.cpp | 2 +- src/llama-model.cpp | 154 -------- src/models/delta.cpp | 618 --------------------------------- src/models/kimi-linear.cpp | 1 + src/models/models.h | 102 +----- src/models/qwen3-5.cpp | 421 ---------------------- src/models/qwen3-5moe.cpp | 52 --- src/models/qwen3next.cpp | 372 +++++++++++++++++++- 14 files changed, 399 insertions(+), 1532 deletions(-) delete mode 100644 src/models/delta.cpp delete mode 100644 src/models/qwen3-5.cpp delete mode 100644 src/models/qwen3-5moe.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e64756a74a..843c00a896 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4102,27 +4102,39 @@ class Qwen2MoeModel(TextModel): # process the experts separately name = name.replace("language_model.", "") # InternVL - # handle pre-packed expert tensors (e.g. Qwen3.5 MoE, Qwen3Next) - # HF stores these using nn.Linear convention: [n_expert, out_features, in_features] - # This matches the individual expert stacking path below (which stacks - # per-expert [out, in] weights into [n_expert, out, in]), so no permute is needed. 
+ # handle aggregated expert tensors + # GGUF stores dimensions reversed from PyTorch, so: + # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} + # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) + # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): mapped = f"{name}.weight" if not name.endswith(".weight") else name - # HF: [n_expert, n_embd, n_ff] → GGML: {n_ff, n_embd, n_expert} ✓ - yield from super().modify_tensors(data_torch, mapped, bid) + # Input: (n_expert=128, n_ff_exp=768, n_embd=2048) + # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128} + # Need PyTorch: (128, 2048, 768) [reversed of GGML] + # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) + permuted = data_torch.permute(0, 2, 1).contiguous() + yield from super().modify_tensors(permuted, mapped, bid) return if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - # HF: [n_expert, 2*n_ff, n_embd] → split on dim=1 - n_ff = data_torch.shape[1] // 2 - gate = data_torch[:, :n_ff, :].contiguous() - up = data_torch[:, n_ff:, :].contiguous() - # gate/up: [n_expert, n_ff, n_embd] → GGML: {n_embd, n_ff, n_expert} ✓ - base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") - mapped_gate = f"{base_name}.gate_proj.weight" - mapped_up = f"{base_name}.up_proj.weight" - yield from super().modify_tensors(gate, mapped_gate, bid) - yield from super().modify_tensors(up, mapped_up, bid) + if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + split_dim = data_torch.shape[-1] // 2 + gate = data_torch[..., :split_dim].contiguous() + up = data_torch[..., split_dim:].contiguous() + # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) + # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} + # Need PyTorch: (128, 768, 2048) [reversed of GGML] + # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) + base_name = name.removesuffix(".weight") + base = base_name.rsplit('.', 1)[0] + mapped_gate = f"{base}.gate_proj.weight" + mapped_up = f"{base}.up_proj.weight" + perm_gate = gate.permute(0, 2, 1).contiguous() + perm_up = up.permute(0, 2, 1).contiguous() + yield from super().modify_tensors(perm_gate, mapped_gate, bid) + yield from super().modify_tensors(perm_up, mapped_up, bid) return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): @@ -4332,40 +4344,6 @@ class Qwen3NextModel(Qwen2MoeModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Qwen3_5ForCausalLM", "Qwen3_5TextForCausalLM") -class Qwen3_5Model(Qwen3NextModel): - model_arch = gguf.MODEL_ARCH.QWEN3_5 - - # Stores whichever of in_proj_a/in_proj_b is seen first, keyed by layer - _pending_ba: dict[int | None, tuple[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle split in_proj_b + in_proj_a → concatenated SSM_BETA_ALPHA - # safetensors sorts alphabetically so in_proj_a arrives before in_proj_b - if "in_proj_a.weight" in name or "in_proj_b.weight" in name: - which = "a" if "in_proj_a" in name else "b" - if bid not in self._pending_ba: - self._pending_ba[bid] = 
(which, data_torch) - return - prev_which, prev_tensor = self._pending_ba.pop(bid) - assert prev_which != which, f"duplicate in_proj_{which} for layer {bid}" - b_tensor = prev_tensor if prev_which == "b" else data_torch - a_tensor = prev_tensor if prev_which == "a" else data_torch - ba_combined = torch.cat([b_tensor, a_tensor], dim=0) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.SSM_BETA_ALPHA, bid, ".weight"), ba_combined) - return - else: - # Qwen3Next uses .qkvz tensor, so we use the super to get the other functionalities - # (norm correction, A_log to A etc.) for free - # Qwen2Moe already does the gate_up conversion properly, just use that - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3_5MoeForCausalLM", "Qwen3_5MoeTextForCausalLM") -class Qwen3_5MoeModel(Qwen3_5Model): - model_arch = gguf.MODEL_ARCH.QWEN3_5_MOE - - @ModelBase.register("RND1") class RND1Model(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.RND1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8a3fab1e1c..3af4fffe95 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -382,8 +382,6 @@ class MODEL_ARCH(IntEnum): QWEN3 = auto() QWEN3MOE = auto() QWEN3NEXT = auto() - QWEN3_5 = auto() - QWEN3_5_MOE = auto() QWEN3VL = auto() QWEN3VLMOE = auto() PHI2 = auto() @@ -814,8 +812,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN3: "qwen3", MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.QWEN3NEXT: "qwen3next", - MODEL_ARCH.QWEN3_5: "qwen3_5", - MODEL_ARCH.QWEN3_5_MOE: "qwen3_5moe", MODEL_ARCH.QWEN3VL: "qwen3vl", MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", MODEL_ARCH.PHI2: "phi2", @@ -1788,61 +1784,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_BETA_ALPHA, MODEL_TENSOR.SSM_OUT ], - MODEL_ARCH.QWEN3_5: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_POST_NORM, - MODEL_TENSOR.ATTN_GATE, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_BETA_ALPHA, - MODEL_TENSOR.SSM_OUT, - ], - MODEL_ARCH.QWEN3_5_MOE: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_POST_NORM, - MODEL_TENSOR.ATTN_GATE, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_GATE_INP_SHEXP, - MODEL_TENSOR.FFN_UP_SHEXP, - MODEL_TENSOR.FFN_DOWN_SHEXP, - MODEL_TENSOR.FFN_GATE_SHEXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_BETA_ALPHA, - MODEL_TENSOR.SSM_OUT, - ], MODEL_ARCH.QWEN3VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 43f32c7b52..167ade7803 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -228,7 +228,6 @@ class TensorNameMap: "transformer_encoder.{bid}.qkv", # neobert 
"layers.{bid}.attn.Wqkv", # modern-bert "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm - "model.layers.{bid}.linear_attn.in_proj_qkv", # qwen3.5 ), # Attention query @@ -359,9 +358,8 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_GATE: ( - "model.layers.{bid}.self_attn.gate_proj", # afmoe - "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate - "model.layers.{bid}.linear_attn.in_proj_z", # qwen3.5 + "model.layers.{bid}.self_attn.gate_proj", # afmoe + "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate ), # Feed-forward norm diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0c164617a1..2115fc4255 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,7 +57,6 @@ add_library(llama models/deci.cpp models/deepseek.cpp models/deepseek2.cpp - models/delta.cpp models/dots1.cpp models/dream.cpp models/ernie4-5-moe.cpp @@ -123,8 +122,6 @@ add_library(llama models/qwen3vl-moe.cpp models/qwen3moe.cpp models/qwen3next.cpp - models/qwen3-5.cpp - models/qwen3-5moe.cpp models/refact.cpp models/rnd1.cpp models/rwkv6-base.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fce46772d7..bd78f1e556 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -35,8 +35,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, - { LLM_ARCH_QWEN3_5, "qwen3_5" }, - { LLM_ARCH_QWEN3_5_MOE, "qwen3_5moe" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_PHI2, "phi2" }, @@ -987,63 +985,6 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, }; - case LLM_ARCH_QWEN3_5: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_POST_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_K_NORM, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_GATE, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_SSM_A_NOSCAN, - LLM_TENSOR_SSM_CONV1D, - LLM_TENSOR_SSM_DT, - LLM_TENSOR_SSM_BETA_ALPHA, - LLM_TENSOR_SSM_IN, - LLM_TENSOR_SSM_NORM, - LLM_TENSOR_SSM_OUT, - }; - case LLM_ARCH_QWEN3_5_MOE: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_POST_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_K_NORM, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_GATE, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_GATE_EXPS, - LLM_TENSOR_FFN_DOWN_EXPS, - LLM_TENSOR_FFN_UP_EXPS, - LLM_TENSOR_FFN_GATE_INP_SHEXP, - LLM_TENSOR_FFN_GATE_SHEXP, - LLM_TENSOR_FFN_DOWN_SHEXP, - LLM_TENSOR_FFN_UP_SHEXP, - LLM_TENSOR_SSM_A_NOSCAN, - LLM_TENSOR_SSM_CONV1D, - LLM_TENSOR_SSM_DT, - LLM_TENSOR_SSM_BETA_ALPHA, - LLM_TENSOR_SSM_IN, - LLM_TENSOR_SSM_NORM, - LLM_TENSOR_SSM_OUT, - }; case LLM_ARCH_QWEN3VL: case LLM_ARCH_CHAMELEON: case LLM_ARCH_HUNYUAN_DENSE: @@ -2733,8 +2674,6 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_KIMI_LINEAR: return true; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index a392ecce2b..e8263369b8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -39,8 +39,6 @@ enum llm_arch { LLM_ARCH_QWEN3, 
LLM_ARCH_QWEN3MOE, LLM_ARCH_QWEN3NEXT, - LLM_ARCH_QWEN3_5, - LLM_ARCH_QWEN3_5_MOE, LLM_ARCH_QWEN3VL, LLM_ARCH_QWEN3VLMOE, LLM_ARCH_PHI2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 80b9a7d46a..a6df893a31 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2013,7 +2013,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN3_5 || model.arch == LLM_ARCH_QWEN3_5_MOE || model.arch == LLM_ARCH_KIMI_LINEAR) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8fc61aee37..674d06c891 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2412,25 +2412,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: - { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - // Load linear attention (gated delta net) parameters - ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); - ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); - ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); - ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); - ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - - // Mark recurrent layers (linear attention layers) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); - } - } break; case LLM_ARCH_MISTRAL3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -7113,129 +7094,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - // Shared experts - layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); - } - } break; - case LLM_ARCH_QWEN3_5: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - - // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - 
const int64_t conv_dim = key_dim * 2 + value_dim; - - const int64_t ba_dim = n_v_heads * 2; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - if (!hparams.is_recurrent(i)) { - // Full attention layers - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - } else { - // Linear attention (gated delta net) specific tensors - layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); - } - - // Dense FFN for all layers - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); - } - } break; - case LLM_ARCH_QWEN3_5_MOE: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - - // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; - - const int64_t ba_dim = n_v_heads * 2; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - if (!hparams.is_recurrent(i)) { - // Full attention layers - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - } else { - // Linear attention (gated delta net) specific tensors - layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); - } - - // MoE FFN - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); @@ -7687,8 +7545,6 @@ void llama_model::print_info() const { arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_QWEN3NEXT || - arch == LLM_ARCH_QWEN3_5 || - arch == LLM_ARCH_QWEN3_5_MOE || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); @@ -8487,14 +8343,6 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; - case LLM_ARCH_QWEN3_5: - { - llm = std::make_unique(*this, params); - } break; - case LLM_ARCH_QWEN3_5_MOE: - { - llm = std::make_unique(*this, params); - } break; case LLM_ARCH_MISTRAL3: { llm = std::make_unique(*this, params); @@ -8755,8 +8603,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: return LLAMA_ROPE_TYPE_NEOX; diff --git a/src/models/delta.cpp b/src/models/delta.cpp deleted file mode 100644 index d1d9837d09..0000000000 --- a/src/models/delta.cpp +++ /dev/null @@ -1,618 +0,0 @@ -#include "models.h" -#include "ggml.h" -#include -#include -#include - -llm_graph_context_delta::llm_graph_context_delta(const llm_graph_params & params) : llm_graph_context_mamba(params) {} - -/** - * Unified Delta Net implementation supporting both GDA and KDA modes. - * - * GDA (Gated Delta Attention): g has shape [H, T, B] in GGML (PyTorch: [B, T, H]) - * - Per-head gating, broadcasts over K dimension - * - * KDA (Key-wise Delta Attention): g has shape [K, H, T, B] in GGML (PyTorch: [B, T, H, K]) - * - Per-key gating - * - * The mode is auto-detected based on g's dimensionality. - * - * Tensor dimension convention: - * GGML: ne[0] is innermost (fastest varying), ne[3] is outermost - * PyTorch: dim 0 is outermost, dim -1 is innermost - * So GGML [A, B, C, D] corresponds to PyTorch [D, C, B, A] - */ - -// Helper to get a slice along dimension 2 (n_chunks dimension) -static ggml_tensor * get_slice_2d(ggml_context * ctx, ggml_tensor * t, int64_t chunk) { - return ggml_view_4d(ctx, t, - t->ne[0], t->ne[1], 1, t->ne[3], - t->nb[1], t->nb[2], t->nb[3], - chunk * t->nb[2]); -} - -/** - * Unified chunked Delta Net implementation. 
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] - * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] - * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] - * @param g Gate tensor: - * GDA: [H_v, n_tokens, n_seqs] - * KDA: [S_k, H_v, n_tokens, n_seqs] - * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - * @param causal_mask Lower triangular mask [chunk_size, chunk_size] - * @param identity Identity matrix [chunk_size, chunk_size] - * @param diag_mask Diagonal mask [chunk_size, chunk_size] - * @param il Layer index (for debugging callbacks) - * @param chunk_size Chunk size for chunked processing - * @param eps_norm Epsilon for L2 normalization - * - * @return Pair of (output_tokens, new_state) - */ -std::pair llm_graph_context_delta::build_delta_net_unified_chunking( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state_reshaped, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - // Detect KDA vs GDA based on g's shape - // GDA: g has shape [H_v, n_tokens, n_seqs] - // KDA: g has shape [S_k, H_v, n_tokens, n_seqs] (4D with ne[0]=S_k) - const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); - - // Validate tensor shapes - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(state_reshaped->ne[0] == S_v && state_reshaped->ne[1] == S_v && state_reshaped->ne[2] == H_v && state_reshaped->ne[3] == n_seqs); - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(H_k == H_v); - - if (is_kda) { - // KDA: g shape [S_k, H_v, n_tokens, n_seqs] - GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs); - } else { - // GDA: g shape [H_v, n_tokens, n_seqs] - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - } - - // L2 normalize q and k - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf((float)S_v); - q = ggml_scale(ctx0, q, scale); - - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - // Permute tensors to working format [S, n_tokens, H, n_seqs] - // Input: [S, H, n_tokens, n_seqs] -> permute(0, 2, 1, 3) -> [S, n_tokens, H, n_seqs] - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - if (is_kda) { - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - } else { - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); - } - beta = ggml_cont(ctx0, 
ggml_permute(ctx0, beta, 2, 0, 1, 3)); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(g, "g_perm", il); - cb(state_reshaped, "state_in", il); - - // Padding for chunk processing - const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; - const int64_t n_chunks = (n_tokens + pad) / chunk_size; - - q = ggml_pad(ctx0, q, 0, pad, 0, 0); - k = ggml_pad(ctx0, k, 0, pad, 0, 0); - v = ggml_pad(ctx0, v, 0, pad, 0, 0); - beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); - g = ggml_pad(ctx0, g, pad, 0, 0, 0); - - - cb(q, "q_pad", il); - cb(k, "k_pad", il); - cb(v, "v_pad", il); - cb(beta, "beta_pad", il); - cb(g, "g_pad", il); - - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - - cb(v_beta, "v_beta", il); - cb(k_beta, "k_beta", il); - - // Reshape to chunks - q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); - k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); - k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); - v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); - v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); - - // Reshape g for chunks - ggml_tensor * g_cumsum; - ggml_tensor * g_cumsum_t; - if (is_kda) { - // KDA: g [S_k, n_tokens+pad, H_k, n_seqs] -> [S_k, chunk_size, n_chunks, H_k * n_seqs] - g = ggml_reshape_4d(ctx0, g, S_k, chunk_size, n_chunks, H_k * n_seqs); - // Cumsum along chunk_size dimension (ne[1]) - // GGML cumsum operates on ne[0], so we need to transpose, cumsum, transpose back - g = ggml_cont(ctx0, ggml_transpose(ctx0, g)); // [chunk_size, S_k, n_chunks, H_k * n_seqs] - g_cumsum_t = ggml_cumsum(ctx0, g); - g_cumsum = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum_t)); // [S_k, chunk_size, n_chunks, H_k * n_seqs] - } else { - // GDA: g [n_tokens+pad, 1, H_k, n_seqs] -> [chunk_size, 1, n_chunks, H_k * n_seqs] - g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); - g_cumsum = ggml_cumsum(ctx0, g); - g_cumsum_t = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_k * n_seqs); - } - - cb(g_cumsum, "g_cumsum", il); - - // Build attention matrix A for the WY representation solve - // For GDA: A[j,i] = sum_k(k[j,k] * exp(g[j] - g[i]) * k[i,k]) = (k @ k^T) * exp(g[j] - g[i]) - // For KDA: A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - // KDA uses decay mask with S_k packed into batch to compute exp(g[j,k] - g[i,k]) per-key - - ggml_tensor * k_decay; - ggml_tensor * decay_mask = nullptr; - ggml_tensor * g_exp_pos = nullptr; - - if (is_kda) { - // KDA: Use decay mask with S_k in leading dimension for efficient mul_mat reduction - // A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - // By putting S_k in dim 0, mul_mat implicitly sums over it - - const int64_t CHB = n_chunks * H_k * n_seqs; - - // g_cumsum_t is [chunk_size, S_k, n_chunks, H_k * n_seqs] - // Reshape to [chunk_size, S_k, CHB] then build decay mask - ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); - ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, gcs, 1, chunk_size, S_k, CHB); - - // Build decay mask: [chunk_size, chunk_size, S_k, CHB] - ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); - decay_mask = ggml_sub(ctx0, gcs_j_bc, 
gcs_i); - - cb(decay_mask, "decay_mask_kda", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] for mul_mat reduction over S_k - decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - - // Reshape k and k_beta for broadcasting with decay_mask - // k_i: indexed at position i (dim 2 of decay_mask) - // k_beta_j: indexed at position j (dim 1 of decay_mask) - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); - ggml_tensor * k_beta_j = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, 1, CHB); - - // decay_k_beta_j[s,j,i,b] = decay[s,j,i,b] * k_beta[s,j,b] - ggml_tensor * decay_k_beta_j = ggml_mul(ctx0, decay_mask, k_beta_j); - - // mul_mat sums over S_k: result[j,1,i,CHB] = sum_s decay_k_beta_j[s,j,i,b] * k_i[s,1,i,b] - k_decay = ggml_mul_mat(ctx0, decay_k_beta_j, k_i); - k_decay = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, k_decay, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); - - // g_exp_pos is still needed for later (kbeta_gexp, etc.) - g_exp_pos = ggml_exp(ctx0, g_cumsum); - } else { - // GDA: Use decay mask approach (g broadcasts over K dimension) - // g_cumsum [chunk_size, 1, n_chunks, H_v * n_seqs] - ggml_tensor * gcs_i = g_cumsum; - ggml_tensor * gcs_j = g_cumsum_t; - g_exp_pos = ggml_exp(ctx0, g_cumsum_t); - ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); - decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); - - cb(decay_mask, "decay_mask", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); - k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - } - - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); - - cb(attn, "attn_pre_solve", il); - - // Solve triangular system: (I + L) @ X = I, where L is strictly lower triangular - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, causal_mask); - attn = ggml_add(ctx0, attn, identity); - - cb(attn, "attn_solved", il); - - // Compute u = A @ v and w = A @ (g.exp() * k) - v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); - - ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, g_exp_pos); - cb(kbeta_gexp, "kbeta_gexp", il); - - ggml_tensor * k_cumdecay = ggml_cont(ctx0, ggml_transpose(ctx0, - ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); - cb(k_cumdecay, "k_cumdecay", il); - - // Attention scores q @ k^T with decay - // For GDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j] - g[i]) * k[i,k]) - // For KDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - ggml_tensor * attn_kq; - if (is_kda) { - // KDA: Same approach as k_decay - use decay_mask with S_k in leading dim - const int64_t CHB = n_chunks * H_k * n_seqs; - - // Rebuild decay mask (same structure as k_decay) - ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); - ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, 
gcs, 1, chunk_size, S_k, CHB); - ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); - ggml_tensor * decay_mask_kq = ggml_sub(ctx0, gcs_j_bc, gcs_i); - - decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); - decay_mask_kq = ggml_exp(ctx0, decay_mask_kq); - decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); - - // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] - decay_mask_kq = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask_kq, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - - // q_j: indexed at position j, k_i: indexed at position i - ggml_tensor * q_j = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); - - // decay_q_j[s,j,i,b] = decay[s,j,i,b] * q[s,j,b] - ggml_tensor * decay_q_j = ggml_mul(ctx0, decay_mask_kq, q_j); - - // mul_mat sums over S_k - attn_kq = ggml_mul_mat(ctx0, decay_q_j, k_i); - attn_kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, attn_kq, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); - } else { - // GDA: Use decay mask - attn_kq = ggml_mul_mat(ctx0, k, q); - attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); - attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); - } - cb(attn_kq, "attn_kq", il); - - // Compute g_last and g_diff for state updates - ggml_tensor * g_last; - ggml_tensor * g_diff_exp; - ggml_tensor * g_last_exp; - - if (is_kda) { - // KDA: g_cumsum [S_k, chunk_size, n_chunks, H_k * n_seqs] - // Get last element along chunk_size dimension (ne[1]) - g_last = ggml_view_4d(ctx0, g_cumsum, - g_cumsum->ne[0], 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[1] - 1) * g_cumsum->nb[1]); - g_last = ggml_cont(ctx0, g_last); - g_last_exp = ggml_exp(ctx0, g_last); - - // g_diff = g_last - g_cumsum - ggml_tensor * g_last_broadcast = ggml_repeat_4d(ctx0, g_last, - g_cumsum->ne[0], g_cumsum->ne[1], g_cumsum->ne[2], g_cumsum->ne[3]); - ggml_tensor * g_diff = ggml_sub(ctx0, g_last_broadcast, g_cumsum); - g_diff_exp = ggml_exp(ctx0, g_diff); - } else { - // GDA: g_cumsum [chunk_size, 1, n_chunks, H_k * n_seqs] - g_last = ggml_view_4d(ctx0, g_cumsum, - 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); - g_last = ggml_cont(ctx0, g_last); - g_last_exp = ggml_exp(ctx0, g_last); - - ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); - g_diff_exp = ggml_exp(ctx0, g_diff); - } - - cb(g_last, "g_last", il); - cb(g_last_exp, "g_last_exp", il); - - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp); - cb(key_gdiff, "key_gdiff", il); - - // Process chunks - ggml_tensor * new_state = state_reshaped; - ggml_tensor * core_attn_out = nullptr; - - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { - ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); - ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); - ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); - ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); - ggml_tensor * gexp_chunk = get_slice_2d(ctx0, g_exp_pos, chunk); - - cb(attn_chunk, "attn_chunk", il); - - ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), - S_v, S_v, 1, H_v * n_seqs); - - // v_prime = k_cumdecay @ state - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); - cb(v_prime, "v_prime_chunk", il); - - // v_new = v - v_prime - ggml_tensor * v_new = ggml_sub(ctx0, 
ggml_repeat(ctx0, v_chunk, v_prime), v_prime); - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); - cb(v_new, "v_new_chunk", il); - - // attn_inter = (q * g.exp()) @ state - ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); - cb(attn_inter, "attn_inter_chunk", il); - - // output = attn_inter + attn @ v_new - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); - cb(v_attn, "v_attn_chunk", il); - - ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); - cb(core_attn_out_chunk, "core_attn_out_chunk", il); - - core_attn_out = core_attn_out == nullptr - ? core_attn_out_chunk - : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); - - // State update: state = state * g_last_exp + key_gdiff^T @ v_new - ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk)); - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff))); - - ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); - - if (is_kda) { - // KDA: g_last_exp [S_k, 1, n_chunks, H_k * n_seqs] - // State: [S_v, S_v, H_v, n_seqs] - // Need to reshape g_last_exp to broadcast correctly over V dimension only - gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, - 1, gexp_last_chunk->ne[0], H_v, n_seqs); // [1, S_k, H_v, n_seqs] - // Transpose to [S_k, 1, H_v, n_seqs] then broadcast - gexp_last_chunk = ggml_cont(ctx0, ggml_permute(ctx0, gexp_last_chunk, 1, 0, 2, 3)); - } else { - // GDA: g_last_exp [1, 1, n_chunks, H_k * n_seqs] - // Broadcasts over both K and V dimensions - gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, - gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs); - } - - new_state = ggml_add(ctx0, - ggml_mul(ctx0, new_state, gexp_last_chunk), - ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); - } - - // Truncate padding and permute back - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, - S_v, n_tokens, H_v, n_seqs, - ggml_row_size(core_attn_out->type, S_v), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); - output_tokens = ggml_cont(ctx0, output_tokens); - - cb(output_tokens, "output_tokens", il); - - output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); - output_tokens = ggml_cont(ctx0, output_tokens); - - return {output_tokens, new_state}; -} - - -/** - * Unified autoregressive Delta Net implementation (single token processing). - * - * This implementation uses matrix multiplication instead of elementwise operations + summation, - * which is more efficient and mathematically equivalent. See inline comments for equivalences. 
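[Editor's note, not part of the patch] The equivalence this comment (and the inline "Equivalence to previous version" comments further down) relies on is simply that an elementwise product followed by a sum over the shared axis is a matrix-vector product. A minimal NumPy check, with illustrative sizes and the per-head state oriented as `(d_k, d_v)`:

```python
import numpy as np

d_k, d_v = 8, 16
rng = np.random.default_rng(0)

state = rng.standard_normal((d_k, d_v))     # per-head recurrent state
k = rng.standard_normal(d_k)                # current key

# "elementwise multiply, then sum over k" formulation
kv_mem_elementwise = (state * k[:, None]).sum(axis=0)

# matrix-multiplication formulation used by the code
kv_mem_matmul = state.T @ k

assert np.allclose(kv_mem_elementwise, kv_mem_matmul)
```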
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, 1, n_seqs] - * @param k Key tensor [S_k, H_k, 1, n_seqs] - * @param v Value tensor [S_v, H_v, 1, n_seqs] - * @param g Gate tensor: - * GDA: [H_v, 1, n_seqs] - * KDA: [S_k, H_v, 1, n_seqs] - * @param beta Beta tensor [H_v, 1, 1, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - * @param il Layer index (for debugging callbacks) - * @param eps_norm Epsilon for L2 normalization - * - * @return Pair of (output_tokens, new_state) - */ -std::pair llm_graph_context_delta::build_delta_net_unified_autoregressive( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(n_tokens == 1); // Autoregressive mode is for single token - - // Detect KDA vs GDA based on g's shape - // GDA: g has shape [H_v, 1, n_seqs] or [H_v, n_tokens, n_seqs] - // KDA: g has shape [S_k, H_v, 1, n_seqs] or [S_k, H_v, n_tokens, n_seqs] - const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); - - // Validate shapes - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(H_k == H_v); - - if (is_kda) { - GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v); - } else { - GGML_ASSERT(g->ne[0] == H_v); - } - - // L2 normalize q and k - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf((float)S_v); - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - // Reshape g and beta for broadcasting - ggml_tensor * g_t; - ggml_tensor * beta_t; - - if (is_kda) { - // KDA: g [S_k, H_v, 1, n_seqs] -> [S_k, 1, H_k, n_seqs] - // For state multiplication, need [1, S_k, H_v, n_seqs] to broadcast over V only - g_t = ggml_reshape_4d(ctx0, g, S_k, 1, H_k, n_seqs); - } else { - // GDA: g [H_v, 1, n_seqs] -> [1, 1, H_k, n_seqs] - // For state multiplication, broadcasts over both K and V - g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); - } - - beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - - // Apply exponential to g_t - g_t = ggml_exp(ctx0, g_t); - - // State decay: state = state * exp(g) - if (is_kda) { - // KDA: g_t [S_k, 1, H_k, n_seqs], state [S_v, S_v, H_v, n_seqs] - // Need to broadcast g_t over V dimension (ne[0] of state) - // Permute g_t to [1, S_k, H_k, n_seqs] for correct broadcasting - ggml_tensor * g_broadcast = ggml_cont(ctx0, ggml_permute(ctx0, g_t, 1, 0, 2, 3)); - state = ggml_mul(ctx0, state, g_broadcast); - } else { - // GDA: g_t [1, 1, H_k, n_seqs] broadcasts over both dimensions - state = ggml_mul(ctx0, state, g_t); - } - - // Equivalence to previous 
version: - // Previous: kv_mem = sum_k(state * k) using elementwise mult + sum_rows - // Current: k_state = state_t @ k_t using matrix multiplication - // These are equivalent because: sum_k(A * B) = A @ B when dimensions align - ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - ggml_tensor * k_t = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); - ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k_t); - - // v_diff = v - k_state (equivalent to v - kv_mem in previous version) - ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); - ggml_tensor * v_diff = ggml_sub(ctx0, v_t, k_state); - ggml_tensor * k_beta = ggml_mul(ctx0, k_t, beta_t); - - // Equivalence to previous version: - // Previous: state += k.unsqueeze(-1) * delta where delta = (v - kv_mem) * beta - // Current: state += v_diff^T @ k_beta^T using matrix multiplication - // These are equivalent because: outer_product(k, v_diff * beta) = v_diff^T @ k^T - state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); - - // Equivalence to previous version: - // Previous: core_attn_out = sum_k(state * q) using elementwise mult + sum_rows - // Current: core_attn_out = state_t @ q using matrix multiplication - // These are equivalent because: sum_k(A * B) = A @ B when dimensions align - q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); - state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); - // core_attn_out should be [S_v, 1, H_v, n_seqs] after this - cb(core_attn_out, "output_tokens", il); - cb(state, "new_state", il); - - return {core_attn_out, state}; -} - - -/** - * Main entry point that dispatches to chunked or autoregressive based on n_tokens. 
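[Editor's note, not part of the patch] Stripped of the ggml plumbing, the per-token recurrence implemented by the autoregressive path above is the gated delta rule: decay the state, predict the value for the incoming key, write back a β-scaled correction, then read out with the query. A minimal NumPy sketch for a single GDA head, assuming the state is oriented as `(d_k, d_v)` as in the inline comments; names and ε handling are illustrative:

```python
import numpy as np

def gated_delta_step(state, q, k, v, g, beta, eps=1e-6):
    """One token of the gated delta rule for one head (GDA: scalar gate per head).

    state : (d_k, d_v) recurrent fast-weight matrix
    q, k  : (d_k,)     query / key for this token
    v     : (d_v,)     value for this token
    g     : scalar     log-decay; the state is scaled by exp(g)
    beta  : scalar     write strength in (0, 1), i.e. sigmoid of the raw beta
    """
    q = q / (np.linalg.norm(q) + eps) / np.sqrt(len(v))   # L2-normalize, then scale
    k = k / (np.linalg.norm(k) + eps)

    state = np.exp(g) * state                             # decay the old memory
    v_pred = state.T @ k                                  # value the memory predicts for k
    state = state + np.outer(k, beta * (v - v_pred))      # delta-rule correction
    out = state.T @ q                                     # read out with the query
    return out, state
```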
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] - * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] - * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] - * @param g Gate tensor (GDA: [H_v, n_tokens, n_seqs], KDA: [S_k, H_v, n_tokens, n_seqs]) - * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - */ -std::pair llm_graph_context_delta::build_delta_net_unified( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t n_tokens = q->ne[2]; - - if (n_tokens == 1) { - return build_delta_net_unified_autoregressive( - ctx0, q, k, v, g, beta, state, il, eps_norm); - } - return build_delta_net_unified_chunking( - ctx0, q, k, v, g, beta, state, causal_mask, identity, diag_mask, - il, chunk_size, eps_norm); -} diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index d9ee698075..0f037d1a39 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,4 +1,5 @@ #include "models.h" +#include "ggml.h" #define CHUNK_SIZE 64 diff --git a/src/models/models.h b/src/models/models.h index 2a750c168e..cfcbb9aaa5 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -17,53 +17,6 @@ struct llm_graph_context_mamba : public llm_graph_context { }; -struct llm_graph_context_delta : public llm_graph_context_mamba { - llm_graph_context_delta(const llm_graph_params & params); - - virtual ~llm_graph_context_delta() = default; - - std::pair build_delta_net_unified_chunking( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm); - - std::pair build_delta_net_unified_autoregressive( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il, - float eps_norm); - - std::pair build_delta_net_unified( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm); -}; - // Base class for RWKV-related models struct llm_build_rwkv6_base : public llm_graph_context { const llama_model & model; @@ -523,7 +476,7 @@ struct llm_build_qwen3vl : public llm_graph_context { struct llm_build_qwen3vlmoe : public llm_graph_context { llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params); }; -struct llm_build_qwen3next : public llm_graph_context_delta { +struct llm_build_qwen3next : public llm_graph_context_mamba { llm_build_qwen3next(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( @@ -581,59 +534,6 @@ private: const llama_model & model; }; -struct llm_build_qwen3_5 : public llm_graph_context_delta { - llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params); - -protected: - // Tag type for subclass constructors that need to call build_graph() 
themselves - // (to ensure virtual dispatch works correctly) - struct defer_graph_build_t {}; - - llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t); - - void build_graph(); - - virtual ggml_tensor * build_layer_ffn( - ggml_tensor * cur, - int il); - - const llama_model & model; - -private: - ggml_tensor * build_layer_attn( - llm_graph_input_attn_kv * inp_attn, - ggml_tensor * cur, - ggml_tensor * inp_pos, - int il); - - ggml_tensor * build_layer_attn_linear( - llm_graph_input_rs * inp, - ggml_tensor * cur, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il); - - ggml_tensor * build_norm_gated( - ggml_tensor * input, - ggml_tensor * weights, - ggml_tensor * gate, - int layer); - - std::pair build_qkvz( - ggml_tensor * input, - int il); -}; - -struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 { - llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params); - -protected: - ggml_tensor * build_layer_ffn( - ggml_tensor * cur, - int il) override; -}; - struct llm_build_qwen : public llm_graph_context { llm_build_qwen(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/qwen3-5.cpp b/src/models/qwen3-5.cpp deleted file mode 100644 index 0947299d73..0000000000 --- a/src/models/qwen3-5.cpp +++ /dev/null @@ -1,421 +0,0 @@ -#include "models.h" - -#define CHUNK_SIZE 64 - -llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_delta(params), model(model) { - build_graph(); -} - -// virtual call in constructor fix -llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t /*tag*/) : - llm_graph_context_delta(params), model(model) { -} - -void llm_build_qwen3_5::build_graph() { - ggml_tensor * cur; - ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - cb(inpL, "model.embed_tokens", -1); - - auto * inp = build_inp_mem_hybrid(); - - ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = build_inp_out_ids(); - - ggml_tensor * causal_mask = - ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), - GGML_TRI_TYPE_LOWER); - - ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); - ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); - - ggml_build_forward_expand(gf, causal_mask); - ggml_build_forward_expand(gf, identity); - ggml_build_forward_expand(gf, diag_mask); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.is_recurrent(il)) { - cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il); - } else { - cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); - } - - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "attn_residual", il); - - ggml_tensor * ffn_residual = cur; - - ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(attn_post_norm, "attn_post_norm", il); - - cur = build_layer_ffn(attn_post_norm, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "post_ffn", il); 
- - inpL = cur; - } - cur = inpL; - - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - - cb(cur, "result_norm", -1); - res->t_embd = cur; - - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); -} - -ggml_tensor * llm_build_qwen3_5::build_norm_gated( - ggml_tensor * input, - ggml_tensor * weights, - ggml_tensor * gate, - int layer) { - ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); - ggml_tensor * gated_silu = ggml_silu(ctx0, gate); - - return ggml_mul(ctx0, normalized, gated_silu); -} - -ggml_tensor * llm_build_qwen3_5::build_layer_attn( - llm_graph_input_attn_kv * inp, - ggml_tensor * cur, - ggml_tensor * inp_pos, - int il) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] - cb(Qcur_full, "Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); - cb(Qcur, "Qcur_reshaped", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "gate_reshaped", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp, - nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "attn_pregate", il); - - ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); - cb(gate_sigmoid, "gate_sigmoid", il); - - cur = ggml_mul(ctx0, cur, gate_sigmoid); - cb(cur, "attn_gated", il); - - cur = build_lora_mm(model.layers[il].wo, cur); - cb(cur, "attn_output", il); - - return cur; -} - -std::pair llm_build_qwen3_5::build_qkvz( - ggml_tensor * input, - int il) { - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = d_inner / num_v_heads; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - if (model.layers[il].wqkv) { - ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); - qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); - cb(qkv_mixed, "linear_attn_qkv_mixed", il); - - ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); - cb(z, "z", il); - - return { qkv_mixed, z }; - - } - // legacy path for combined in_proj_qkvz - ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input); - cb(mixed_qkvz, "linear_attn_mixed_qkvz", il); - - int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); - ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); - - int64_t split_sizes_qkvz[4] = { - head_k_dim, - head_k_dim, - head_v_dim * num_v_heads / num_k_heads, - head_v_dim * num_v_heads / num_k_heads - }; - - ggml_tensor * query = - ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0); - cb(query, "q", il); - - ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped)); - cb(key, "k", il); - - ggml_tensor * value = - ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped)); - cb(value, "v", il); - - ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped)); - z = ggml_cont(ctx0, z); - cb(z, "z", il); - - ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); - cb(query_flat, "query_flat", il); - - ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); - cb(key_flat, "key_flat", il); - - ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); - cb(value_flat, "value_flat", il); - - ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0); - qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 
0); - cb(qkv_mixed, "qkv_mixed", il); - - return { qkv_mixed, z }; -} - -ggml_tensor * llm_build_qwen3_5::build_layer_attn_linear( - llm_graph_input_rs * inp, - ggml_tensor * cur, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il) { - const auto * mctx_cur = inp->mctx; - - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = d_inner / num_v_heads; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - const auto kv_head = mctx_cur->get_head(); - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs()); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - auto qkvz = build_qkvz(cur, il); - ggml_tensor * qkv_mixed = qkvz.first; - ggml_tensor * z = qkvz.second; - - ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); - cb(mixed_ba, "linear_attn_mixed_ba", il); - - int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; - ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); - - int64_t split_sizes_ba[2] = { - num_v_heads / num_k_heads, - num_v_heads / num_k_heads - }; - - ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs, - mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0); - cb(b, "b", il); - - ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs, - mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], - split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); - cb(a, "a", il); - - ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); - - ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); - - ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); - ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); - cb(alpha_softplus, "a_softplus", il); - ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); - cb(gate, "gate", il); - - ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); - ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - - ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); - cb(conv_states, "conv_states", il); - - ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; - const int64_t conv_kernel_size = conv_kernel->ne[0]; - const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; - conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); - cb(conv_states, "conv_states_reshaped", il); - - qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3); - cb(qkv_mixed, "qkv_mixed_permuted", il); - - ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); - cb(conv_input, "conv_input", il); - - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); - - ggml_tensor * state_update_target = - ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs, - kv_head * (conv_kernel_size - 1) * conv_channels * 
ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); - cb(conv_states_all, "conv_states_updated", il); - - ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); - cb(conv_output_proper, "conv_output_raw", il); - - ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); - cb(conv_output_silu, "conv_output_silu", il); - - ggml_tensor * conv_qkv_mix = conv_output_silu; - - int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; - int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); - - ggml_tensor * q_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); - cb(q_conv, "q_conv", il); - ggml_tensor * k_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, - head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); - cb(k_conv, "k_conv", il); - ggml_tensor * v_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, - 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); - cb(v_conv, "v_conv", il); - - q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); - k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); - v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); - - ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); - cb(state, "state_predelta", il); - - if (num_k_heads != num_v_heads) { - GGML_ASSERT(num_v_heads % num_k_heads == 0); - int64_t repeat_factor = num_v_heads / num_k_heads; - - ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); - ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); - - ggml_tensor * q_repeated = - ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); - ggml_tensor * k_repeated = - ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); - - q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); - k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); - } - - cb(q_conv, "q_conv_predelta", il); - cb(k_conv, "k_conv_predelta", il); - cb(v_conv, "v_conv_predelta", il); - - std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, - gate, beta, state, causal_mask, identity, diag_mask, - il, CHUNK_SIZE, hparams.f_norm_rms_eps); - - ggml_tensor * output = attn_out.first; - ggml_tensor * new_state = attn_out.second; - cb(output, "attn_output", il); - cb(new_state, "new_state", il); - - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, new_state, - ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, - kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); - - ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); - - ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); - - ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); - - 
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); - cb(final_output, "final_output", il); - - cur = build_lora_mm(model.layers[il].ssm_out, final_output); - cb(cur, "linear_attn_out", il); - - cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); - return cur; -} - -ggml_tensor * llm_build_qwen3_5::build_layer_ffn(ggml_tensor * cur, const int il) { - // Qwen3.5 Dense always uses dense FFN - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - return cur; -} diff --git a/src/models/qwen3-5moe.cpp b/src/models/qwen3-5moe.cpp deleted file mode 100644 index a488443218..0000000000 --- a/src/models/qwen3-5moe.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "models.h" - -llm_build_qwen3_5_moe::llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params) : - llm_build_qwen3_5(model, params, defer_graph_build_t{}) { - build_graph(); -} - -ggml_tensor * llm_build_qwen3_5_moe::build_layer_ffn(ggml_tensor * cur, const int il) { - // Check if this is an MoE layer - if (model.layers[il].ffn_gate_inp != nullptr) { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); - cb(moe_out, "ffn_moe_out", il); - - // Add shared experts if present - if (model.layers[il].ffn_up_shexp != nullptr) { - ggml_tensor * ffn_shexp = - build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "ffn_shexp", il); - - // Apply shared expert gating (sigmoid) - ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); - cb(shared_gate, "shared_expert_gate", il); - - shared_gate = ggml_sigmoid(ctx0, shared_gate); - cb(shared_gate, "shared_expert_gate_sigmoid", il); - - ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); - cb(ffn_shexp, "ffn_shexp_gated", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } else { - cur = moe_out; - } - } else { - // Dense FFN branch (fallback) - cur = llm_build_qwen3_5::build_layer_ffn(cur, il); - } - return cur; -} diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 0335f5ab76..99b1a76a48 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -1,9 +1,10 @@ +#include "ggml.h" #include "models.h" #define CHUNK_SIZE 64 llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_delta(params), model(model) { + llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; @@ -85,6 +86,362 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_build_forward_expand(gf, cur); } +// utility to get one slice from the third dimension +// input dim: [x, y, c, b] +// output dim: [x, y, 1, b] +static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) { + return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c); +} + +std::pair llm_build_qwen3next::build_delta_net_chunking( + 
ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il) { + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(g, "g_perm", il); + cb(state, "state_in", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + // Do padding + const int64_t chunk_size = CHUNK_SIZE; + + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + g = ggml_pad(ctx0, g, pad, 0, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(g, "g_pad", il); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); + + q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); + k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); + k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); + v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); + v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); + + g = ggml_reshape_4d(ctx0, g, chunk_size, 1, 
n_chunks, H_k * n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); + + ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); + cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); + + ggml_tensor * gcs_j_broadcast = + ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); + + ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); + cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); + + ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); + ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); + cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); + attn = ggml_mul(ctx0, lin_solve, causal_mask); + attn = ggml_add(ctx0, attn, identity); + cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); + + ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum)); + ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); + + ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); + cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * k_cumdecay = + ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); + cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); + attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); + attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); + cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + + // vectorized calculation of key_gdiff + // improved from the chunked version: + // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1) + // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp() + // key_gdiff = key * g_diff.unsqueeze(-1) + // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new + // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew + + // get last element in g_cumsum along chunk_size dimension (ne0) + // example: [[x, y, z, ..., last], ...] -> [[last], ...] 
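+    // (in equation form, the per-chunk state update assembled below is
+    //    S_new = exp(g_last) * S_old + (K .* exp(g_last - g_cum))^T @ V_new
+    //  with g_cum the within-chunk cumulative sum of the log-gates and
+    //  g_last its final entry; the clamping from the reference is omitted here)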
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); + g_last = ggml_cont(ctx0, g_last); + cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last); + cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); + cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); + ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, + 1, chunk_size, n_chunks, g_diff_exp->ne[3]); + + ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); + cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); + cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) + + + // state to be updated per chunk + ggml_tensor * new_state = state; // ggml_dup(ctx0, state); + cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs) + + // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs) + ggml_tensor * core_attn_out = nullptr; + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + // shape: (S_k, chunk_size, 1, H_k * n_seqs) + ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul + + // shape: (S_v, chunk_size, 1, H_v * n_seqs) + ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat + + // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul + + // shape: (chunk_size, 1, H_v * n_seqs) + ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat + + // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) + // replaced by precomputed attn_kq + ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); + cb(attn_chunk, "attn_chunk", il); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); + + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs) + + // v_new = v_i - v_prime + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + cb(v_new, "v_new_chunk", il); + + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); + cb(attn_inter, "attn_inter_chunk", il); + + // core_attn_out[:, :, i] = attn_inter + attn @ v_new + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); + cb(v_attn, "v_attn_chunk", il); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs) + + core_attn_out = core_attn_out == nullptr + ? 
core_attn_out_chunk + : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); + + // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new + ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk); + //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why? + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t); + + // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew + ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + // truncate padded tokens + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + cb(output_tokens, "output_tokens", il); + + // permute back to (S_v, H_v, n_tokens, n_seqs) + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); + + return {output_tokens, new_state}; +} + +std::pair llm_build_qwen3next::build_delta_net_autoregressive( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il) { + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); + + // Apply exponential to g_t + g_t = ggml_exp(ctx0, g_t); + + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * g_t + state = ggml_mul(ctx0, state, g_t); + + // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs); + ggml_tensor * kv_mem = ggml_mul(ctx0, state, 
k_t_unsqueezed); + // we need to sum over dim=-2, so we transpose, sum, then transpose again + kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem)))); + + // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v) + ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + // delta = (v_t - kv_mem) * beta_t + ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs] + ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t); + + // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta + ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta); + state = ggml_add(ctx0, state, k_t_delta); + + // Compute the attention output + // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t + ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed); + // again, since it's over dim = -2, transpose, sum, transpose back + ggml_tensor * core_attn_out = + ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q)))); + + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); + cb(state, "new_state", il); + + return {core_attn_out, state}; +} + ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * input, ggml_tensor * weights, @@ -395,7 +752,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); cb(state, "state_predelta", il); // if head keys and value keys are different, repeat to force tensors into matching shapes @@ -424,10 +781,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, - gate, beta, state, causal_mask, identity, diag_mask, - il, CHUNK_SIZE, hparams.f_norm_rms_eps); - + // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens + std::pair attn_out; // pair of (output, new_state) + if (n_seq_tokens == 1) { + attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); + } else { + attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); + } ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; cb(output, "attn_output", il); From 81ddc60cb3b980a4503a9a0177b079dfa562c60e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Feb 2026 15:09:30 +0200 Subject: [PATCH 13/17] ci : add metal server workflows (#19293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci : add metal server workflows * cont : try fix python init * cont : move to a separate workflow that runs only on master * cont : fix num jobs Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- .github/workflows/server-metal.yml | 73 ++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create 
mode 100644 .github/workflows/server-metal.yml diff --git a/.github/workflows/server-metal.yml b/.github/workflows/server-metal.yml new file mode 100644 index 0000000000..1d707bef44 --- /dev/null +++ b/.github/workflows/server-metal.yml @@ -0,0 +1,73 @@ +name: Server-Metal + +on: + workflow_dispatch: # allows manual triggering + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + slow_tests: + description: 'Run slow tests' + required: true + type: boolean + push: + branches: + - master + paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] + +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + server-metal: + runs-on: [self-hosted, macOS, ARM64] + + name: server-metal (${{ matrix.wf_name }}) + strategy: + matrix: + build_type: [Release] + wf_name: ["GPUx1"] + include: + - build_type: Release + extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx1, backend-sampling" + - build_type: Release + extra_args: "GGML_METAL_DEVICES=2" + wf_name: "GPUx2" + - build_type: Release + extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx2, backend-sampling" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_SCHED_NO_REALLOC=ON + cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" From 292f6908cdc6abb5c38581e34fa141973e5aba82 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann <59577610+srogmann@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:30:50 +0100 Subject: [PATCH 14/17] spec : remove check rate (#19377) * spec: remove parameter spec-ngram-check-rate * spec : renamed statistics vars * spec : add n_call_begin, n_call_accept * spec : don't enable key-map-stats --- common/arg.cpp | 10 ------- common/common.h | 1 - common/ngram-map.cpp | 7 ++--- common/ngram-map.h | 8 ++---- common/speculative.cpp | 55 ++++++++++++++++-------------------- docs/speculative.md | 13 ++++----- tools/server/server-task.cpp | 4 --- 7 files changed, 36 insertions(+), 62 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..9c85696ebd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3437,16 +3437,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.ngram_size_m = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--spec-ngram-check-rate"}, "N", - string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate), - [](common_params & params, int value) { - if (value < 1) { - throw std::invalid_argument("ngram check rate must be at least 1"); - } - params.speculative.ngram_check_rate 
= value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-min-hits"}, "N", string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), diff --git a/common/common.h b/common/common.h index 398ebb0960..b284244530 100644 --- a/common/common.h +++ b/common/common.h @@ -269,7 +269,6 @@ struct common_params_speculative { uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::shared_ptr ngram_mod; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index c5b8fc75ed..2b876a6e99 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -231,10 +231,9 @@ void common_ngram_map_draft(common_ngram_map & map, GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len); } - // Only check every check_rate tokens to save compute - // i.e., perform check if (cur_len - idx_last_check) >= check_rate - if (map.idx_last_check + map.check_rate > cur_len) { - return; + if (map.idx_last_check > cur_len) { + // Should not happen because of common_ngram_map_begin(). + GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len); } map.idx_last_check = cur_len; diff --git a/common/ngram-map.h b/common/ngram-map.h index 9668bd5a7c..41b9530449 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -24,7 +24,6 @@ struct common_ngram_simple_config { uint16_t size_ngram; // size of n-grams to lookup in self-mode uint16_t size_mgram; // size of m-grams to draft in self-mode - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; // Searches for a n-gram in the history and checks whether a draft sequence should be generated. @@ -66,15 +65,14 @@ struct common_ngram_map { bool key_only; // true if only key n-grams are used, no values. std::vector keys; // key n-grams which occur several times in token-history - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft - bool show_key_map_stats = false; // true, if statitics of the key_map should be printed. + bool show_key_map_stats = false; // true, if statistics of the key_map should be printed. common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, - uint16_t check_rate, uint16_t min_hits) + uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), - check_rate(check_rate), min_hits(min_hits) { + min_hits(min_hits) { key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used } diff --git a/common/speculative.cpp b/common/speculative.cpp index 84d2556ceb..3e68c38e49 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -113,13 +113,14 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; - // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens - // TODO: add n_call_begin, n_call_accept - size_t drafts_call_count = 0; // number of times this implementation was called. - size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. 
- size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. - size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation. - size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model. + size_t n_call_begin = 0; // number of times this implementation was called for refresh. + size_t n_call_draft = 0; // number of times this implementation was called for generation. + size_t n_call_accept = 0; // number of times this implementation was called for accumulation. + + size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation. + size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model. + size_t n_gen_tokens = 0; // number of tokens generated by this implementation. + size_t n_acc_tokens = 0; // number of tokens accepted by the target model. // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. @@ -465,8 +466,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state { struct common_speculative_state_ngram_simple : public common_speculative_state { common_ngram_simple_config config; - uint16_t check_id = 0; // used to control the frequency of generating drafts - common_speculative_state_ngram_simple( enum common_speculative_type type, common_ngram_simple_config config) @@ -481,11 +480,6 @@ struct common_speculative_state_ngram_simple : public common_speculative_state { const llama_tokens & prompt_tgt, llama_token id_last, llama_tokens & result) override { - ++check_id; - if (check_id < config.check_rate) { - return; - } - check_id = 0; result = common_ngram_simple_draft(config, prompt_tgt, id_last); GGML_UNUSED(params); @@ -752,10 +746,9 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c uint16_t size_key = config.params.ngram_size_n; uint16_t size_value = config.params.ngram_size_m; bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); - uint16_t check_rate = config.params.ngram_check_rate; uint16_t min_hits = config.params.ngram_min_hits; - return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); + return common_ngram_map(size_key, size_value, key_only, min_hits); } static common_speculative_state_ngram_cache create_state_ngram_cache( @@ -931,12 +924,10 @@ common_speculative * common_speculative_init( uint16_t ngram_size_key = ngram_map.size_key; uint16_t mgram_size_value = ngram_map.size_value; - uint16_t check_rate = ngram_map.check_rate; auto config_simple = common_ngram_simple_config { /* .size_ngram = */ ngram_size_key, - /* .size_mgram = */ mgram_size_value, - /* .check_rate = */ check_rate + /* .size_mgram = */ mgram_size_value }; auto state = std::make_unique( /* .type = */ config.type, @@ -997,6 +988,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr for (auto & impl : spec->impls) { common_time_meas tm(impl->t_begin_us, !impl->gen_perf); impl->begin(prompt); + impl->n_call_begin++; } } @@ -1013,17 +1005,17 @@ llama_tokens common_speculative_draft( { common_time_meas tm(impl->t_draft_us, !impl->gen_perf); impl->draft(params, prompt_tgt, id_last, result); - impl->drafts_call_count++; + impl->n_call_draft++; } if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), - impl.get()->drafts_call_count, 
result.size()); + impl.get()->n_call_draft, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats - impl->drafts_generated_count++; - impl->drafts_generated_tokens += result.size(); + impl->n_gen_drafts++; + impl->n_gen_tokens += result.size(); break; // We have a draft, so break out of the loop and return it. } @@ -1044,11 +1036,12 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) { { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { - impl->drafts_accepted_count++; - impl->drafts_accepted_tokens += n_accepted; + impl->n_acc_drafts++; + impl->n_acc_tokens += n_accepted; } impl->accept(n_accepted); + impl->n_call_accept++; } } @@ -1069,13 +1062,13 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), - impl->drafts_call_count, - impl->drafts_generated_count, - impl->drafts_accepted_count, - impl->drafts_generated_tokens, - impl->drafts_accepted_tokens, + impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, + impl->n_gen_drafts, + impl->n_acc_drafts, + impl->n_gen_tokens, + impl->n_acc_tokens, str_perf.c_str()); } } diff --git a/docs/speculative.md b/docs/speculative.md index 03afab5b41..29da332875 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -119,8 +119,6 @@ If a draft model is combined with a draftless decoding the draftless decoding ha of lookup n-gram (default: 12) --spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) ---spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding - (default: 1) --spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) ``` @@ -153,10 +151,6 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding. The m-gram size determines how many tokens to draft when a match is found. Larger values can provide more speedup but may reduce acceptance rate. -### `--spec-ngram-check-rate R` - -This option aims at performance if the n-gram lookup in history is to costly. A lookup will be executed at every R tokens (default is 1, every token). - ### `--spec-ngram-min-hits H` This option defines how often a key has to appear in the token history to be used as a draft (default is 1). 
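
To make the lookup behind these options concrete: the draftless decoder treats the last N generated tokens as a key, counts how often that key already occurred in the context, and proposes the M tokens that followed its most recent earlier occurrence once the count reaches the min-hits threshold. The sketch below is an illustration of that idea only; the real `common_ngram_map` hashes keys into a fixed-size map rather than scanning linearly, and the helper name here is invented.

```
// Illustration only: not the actual common_ngram_map implementation.
#include <algorithm>
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Propose up to size_m tokens if the trailing size_n-gram of `history`
// has at least min_hits earlier occurrences; otherwise return nothing.
static std::vector<llama_token> draft_from_history(
        const std::vector<llama_token> & history,
        size_t size_n, size_t size_m, size_t min_hits) {
    if (history.size() <= size_n) {
        return {};
    }
    const auto key_begin = history.end() - size_n;

    size_t hits     = 0;
    size_t cont_pos = 0; // position right after the most recent earlier match
    for (size_t i = 0; i + size_n < history.size(); ++i) {
        if (std::equal(key_begin, history.end(), history.begin() + i)) {
            hits++;
            cont_pos = i + size_n;
        }
    }
    if (hits < min_hits) {
        return {};
    }
    const size_t n_draft = std::min(size_m, history.size() - cont_pos);
    return std::vector<llama_token>(history.begin() + cont_pos,
                                    history.begin() + cont_pos + n_draft);
}
```

With the defaults (N = 12, M = 48, min hits = 1), a single earlier repetition of the last 12 tokens is enough to propose up to 48 draft tokens.
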
@@ -175,7 +169,12 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated) statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms ``` -- `#calls`: number of calls of this implementations +``` +statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms +``` + + +- `#calls(b,g,a)`: number of calls of begin (new prompt), generation and accumulation of this implementations - `#gen drafts`: number of drafts generated by this implementation - `#acc drafts`: number of drafts accepted (partially) by the main model - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..a137427c69 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -80,7 +80,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -144,7 +143,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -257,12 +255,10 @@ task_params server_task::params_from_json_cmpl( params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); - params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate); params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024); params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024); - params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024); params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024); // Use OpenAI API logprobs only if n_probs wasn't provided From 820ebfa6f45347c77e9ddfcc670e44109d6df43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=ED=9D=AC=EC=A4=80?= Date: Tue, 10 Feb 2026 00:22:57 +0900 Subject: [PATCH 15/17] Server: log when converting requests to chat completions format (#19457) * Log converting requests * Print as debug instead of info [no ci] --------- Co-authored-by: openingnow <> --- tools/server/server-context.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 8ec8451339..ceafcac179 100644 --- a/tools/server/server-context.cpp +++ 
b/tools/server/server-context.cpp @@ -3584,6 +3584,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_responses_to_chatcmpl(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: OpenAI Responses -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, @@ -3600,6 +3602,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_anthropic_to_oai(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, @@ -3616,6 +3620,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_anthropic_to_oai(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, From 262364e31d1da43596fe84244fba44e94a0de64e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Mon, 9 Feb 2026 17:30:32 +0100 Subject: [PATCH 16/17] mtmd: Implement tiling for LFM2-VL (#19454) --- tools/mtmd/clip.cpp | 136 ++++++++++++++++++++++++++++++++++++++++++-- tools/mtmd/mtmd.cpp | 19 ++++++- 2 files changed, 147 insertions(+), 8 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9fa5afc390..614fe66fde 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,7 @@ #include "ggml-backend.h" #include "gguf.h" +#include #include #include #include @@ -1116,9 +1117,8 @@ struct clip_model_loader { case PROJECTOR_TYPE_LFM2: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); - // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json - // config above specifies number of tokens after downsampling, while here it is before, relax lowerbound to 64 - hparams.set_limit_image_tokens(64, 1024); + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + hparams.set_limit_image_tokens(64, 256); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: @@ -2807,6 +2807,119 @@ private: } }; +// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. 
grid layout) +struct lfm2_vl_image_processor { + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 10; + static constexpr float max_pixels_tolerance = 2.0f; + static constexpr int tile_size = 512; + + static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + llava_uhd::slice_instructions inst; + const auto & params = ctx->model.hparams; + const int align_size = params.patch_size * params.n_merge; + + inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; + inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels); + + // tile if either dimension exceeds tile_size with tolerance + const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; + + if (!needs_tiling) { + inst.refined_size = clip_image_size{0, 0}; + inst.grid_size = clip_image_size{0, 0}; + return inst; + } + + const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); + + inst.grid_size = grid; + inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, + original_size.width, original_size.height, + inst.overview_size.width, inst.overview_size.height, + inst.refined_size.width, inst.refined_size.height, + grid.width, grid.height); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + llava_uhd::slice_coordinates slice; + slice.x = col * tile_size; + slice.y = row * tile_size; + slice.size = clip_image_size{tile_size, tile_size}; + inst.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", + __func__, (int)inst.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + return inst; + } + +private: + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; + } + + static std::vector get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h >= min_tiles && w * h <= max_tiles) { + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({w, h}); + } + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + 
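+        // with min_tiles = 2 and max_tiles = 10 this yields every (w, h) grid with
+        // 2 <= w*h <= 10 (1x2, 2x1, 1x3, 3x1, 2x2, ...), ordered by total tile count;
+        // ties between grids with the same tile count are left in unspecified order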
return ratios; + } + + static clip_image_size get_grid_layout(int height, int width) { + const float aspect_ratio = static_cast(width) / height; + const auto ratios = get_target_ratios(); + return find_closest_aspect_ratio(aspect_ratio, ratios, width, height); + } +}; + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { @@ -3021,6 +3134,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } break; case PROJECTOR_TYPE_LFM2: + { + auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + case PROJECTOR_TYPE_KIMIVL: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); @@ -3032,8 +3159,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2); - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d037e834f3..b7636279cb 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -85,6 +85,7 @@ enum mtmd_slice_tmpl { MTMD_SLICE_TMPL_MINICPMV_2_6, MTMD_SLICE_TMPL_LLAMA4, MTMD_SLICE_TMPL_IDEFICS3, + MTMD_SLICE_TMPL_LFM2, }; const char * mtmd_default_marker() { @@ -307,9 +308,19 @@ struct mtmd_context { img_end = "<|im_end|>"; } else if (proj == PROJECTOR_TYPE_LFM2) { - img_beg = "<|image_start|>"; - img_end = "<|image_end|>"; - + // multi-tile: + // <|image_start|> + // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ... 
+ // <|img_thumbnail|> (thumbnail) + // <|image_end|> + // single-tile: + // <|image_start|> (image) <|image_end|> + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + slice_tmpl = MTMD_SLICE_TMPL_LFM2; + sli_img_start_tmpl = "<|img_row_%d_col_%d|>"; + tok_ov_img_start = {lookup_token("<|img_thumbnail|>")}; + ov_img_first = false; } else if (proj == PROJECTOR_TYPE_GLM4V) { img_beg = "<|begin_of_image|>"; img_end = "<|end_of_image|>"; @@ -562,11 +573,13 @@ struct mtmd_tokenizer { } // handle llava-uhd style preprocessing + const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0; if ( ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 + || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) ) { const int n_col = batch_f32.grid_x; const int n_row = batch_f32.grid_y; From 98e57ca422c5adb33663a6406c2f2d5b7d255da7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 9 Feb 2026 22:14:12 +0100 Subject: [PATCH 17/17] chat: fix case where template accepts type content only (#19419) * chat: fix case where template accepts type content only * rm stray log * reuse render_message_to_json --- common/chat.cpp | 39 +++++++++++++++++++++++++++++++++++---- common/chat.h | 2 ++ common/jinja/caps.cpp | 13 +++++++++---- common/jinja/caps.h | 4 +++- common/jinja/runtime.cpp | 6 ++++++ 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 2bf4632669..47a34d5822 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -380,15 +380,46 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa return msgs; } -json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { +static json render_message_to_json(const std::vector & msgs, const jinja::caps & c) { + if (!c.supports_string_content && !c.supports_typed_content) { + LOG_WRN("%s: Neither string content nor typed content is supported by the template. 
This is unexpected and may lead to issues.\n", __func__); + } + + bool only_string_accepted = c.supports_string_content && !c.supports_typed_content; + bool only_typed_accepted = !c.supports_string_content && c.supports_typed_content; + json messages = json::array(); for (const auto & msg : msgs) { - json jmsg = msg.to_json_oaicompat(concat_typed_text); - messages.push_back(jmsg); + if (only_string_accepted) { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ true); + messages.push_back(jmsg); + } else if (only_typed_accepted) { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false); + if (jmsg.at("content").is_string()) { + jmsg["content"] = json::array({ + json{ + {"type", "text"}, + {"text", jmsg.at("content").get()}, + } + }); + } + messages.push_back(jmsg); + } else { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false); + messages.push_back(jmsg); + } } return messages; } +// DEPRECATED: only used in tests +json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { + jinja::caps c; + c.supports_string_content = true; + c.supports_typed_content = !concat_typed_text; + return render_message_to_json(msgs, c); +} + std::vector common_chat_tools_parse_oaicompat(const json & tools) { std::vector result; @@ -3020,7 +3051,7 @@ static common_chat_params common_chat_templates_apply_jinja( : *tmpls->template_default; const auto & src = tmpl.source(); const auto & caps = tmpl.original_caps(); - params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); + params.messages = render_message_to_json(inputs.messages, tmpl.original_caps()); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; params.reasoning_format = inputs.reasoning_format; diff --git a/common/chat.h b/common/chat.h index 24aa4aab5c..1bf43f7261 100644 --- a/common/chat.h +++ b/common/chat.h @@ -240,6 +240,8 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates * // Parses a JSON array of messages in OpenAI's chat completion API format. 
std::vector common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages); + +// DEPRECATED: only used in tests nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text = false); std::vector common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools); diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp index f27490f1fb..dbaaed500a 100644 --- a/common/jinja/caps.cpp +++ b/common/jinja/caps.cpp @@ -63,7 +63,8 @@ static void caps_print_stats(value & v, const std::string & path) { std::map caps::to_map() const { return { - {"requires_typed_content", requires_typed_content}, + {"supports_string_content", supports_string_content}, + {"supports_typed_content", supports_typed_content}, {"supports_tools", supports_tools}, {"supports_tool_calls", supports_tool_calls}, {"supports_parallel_tool_calls", supports_parallel_tool_calls}, @@ -89,7 +90,7 @@ caps caps_get(jinja::program & prog) { return v->stats.ops.find(op_name) != v->stats.ops.end(); }; - // case: typed content requirement + // case: typed content support caps_try_execute( prog, [&]() { @@ -105,12 +106,16 @@ caps caps_get(jinja::program & prog) { // tools return json{nullptr}; }, - [&](bool, value & messages, value &) { + [&](bool success, value & messages, value &) { auto & content = messages->at(0)->at("content"); caps_print_stats(content, "messages[0].content"); if (has_op(content, "selectattr") || has_op(content, "array_access")) { // accessed as an array - result.requires_typed_content = true; + result.supports_typed_content = true; + } + if (!success) { + // failed to execute with content as string + result.supports_string_content = false; } } ); diff --git a/common/jinja/caps.h b/common/jinja/caps.h index 77df117baa..e694e7bfaa 100644 --- a/common/jinja/caps.h +++ b/common/jinja/caps.h @@ -14,7 +14,9 @@ struct caps { bool supports_parallel_tool_calls = true; bool supports_preserve_reasoning = false; // support assistant message with reasoning_content - bool requires_typed_content = false; // default: use string content + // one of the 2 content capabilities must be true + bool supports_string_content = true; + bool supports_typed_content = false; // for reporting on server std::map to_map() const; diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index 4453d86e6d..cc012c892f 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -446,6 +446,12 @@ value for_statement::execute_impl(context & ctx) { value iterable_val = iter_expr->execute(scope); + // mark the variable being iterated as used for stats + if (ctx.is_get_stats) { + iterable_val->stats.used = true; + iterable_val->stats.ops.insert("array_access"); + } + if (iterable_val->is_undefined()) { JJ_DEBUG("%s", "For loop iterable is undefined, skipping loop"); iterable_val = mk_val();
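
To make the two capability flags concrete, here is a standalone sketch (plain nlohmann::json, not the `common_chat_msg` helpers) of the two message shapes that `render_message_to_json` chooses between, depending on what the probed template accepts:

```
// Standalone sketch; assumes the vendored nlohmann::json, not common_chat_msg.
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // accepted when supports_string_content is set
    json msg_string = {
        {"role",    "user"},
        {"content", "What is in this image?"}
    };

    // accepted when supports_typed_content is set; render_message_to_json
    // wraps a plain string like the one above into this shape when the
    // template only iterates over content parts
    json msg_typed = {
        {"role",    "user"},
        {"content", json::array({
            {{"type", "text"}, {"text", "What is in this image?"}}
        })}
    };

    (void) msg_string;
    (void) msg_typed;
    return 0;
}
```

The detection itself renders a probe conversation with string content: if the template applies `selectattr` to `messages[0].content` or iterates over it (now recorded as `array_access` by the for-loop change in runtime.cpp), typed content is marked as supported, and a failed render clears `supports_string_content`.
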