From 34ba7b5a2f5cd88f99629a3bd68d003fbd5bc2cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Feb 2026 07:37:15 +0200 Subject: [PATCH 01/17] metal : fix event synchronization in cpy_tensor_async (#19402) --- ggml/src/ggml-metal/ggml-metal-context.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index c7e8ebd3f3..5d3a8ce412 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -394,7 +394,7 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con [encoder endEncoding]; ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src); - ggml_metal_event_record(ctx_src, ev_cpy); + ggml_metal_event_encode_signal(ev_cpy, cmd_buf); [cmd_buf commit]; From 8872ad2125336d209a9911a82101f80095a9831d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Feb 2026 10:35:56 +0200 Subject: [PATCH 02/17] metal : consolidate bin kernels (#19390) * metal : refactor bin kernels * cont * cont : fix cv --- ggml/src/ggml-metal/ggml-metal-device.cpp | 80 ++++- ggml/src/ggml-metal/ggml-metal-device.h | 6 +- ggml/src/ggml-metal/ggml-metal-device.m | 10 +- ggml/src/ggml-metal/ggml-metal-impl.h | 1 + ggml/src/ggml-metal/ggml-metal-ops.cpp | 33 +- ggml/src/ggml-metal/ggml-metal.metal | 389 ++++++++-------------- 6 files changed, 224 insertions(+), 295 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 6af0dd88d5..4c4c3ce36c 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1392,34 +1392,78 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_v GGML_UNUSED(op); } -ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin( - ggml_metal_library_t lib, - ggml_op op, - int32_t n_fuse, - bool row) { +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) { char base[256]; char name[256]; - const char * op_str = "undefined"; - switch (op) { - case GGML_OP_ADD: op_str = "add"; break; - case GGML_OP_SUB: op_str = "sub"; break; - case GGML_OP_MUL: op_str = "mul"; break; - case GGML_OP_DIV: op_str = "div"; break; + int op_num = -1; + + switch (op->op) { + case GGML_OP_ADD: op_num = 0; break; + case GGML_OP_SUB: op_num = 1; break; + case GGML_OP_MUL: op_num = 2; break; + case GGML_OP_DIV: op_num = 3; break; default: GGML_ABORT("fatal error"); }; - if (row) { - snprintf(base, 256, "kernel_%s_row_c4_fuse_%d", op_str, n_fuse); - } else { - snprintf(base, 256, "kernel_%s_fuse_%d", op_str, n_fuse); - } + const char * t0_str = ggml_type_name(op->src[0]->type); + const char * t1_str = ggml_type_name(op->src[1]->type); + const char * t_str = ggml_type_name(op->type); - snprintf(name, 256, "%s", base); + const bool is_c4 = (op->src[0]->ne[0] % 4 == 0) && (op->src[1]->ne[0] % 4 == 0); + + const bool is_rb = ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]) && (ggml_nrows(op->src[1]) == 1) && ggml_nelements(op) < 65536; + + snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s%s", t0_str, t1_str, t_str, is_c4 ? 
"_4" : ""); + snprintf(name, 256, "%s_op=%d_nf=%d_rb=%d", base, op_num, n_fuse, is_rb); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); if (!res.pipeline) { - res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0); + ggml_metal_cv_set_int16(cv, n_fuse, FC_BIN + 1); + ggml_metal_cv_set_bool (cv, is_rb, FC_BIN + 2); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.c4 = is_c4; + res.cnt = is_rb; + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one(ggml_metal_library_t lib, ggml_op op) { + char base[256]; + char name[256]; + + int op_num = -1; + + switch (op) { + case GGML_OP_ADD: op_num = 0; break; + case GGML_OP_SUB: op_num = 1; break; + case GGML_OP_MUL: op_num = 2; break; + case GGML_OP_DIV: op_num = 3; break; + default: GGML_ABORT("fatal error"); + }; + + snprintf(base, 256, "kernel_bin_fuse_%s_%s_%s", "f32", "f32", "f32"); + snprintf(name, 256, "%s_op=%d_nf=%d", base, op_num, 1); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, op_num, FC_BIN + 0); + ggml_metal_cv_set_int16(cv, 1, FC_BIN + 1); + ggml_metal_cv_set_bool (cv, false, FC_BIN + 2); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); } return res; diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 84dcec3083..93d7f6a216 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -53,6 +53,9 @@ struct ggml_metal_pipeline_with_params { int nr1; size_t smem; + + bool c4; + bool cnt; }; int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline); @@ -134,7 +137,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op); -struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse ); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin_one (ggml_metal_library_t lib, enum ggml_op op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index c8e737d418..891d70c85a 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ 
b/ggml/src/ggml-metal/ggml-metal-device.m @@ -346,10 +346,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta struct ggml_metal_pipeline_with_params res = { /*.pipeline =*/ nil, + /*.nsg =*/ 0, /*.nr0 =*/ 0, /*.nr1 =*/ 0, - /*.nsg =*/ 0, /*.smem =*/ 0, + /*.c4 =*/ false, + /*.cnt =*/ false, }; res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name); @@ -362,10 +364,12 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_meta struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) { struct ggml_metal_pipeline_with_params res = { /*.pipeline =*/ nil, + /*.nsg =*/ 0, /*.nr0 =*/ 0, /*.nr1 =*/ 0, - /*.nsg =*/ 0, /*.smem =*/ 0, + /*.c4 =*/ false, + /*.cnt =*/ false, }; [lib->lock lock]; @@ -1054,7 +1058,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_ADD_ID: - return op->src[0]->type == GGML_TYPE_F32; + return ggml_is_contiguous_rows(op->src[0]) && ggml_is_contiguous_rows(op->src[1]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ACC: case GGML_OP_REPEAT: case GGML_OP_SCALE: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 7f73cb97bb..77bb403c15 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -80,6 +80,7 @@ #define FC_SSM_CONV 900 #define FC_SOLVE_TRI 1000 #define FC_COUNT_EQUAL 1100 +#define FC_BIN 1200 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPSG 8 diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e0ed6c7805..dbf25433c2 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -707,7 +707,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) { /*.o1 =*/ { 0 }, }; - auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false); + auto pipeline = ggml_metal_library_get_pipeline_bin_one(lib, GGML_OP_ADD); ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); @@ -2895,8 +2895,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { GGML_ASSERT(ggml_is_contiguous_rows(op->src[0])); GGML_ASSERT(ggml_is_contiguous_rows(op->src[1])); - bool bcast_row = false; - ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]); ggml_metal_buffer_id bid_src1 = ggml_metal_get_buffer_id(op->src[1]); ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op); @@ -2990,18 +2988,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { struct ggml_metal_pipeline_with_params pipeline; - if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) { - GGML_ASSERT(ggml_is_contiguous(op->src[0])); - - // src1 is a row - GGML_ASSERT(ne11 == 1); - - pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, true); - - bcast_row = true; - } else { - pipeline = ggml_metal_library_get_pipeline_bin(lib, op->op, n_fuse, false); - } + pipeline = ggml_metal_library_get_pipeline_bin(lib, op, n_fuse); if (n_fuse > 1) { bid_dst = ggml_metal_get_buffer_id(ctx->node(idx + n_fuse - 1)); @@ -3015,20 +3002,28 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) { } } + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne10 = ne10/4; + args.ne0 = ne0/4; + } + ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, 
&args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, bid_src0, 1); ggml_metal_encoder_set_buffer (enc, bid_src1, 2); ggml_metal_encoder_set_buffer (enc, bid_dst, 3); - if (bcast_row) { - const int64_t n = ggml_nelements(op)/4; + if (pipeline.cnt) { + const int n = pipeline.c4 ? ggml_nelements(op)/4 : ggml_nelements(op); ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { - int nth = 32; + const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - while (16*nth < ne0 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + int nth = 1; + + while (2*nth < args.ne0 && nth < nth_max) { nth *= 2; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 612a42a1ea..35cc3bbdfd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -895,11 +895,13 @@ enum ggml_sort_order { GGML_SORT_ORDER_DESC, }; -// general-purpose kernel for addition, subtraction, multiplication and division of two tensors -// pros: works for non-contiguous tensors, supports broadcast across all dims -// cons: not very efficient -template -kernel void kernel_add_fuse_impl( +// OP: 0 - add, 1 - sub, 2 - mul, 3 - div +constant short FC_bin_op [[function_constant(FC_BIN + 0)]]; +constant short FC_bin_f [[function_constant(FC_BIN + 1)]]; +constant bool FC_bin_rb [[function_constant(FC_BIN + 2)]]; + +template +kernel void kernel_bin_fuse_impl( constant ggml_metal_kargs_bin & args, device const char * src0, device const char * src1, @@ -907,138 +909,152 @@ kernel void kernel_add_fuse_impl( uint3 tgpig[[threadgroup_position_in_grid]], ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; +#define FC_OP FC_bin_op +#define FC_F FC_bin_f +#define FC_RB FC_bin_rb - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; + if (FC_RB) { + // row broadcast + const uint i0 = tgpig.x; + const uint i1 = i0%args.ne10; - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); - device float * dst_ptr = (device float *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + device const T0 * src0_row = (device const T0 *) (src0); + device T * dst_row = (device T *) (dst); - device const float * src1_ptr[F]; - for (short j = 0; j < F; ++j) { - src1_ptr[j] = (device const float *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); - } + if (FC_F == 1) { + device const T1 * src1_row = (device const T1 *) (src1 + args.o1[0]); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; + if (FC_OP == 0) { + dst_row[i0] = src0_row[i0] + src1_row[i1]; + } - float res = src0_ptr[i0]; + if (FC_OP == 1) { + dst_row[i0] = src0_row[i0] - src1_row[i1]; + } -#pragma unroll - for (short j = 0; j < F; ++j) { - res += src1_ptr[j][i10]; - } + if (FC_OP == 2) { + dst_row[i0] = src0_row[i0] * src1_row[i1]; + } - dst_ptr[i0] = res; - } -} + if (FC_OP == 3) { + dst_row[i0] = src0_row[i0] / src1_row[i1]; + } + } else { + T0 res = src0_row[i0]; -typedef decltype(kernel_add_fuse_impl<2>) kernel_add_fuse_t; + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } -template [[host_name("kernel_add_fuse_1")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<1>; -template 
[[host_name("kernel_add_fuse_2")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<2>; -template [[host_name("kernel_add_fuse_3")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<3>; -template [[host_name("kernel_add_fuse_4")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<4>; -template [[host_name("kernel_add_fuse_5")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<5>; -template [[host_name("kernel_add_fuse_6")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<6>; -template [[host_name("kernel_add_fuse_7")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<7>; -template [[host_name("kernel_add_fuse_8")]] kernel kernel_add_fuse_t kernel_add_fuse_impl<8>; + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } -kernel void kernel_sub_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= ((device const T1 *) (src1 + args.o1[j]))[i1]; + } + } - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) - *((device float *)(src1_ptr + i10*args.nb10)); - } -} - -kernel void kernel_mul_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; - - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; - - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - if (args.ne10 == 1) { - const float x = *((device float *)(src1_ptr)); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; + dst_row[i0] = res; } } else { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * *((device float *)(src1_ptr + i10*args.nb10)); + const int i03 = tgpig.z; + const int i02 = tgpig.y; + const int i01 = tgpig.x; + + if (i01 >= args.ne01) { + return; + } + + const int i13 = i03%args.ne13; + const int i12 = i02%args.ne12; + const int i11 = i01%args.ne11; + + device const T0 * src0_ptr = (device 
const T0 *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs); + device T * dst_ptr = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs); + + if (FC_F == 1) { + device const T1 * src1_ptr = (device const T1 *) (src1 + args.o1[0] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + + if (FC_OP == 0) { + dst_ptr[i0] = src0_ptr[i0] + src1_ptr[i10]; + } + + if (FC_OP == 1) { + dst_ptr[i0] = src0_ptr[i0] - src1_ptr[i10]; + } + + if (FC_OP == 2) { + dst_ptr[i0] = src0_ptr[i0] * src1_ptr[i10]; + } + + if (FC_OP == 3) { + dst_ptr[i0] = src0_ptr[i0] / src1_ptr[i10]; + } + } + } else { + device const T1 * src1_ptr[8]; + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + src1_ptr[j] = (device const T1 *) (src1 + args.o1[j] + i13*args.nb13 + i12*args.nb12 + i11*args.nb11); + } + + for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + const int i10 = i0%args.ne10; + + T res = src0_ptr[i0]; + + if (FC_OP == 0) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res += src1_ptr[j][i10]; + } + } + + if (FC_OP == 1) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res -= src1_ptr[j][i10]; + } + } + + if (FC_OP == 2) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res *= src1_ptr[j][i10]; + } + } + + if (FC_OP == 3) { + FOR_UNROLL (short j = 0; j < FC_F; ++j) { + res /= src1_ptr[j][i10]; + } + } + + dst_ptr[i0] = res; + } } } + +#undef FC_OP +#undef FC_F +#undef FC_RB } -kernel void kernel_div_fuse_1( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig.z; - const int i02 = tgpig.y; - const int i01 = tgpig.x; +typedef decltype(kernel_bin_fuse_impl) kernel_bin_fuse_t; - const int i13 = i03%args.ne13; - const int i12 = i02%args.ne12; - const int i11 = i01%args.ne11; - - device const char * src0_ptr = src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + args.offs; - device const char * src1_ptr = src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + args.o1[0]; - device char * dst_ptr = dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1 + args.offs; - - if (args.ne10 == 1) { - const float x = 1.0f / *((device float *)(src1_ptr)); - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) * x; - } - } else { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - const int i10 = i0%args.ne10; - *((device float *)(dst_ptr + i0*args.nb0)) = *((device float *)(src0_ptr + i0*args.nb00)) / *((device float *)(src1_ptr + i10*args.nb10)); - } - } -} +template [[host_name("kernel_bin_fuse_f32_f32_f32")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; +template [[host_name("kernel_bin_fuse_f32_f32_f32_4")]] kernel kernel_bin_fuse_t kernel_bin_fuse_impl; kernel void kernel_add_id( constant ggml_metal_kargs_add_id & args, @@ -1057,7 +1073,7 @@ kernel void kernel_add_id( const size_t nb1 = args.ne0 * sizeof(float); const size_t nb2 = args.ne1 * nb1; - device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); + device float * dst_row = (device float *)((device char *)dst + i1*nb1 + i2*nb2); device const float * src0_row = (device const float *)((device char *)src0 + i1*args.nb01 + i2*args.nb02); device const float * src1_row = (device const float *)((device char *)src1 + 
i11*args.nb11); @@ -1098,141 +1114,6 @@ template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; -// assumption: src1 is a row -// broadcast src1 into src0 -template -kernel void kernel_add_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res += ((device const float4 *) (src1 + args.o1[j]))[i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_add_row_c4_fuse_impl<1>) kernel_add_row_c4_fuse_t; - -template [[host_name("kernel_add_row_c4_fuse_1")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<1>; -template [[host_name("kernel_add_row_c4_fuse_2")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<2>; -template [[host_name("kernel_add_row_c4_fuse_3")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<3>; -template [[host_name("kernel_add_row_c4_fuse_4")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<4>; -template [[host_name("kernel_add_row_c4_fuse_5")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<5>; -template [[host_name("kernel_add_row_c4_fuse_6")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<6>; -template [[host_name("kernel_add_row_c4_fuse_7")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<7>; -template [[host_name("kernel_add_row_c4_fuse_8")]] kernel kernel_add_row_c4_fuse_t kernel_add_row_c4_fuse_impl<8>; - -template -kernel void kernel_sub_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res -= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_sub_row_c4_fuse_impl<1>) kernel_sub_row_c4_fuse_t; - -template [[host_name("kernel_sub_row_c4_fuse_1")]] kernel kernel_sub_row_c4_fuse_t kernel_sub_row_c4_fuse_impl<1>; - -template -kernel void kernel_mul_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res *= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_mul_row_c4_fuse_impl<1>) kernel_mul_row_c4_fuse_t; 
- -template [[host_name("kernel_mul_row_c4_fuse_1")]] kernel kernel_mul_row_c4_fuse_t kernel_mul_row_c4_fuse_impl<1>; - -template -kernel void kernel_div_row_c4_fuse_impl( - constant ggml_metal_kargs_bin & args, - device const char * src0, - device const char * src1, - device char * dst, - uint tpig[[thread_position_in_grid]]) { - - const uint nb = args.ne00/4; - const uint i = tpig % nb; - - device const float4 * src0_row = (device const float4 *) (src0); - device float4 * dst_row = (device float4 *) (dst); - - device const float4 * src1_row[F]; - for (short j = 0; j < F; ++j) { - src1_row[j] = (device const float4 *) (src1 + args.o1[j]); - } - - float4 res = src0_row[tpig]; - -#pragma unroll(F) - for (short j = 0; j < F; ++j) { - res /= src1_row[j][i]; - } - - dst_row[tpig] = res; -} - -typedef decltype(kernel_div_row_c4_fuse_impl<1>) kernel_div_row_c4_fuse_t; - -template [[host_name("kernel_div_row_c4_fuse_1")]] kernel kernel_div_row_c4_fuse_t kernel_div_row_c4_fuse_impl<1>; - kernel void kernel_scale_f32( constant ggml_metal_kargs_scale & args, device const float * src0, From 96441c955ea45cfa2b6834cad419b849aa144463 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Feb 2026 00:50:47 +0200 Subject: [PATCH 03/17] ci : use -j param correctly when building with sanitizers (#19411) * ci : use less jobs when building with sanitizers * cont : fix nproc * cont : fix the fix * cont : simplify --- .github/workflows/build.yml | 2 ++ .github/workflows/server.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 51a3dc76e9..6c7ab71143 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -295,6 +295,7 @@ jobs: -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - name: Build (no OpenMP) @@ -307,6 +308,7 @@ jobs: -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=OFF + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) - name: Test diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 3d342c35f7..f44a9e739c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -81,7 +81,7 @@ jobs: -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} - cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - name: Python setup id: setup_python From 9a5f57795c01c6e67a53eeedeae67ed63aaf7f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 8 Feb 2026 01:20:00 +0100 Subject: [PATCH 04/17] ci : remove server job from webui and move slow test (#19424) * remove server job from webui and move slow test * use pip-install option --- .github/workflows/server-webui.yml | 120 ----------------------------- .github/workflows/server.yml | 20 ++--- 2 files changed, 10 insertions(+), 130 deletions(-) diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index 6d1b617371..94899c9376 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -8,10 +8,6 @@ on: description: 'Commit SHA1 
to build' required: false type: string - slow_tests: - description: 'Run slow tests' - required: true - type: boolean push: branches: - master @@ -101,119 +97,3 @@ jobs: if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/server/webui - - server-build: - runs-on: ubuntu-latest - - strategy: - matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken - build_type: [RelWithDebInfo] - include: - - build_type: Release - sanitizer: "" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - - steps: - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get -y install \ - build-essential \ - xxd \ - git \ - cmake \ - curl \ - wget \ - language-pack-en \ - libssl-dev - - - name: Clone - id: checkout - uses: actions/checkout@v6 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Python setup - id: setup_python - uses: actions/setup-python@v6 - with: - python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt - - - name: Setup Node.js for WebUI - uses: actions/setup-node@v6 - with: - node-version: "22" - cache: "npm" - cache-dependency-path: "tools/server/webui/package-lock.json" - - - name: Install WebUI dependencies - run: npm ci - working-directory: tools/server/webui - - - name: Build WebUI - run: npm run build - working-directory: tools/server/webui - - - name: Build (no OpenMP) - id: cmake_build_no_openmp - if: ${{ matrix.sanitizer == 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ - -DGGML_OPENMP=OFF ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build_sanitizers - if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Build (sanitizers) - id: cmake_build - if: ${{ matrix.sanitizer == '' }} - run: | - cmake -B build \ - -DGGML_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ; - cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server - - - name: Tests - id: server_integration_tests - if: ${{ matrix.sanitizer == '' }} - env: - GITHUB_ACTIONS: "true" - run: | - cd tools/server/tests - ./tests.sh - - - name: Tests (sanitizers) - id: server_integration_tests_sanitizers - if: ${{ matrix.sanitizer != '' }} - run: | - cd tools/server/tests - LLAMA_SANITIZE=1 ./tests.sh - - - name: Slow tests - id: server_integration_tests_slow - if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} - run: | - cd tools/server/tests - SLOW_TESTS=1 ./tests.sh diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f44a9e739c..99d05226ba 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -88,11 +88,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r 
tools/server/tests/requirements.txt + pip-install: -r tools/server/tests/requirements.txt - name: Tests id: server_integration_tests @@ -102,6 +98,14 @@ jobs: export ${{ matrix.extra_args }} pytest -v -x -m "not slow" + - name: Slow tests + id: server_integration_tests_slow + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + run: | + cd tools/server/tests + export ${{ matrix.extra_args }} + SLOW_TESTS=1 pytest -v -x + server-windows: runs-on: windows-2022 @@ -124,11 +128,7 @@ jobs: uses: actions/setup-python@v6 with: python-version: '3.11' - - - name: Tests dependencies - id: test_dependencies - run: | - pip install -r tools/server/tests/requirements.txt + pip-install: -r tools/server/tests/requirements.txt - name: Tests id: server_integration_tests From 5999b50eb00972732b69c519121dda1361f56eb3 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Feb 2026 01:22:38 -0600 Subject: [PATCH 05/17] llama-quantize : cleanup `--help` output (#19317) * cleanup `llama-quantize --help` output some much needed TLC * remove future argument oops, spoiler * cleanup of cleanup --- tools/quantize/quantize.cpp | 59 +++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0709e0bda0..c0f49279ee 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -119,27 +119,48 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); - printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); - printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); - printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); - printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); - printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); - printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); - printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); - printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); - printf(" --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n"); - printf(" Advanced option to selectively quantize a long list of tensors. 
Format to be tensor_name=ggml_type, separated by spaces/newline.\n"); - printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); - printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --keep-split: will generate quantized model in the same shards as input\n"); + printf(" --allow-requantize\n"); + printf(" allow requantizing tensors that have already been quantized\n"); + printf(" WARNING: this can severely reduce quality compared to quantizing\n"); + printf(" from 16bit or 32bit!\n"); + printf(" --leave-output-tensor\n"); + printf(" leave output.weight un(re)quantized\n"); + printf(" increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure\n"); + printf(" disable k-quant mixtures and quantize all tensors to the same type\n"); + printf(" --imatrix file_name\n"); + printf(" use data in file_name as importance matrix for quant optimizations\n"); + printf(" --include-weights tensor_name\n"); + printf(" use importance matrix for this/these tensor(s)\n"); + printf(" --exclude-weights tensor_name\n"); + printf(" do not use importance matrix for this/these tensor(s)\n"); + printf(" --output-tensor-type ggml_type\n"); + printf(" use this ggml_type for the output.weight tensor\n"); + printf(" --token-embedding-type ggml_type\n"); + printf(" use this ggml_type for the token embeddings tensor\n"); + printf(" --tensor-type tensor_name=ggml_type\n"); + printf(" quantize this tensor to this ggml_type\n"); + printf(" this is an advanced option to selectively quantize tensors. may be specified multiple times.\n"); + printf(" example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type-file tensor_types.txt\n"); + printf(" list of tensors to quantize to a specific ggml_type\n"); + printf(" this is an advanced option to selectively quantize a long list of tensors.\n"); + printf(" the file should use the same format as above, separated by spaces or newlines.\n"); + printf(" --prune-layers L0,L1,L2...\n"); + printf(" comma-separated list of layer numbers to prune from the model\n"); + printf(" WARNING: this is an advanced option, use with care.\n"); + printf(" --keep-split\n"); + printf(" generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); - printf("Note: --include-weights and --exclude-weights cannot be used together\n"); - printf("\nAllowed quantization types:\n"); + printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); + printf(" WARNING: this is an advanced option, use with care.\n\n"); + printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); + printf("-----------------------------------------------------------------------------\n"); + printf(" allowed quantization types\n"); + printf("-----------------------------------------------------------------------------\n\n"); for (const auto & it : QUANT_OPTIONS) { if (it.name != "COPY") { printf(" %2d or ", it.ftype); From eb449cdfa4319d8fd9066e4633b49f4c867dd11c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Feb 2026 09:40:04 +0200 Subject: [PATCH 06/17] server : improve context checkpoint logic (#19408) --- tools/server/server-context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b71d496eeb..8ec8451339 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2507,7 +2507,8 @@ private: slot.n_prompt_tokens_processed++; // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created. - if (do_checkpoint && slot.task->n_tokens() - slot.prompt.n_tokens() == 64) { + const int n_last = std::min(n_batch, 512); + if (do_checkpoint && slot.task->n_tokens() == slot.prompt.n_tokens() + n_last) { break; } } From 5fa1c190d9fc86c02698b730a2cb933195e19d96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sun, 8 Feb 2026 09:06:45 +0100 Subject: [PATCH 07/17] rpc : update from common.cpp (#19400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- tools/rpc/rpc-server.cpp | 64 ++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/tools/rpc/rpc-server.cpp b/tools/rpc/rpc-server.cpp index 58b93c7468..521f79622d 100644 --- a/tools/rpc/rpc-server.cpp +++ b/tools/rpc/rpc-server.cpp @@ -1,12 +1,7 @@ -#if defined(_MSC_VER) -#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING -#endif - #include "ggml-rpc.h" #ifdef _WIN32 # define NOMINMAX # define DIRECTORY_SEPARATOR '\\' -# include # include # include # include @@ -15,23 +10,43 @@ # include # include #endif -#include #include #include #include -#include #include #include #include -namespace fs = std::filesystem; +#if defined(__linux__) +#include +#include +#endif + +// NOTE: this is copied from common.cpp to avoid linking with libcommon +#ifdef _WIN32 +static std::wstring utf8_to_wstring(const std::string & str) { + if (str.empty()) { + return std::wstring(); + } + + int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0); + + if (size <= 0) { + return std::wstring(); + } + + std::wstring wstr(size, 0); + MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size); + + return wstr; +} +#endif // NOTE: this is copied from common.cpp to avoid linking with libcommon // returns true if successful, false otherwise static bool fs_create_directory_with_parents(const std::string & path) { #ifdef _WIN32 - std::wstring_convert> converter; - std::wstring wpath = converter.from_bytes(path); + std::wstring wpath = utf8_to_wstring(path); // if the path already exists, check whether it's a directory const DWORD attributes = GetFileAttributesW(wpath.c_str()); @@ -44,9 +59,16 @@ static bool fs_create_directory_with_parents(const std::string & path) { // process path from front to back, procedurally creating 
directories while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { const std::wstring subpath = wpath.substr(0, pos_slash); - const wchar_t * test = subpath.c_str(); - const bool success = CreateDirectoryW(test, NULL); + pos_slash += 1; + + // skip the drive letter, in some systems it can return an access denied error + if (subpath.length() == 2 && subpath[1] == ':') { + continue; + } + + const bool success = CreateDirectoryW(subpath.c_str(), NULL); + if (!success) { const DWORD error = GetLastError(); @@ -60,8 +82,6 @@ static bool fs_create_directory_with_parents(const std::string & path) { return false; } } - - pos_slash += 1; } return true; @@ -115,13 +135,27 @@ static std::string fs_get_cache_directory() { #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__) if (std::getenv("XDG_CACHE_HOME")) { cache_directory = std::getenv("XDG_CACHE_HOME"); - } else { + } else if (std::getenv("HOME")) { cache_directory = std::getenv("HOME") + std::string("/.cache/"); + } else { +#if defined(__linux__) + /* no $HOME is defined, fallback to getpwuid */ + struct passwd *pw = getpwuid(getuid()); + if ((!pw) || (!pw->pw_dir)) { + throw std::runtime_error("Failed to find $HOME directory"); + } + + cache_directory = std::string(pw->pw_dir) + std::string("/.cache/"); +#else /* defined(__linux__) */ + throw std::runtime_error("Failed to find $HOME directory"); +#endif /* defined(__linux__) */ } #elif defined(__APPLE__) cache_directory = std::getenv("HOME") + std::string("/Library/Caches/"); #elif defined(_WIN32) cache_directory = std::getenv("LOCALAPPDATA"); +#elif defined(__EMSCRIPTEN__) + GGML_ABORT("not implemented on this platform"); #else # error Unknown architecture #endif From e06088da0fa86aa444409f38dff274904931c507 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Sun, 8 Feb 2026 14:12:51 +0100 Subject: [PATCH 08/17] CUDA: Fix non-contig rope (#19338) * Rename variables + fix rope_neox Seems memory layout is shared with Vulkan so we can port fix from https://github.com/ggml-org/llama.cpp/pull/19299 * Fix rope_multi * Fix rope_vision * Fix rope_norm * Rename ne* to ne0* for consistent variable naming * cont : consistent stride names --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-cuda/rope.cu | 364 +++++++++++++++++++++++-------------- 1 file changed, 232 insertions(+), 132 deletions(-) diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 88ed79111a..45a49a5dc2 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -43,10 +43,15 @@ static __device__ void rope_yarn( template static __global__ void rope_norm(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int32_t * pos, const float freq_scale, @@ -59,23 +64,23 @@ static __global__ void rope_norm(const T * x, const int set_rows_stride) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; - - int idst = row_dst * ne0 + i0; - const int ix = channel_x*s2 + row_x*s1 + i0; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; + int idst = i0 + i1 * s1 + i2 * s2 + i3 * s3; + const 
int ix = i0 + i1 * s01 + i2 * s02 + i3 * s03; // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. if (set_rows_stride != 0) { - idst = row_x * ne0 + i0; - idst += row_indices[channel_x] * set_rows_stride; + idst = i1 * s1 + i0; + idst += row_indices[i2] * set_rows_stride; } const auto & store_coaelsced = [&](float x0, float x1) { @@ -92,7 +97,7 @@ static __global__ void rope_norm(const T * x, return; } - const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -110,10 +115,15 @@ static __global__ void rope_norm(const T * x, template static __global__ void rope_neox(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int32_t * pos, const float freq_scale, @@ -126,23 +136,24 @@ static __global__ void rope_neox(const T * x, const int set_rows_stride) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - int idst = row_dst * ne0 + i0 / 2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. if (set_rows_stride != 0) { - idst = row_x * ne0 + i0 / 2; - idst += row_indices[channel_x] * set_rows_stride; + idst = i1 * s1 + i0 / 2; + idst += row_indices[i2] * set_rows_stride; } if (i0 >= n_dims) { @@ -152,7 +163,7 @@ static __global__ void rope_neox(const T * x, return; } - const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; @@ -168,24 +179,42 @@ static __global__ void rope_neox(const T * x, dst[idst + n_dims / 2] = ggml_cuda_cast(x0 * sin_theta + x1 * cos_theta); } -template -static __global__ void rope_multi( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, - const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) { - const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); +template +static __global__ void rope_multi(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int32_t * pos, + const float freq_scale, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float theta_scale, + const float * freq_factors, + const mrope_sections sections, + const bool is_imrope) { + const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - const int idst = row_dst*ne0 + i0/2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; if (i0 >= n_dims) { dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; @@ -200,27 +229,24 @@ static __global__ void rope_multi( float theta_base = 0.0; if (is_imrope) { - if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h - theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); - } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w - theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); - } else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); + if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h + theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f); + } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w + theta_base = pos[i2 + ne02 * 2] * powf(theta_scale, i0 / 2.0f); + } else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t + theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f); } else { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); + theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f); } } else { if (sector < sections.v[0]) { - theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sections.v[0] && sector < sec_w) { - theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sec_w && sector < sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f); - } - else if (sector >= sec_w + sections.v[2]) { - theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f); + theta_base = pos[i2] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sections.v[0] && sector < sec_w) { + theta_base = pos[i2 + ne02 * 1] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sec_w && sector < sec_w + sections.v[2]) { + theta_base = pos[i2 + ne02 
* 2] * powf(theta_scale, i0 / 2.0f); + } else if (sector >= sec_w + sections.v[2]) { + theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f); } } @@ -238,37 +264,53 @@ static __global__ void rope_multi( dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; } -template -static __global__ void rope_vision( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, - const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, - const float theta_scale, const float * freq_factors, const mrope_sections sections) { +template +static __global__ void rope_vision(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int32_t * pos, + const float freq_scale, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float theta_scale, + const float * freq_factors, + const mrope_sections sections) { const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (i0 >= ne0) { + if (i0 >= ne00) { return; } const int row_dst = blockDim.x*blockIdx.x + threadIdx.x; - const int row_x = row_dst % ne1; - const int channel_x = row_dst / ne1; + const uint32_t i3 = row_dst / (ne01 * ne02); + const uint32_t i2 = (row_dst - i3 * ne01 * ne02) / ne01; + const uint32_t i1 = row_dst - i3 * ne01 * ne02 - i2 * ne01; - const int idst = row_dst*ne0 + i0/2; - const int ix = channel_x*s2 + row_x*s1 + i0/2; + int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; + const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; const int sect_dims = sections.v[0] + sections.v[1]; - const int sec_w = sections.v[1] + sections.v[0]; - const int sector = (i0 / 2) % sect_dims; + const int sec_w = sections.v[1] + sections.v[0]; + const int sector = (i0 / 2) % sect_dims; float theta_base = 0.0; if (sector < sections.v[0]) { const int p = sector; - theta_base = pos[channel_x]*powf(theta_scale, p); - } - else if (sector >= sections.v[0] && sector < sec_w) { + theta_base = pos[i2] * powf(theta_scale, p); + } else if (sector >= sections.v[0] && sector < sec_w) { const int p = sector - sections.v[0]; - theta_base = pos[channel_x + ne2]*powf(theta_scale, p); + theta_base = pos[i2 + ne02] * powf(theta_scale, p); } const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; @@ -288,10 +330,15 @@ static __global__ void rope_vision( template static void rope_norm_cuda(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int nr, const int32_t * pos, @@ -304,31 +351,36 @@ static void rope_norm_cuda(const T * x, const int64_t * row_indices, const int set_rows_stride, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_norm<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { rope_norm<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } } template static void rope_neox_cuda(const T * x, D * dst, - const int ne0, - const int ne1, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, const int s1, const int s2, + const int s3, const int n_dims, const int nr, const int32_t * pos, @@ -341,55 +393,92 @@ static void rope_neox_cuda(const T * x, const int64_t * row_indices, const int set_rows_stride, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_neox<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { rope_neox<<>>( - x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } } -template -static void rope_multi_cuda( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * pos, const float freq_scale, const float freq_base, const 
float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); +template +static void rope_multi_cuda(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int nr, + const int32_t * pos, + const float freq_scale, + const float freq_base, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float * freq_factors, + const mrope_sections sections, + const bool is_imrope, + cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); if (freq_factors == nullptr) { rope_multi<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } else { rope_multi<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope); } } -template -static void rope_vision_cuda( - const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, - const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor, - const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) { - GGML_ASSERT(ne0 % 2 == 0); +template +static void rope_vision_cuda(const T * x, + T * dst, + const int ne00, + const int ne01, + const int ne02, + const int s01, + const int s02, + const int s03, + const int s1, + const int s2, + const int s3, + const int n_dims, + const int nr, + const int32_t * pos, + const float freq_scale, + const float freq_base, + const float ext_factor, + const float attn_factor, + const rope_corr_dims corr_dims, + const float * freq_factors, + const mrope_sections sections, + cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const int n_blocks_x = (ne00 + 2 * CUDA_ROPE_BLOCK_SIZE - 1) / (2 * CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nr, n_blocks_x, 1); // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq) // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE); @@ -398,11 +487,11 @@ static void rope_vision_cuda( if (freq_factors == nullptr) { rope_vision<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections); } else { rope_vision<<>>( - x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, + x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, 
n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections); } } @@ -445,6 +534,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, const size_t s01 = src0->nb[1] / ggml_type_size(src0->type); const size_t s02 = src0->nb[2] / ggml_type_size(src0->type); + const size_t s03 = src0->nb[3] / ggml_type_size(src0->type); + + const size_t s1 = dst->nb[1] / ggml_type_size(dst->type); + const size_t s2 = dst->nb[2] / ggml_type_size(dst->type); + const size_t s3 = dst->nb[3] / ggml_type_size(dst->type); //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -495,57 +589,63 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, // compute if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { - rope_neox_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) { - rope_neox_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { - rope_neox_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, - pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_neox_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else { GGML_ABORT("fatal error"); } } else if (is_mrope && !is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_multi_cuda( - (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream); + rope_multi_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, is_imrope, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_multi_cuda( - (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream); + rope_multi_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, is_imrope, stream); } else { GGML_ABORT("fatal error"); } } else if (is_vision) { if (src0->type == GGML_TYPE_F32) { - rope_vision_cuda( - (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, 
freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); + rope_vision_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, stream); } else if (src0->type == GGML_TYPE_F16) { - rope_vision_cuda( - (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream); + rope_vision_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, s03, s1, + s2, s3, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, + corr_dims, freq_factors, sections, stream); } else { GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F32) { - rope_norm_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) { - rope_norm_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, - nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const float *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else if (src0->type == GGML_TYPE_F16 && dst_type == GGML_TYPE_F16) { - rope_norm_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, - pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, - freq_factors, row_indices, set_rows_stride, stream); + rope_norm_cuda((const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); } else { GGML_ABORT("fatal error"); } From 39bf692af1cba2a1072e4a42425611bf1ec2807d Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Mon, 9 Feb 2026 00:24:08 +0100 Subject: [PATCH 09/17] [Model] Qwen3.5 dense and MoE support (no vision) (#19435) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Unified delta net handling * Remove old methods. * Refactor and optimize * Adapt autoregressive version from @ymcki * Change to decay mask approach * Fix bad permute * Qwen 3.5 support * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * Further fixes * Use inheritance, remove unneeded conts * Not like this! * Remove ggml.h explicit import * Remove transformers, fix the views * ACTUALLY fix views, make super calls explicit in conversion. 
* Fix conversion again * Remove extra ggml.h imports --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 78 +++-- gguf-py/gguf/constants.py | 59 ++++ gguf-py/gguf/tensor_mapping.py | 6 +- src/CMakeLists.txt | 3 + src/llama-arch.cpp | 61 ++++ src/llama-arch.h | 2 + src/llama-context.cpp | 2 +- src/llama-model.cpp | 154 ++++++++ src/models/delta.cpp | 618 +++++++++++++++++++++++++++++++++ src/models/kimi-linear.cpp | 1 - src/models/models.h | 102 +++++- src/models/qwen3-5.cpp | 421 ++++++++++++++++++++++ src/models/qwen3-5moe.cpp | 52 +++ src/models/qwen3next.cpp | 372 +------------------- 14 files changed, 1532 insertions(+), 399 deletions(-) create mode 100644 src/models/delta.cpp create mode 100644 src/models/qwen3-5.cpp create mode 100644 src/models/qwen3-5moe.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 843c00a896..e64756a74a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4102,39 +4102,27 @@ class Qwen2MoeModel(TextModel): # process the experts separately name = name.replace("language_model.", "") # InternVL - # handle aggregated expert tensors - # GGUF stores dimensions reversed from PyTorch, so: - # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} - # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) - # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down + # handle pre-packed expert tensors (e.g. Qwen3.5 MoE, Qwen3Next) + # HF stores these using nn.Linear convention: [n_expert, out_features, in_features] + # This matches the individual expert stacking path below (which stacks + # per-expert [out, in] weights into [n_expert, out, in]), so no permute is needed. if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): mapped = f"{name}.weight" if not name.endswith(".weight") else name - # Input: (n_expert=128, n_ff_exp=768, n_embd=2048) - # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128} - # Need PyTorch: (128, 2048, 768) [reversed of GGML] - # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) - permuted = data_torch.permute(0, 2, 1).contiguous() - yield from super().modify_tensors(permuted, mapped, bid) + # HF: [n_expert, n_embd, n_ff] → GGML: {n_ff, n_embd, n_expert} ✓ + yield from super().modify_tensors(data_torch, mapped, bid) return if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: - raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") - split_dim = data_torch.shape[-1] // 2 - gate = data_torch[..., :split_dim].contiguous() - up = data_torch[..., split_dim:].contiguous() - # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) - # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} - # Need PyTorch: (128, 768, 2048) [reversed of GGML] - # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) - base_name = name.removesuffix(".weight") - base = base_name.rsplit('.', 1)[0] - mapped_gate = f"{base}.gate_proj.weight" - mapped_up = f"{base}.up_proj.weight" - perm_gate = gate.permute(0, 2, 1).contiguous() - perm_up = up.permute(0, 2, 1).contiguous() - yield from super().modify_tensors(perm_gate, mapped_gate, bid) - yield from super().modify_tensors(perm_up, mapped_up, bid) + # HF: [n_expert, 2*n_ff, n_embd] → split on dim=1 + n_ff = data_torch.shape[1] // 2 + gate = data_torch[:, :n_ff, :].contiguous() + up = 
data_torch[:, n_ff:, :].contiguous() + # gate/up: [n_expert, n_ff, n_embd] → GGML: {n_embd, n_ff, n_expert} ✓ + base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") + mapped_gate = f"{base_name}.gate_proj.weight" + mapped_up = f"{base_name}.up_proj.weight" + yield from super().modify_tensors(gate, mapped_gate, bid) + yield from super().modify_tensors(up, mapped_up, bid) return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): @@ -4344,6 +4332,40 @@ class Qwen3NextModel(Qwen2MoeModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Qwen3_5ForCausalLM", "Qwen3_5TextForCausalLM") +class Qwen3_5Model(Qwen3NextModel): + model_arch = gguf.MODEL_ARCH.QWEN3_5 + + # Stores whichever of in_proj_a/in_proj_b is seen first, keyed by layer + _pending_ba: dict[int | None, tuple[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Handle split in_proj_b + in_proj_a → concatenated SSM_BETA_ALPHA + # safetensors sorts alphabetically so in_proj_a arrives before in_proj_b + if "in_proj_a.weight" in name or "in_proj_b.weight" in name: + which = "a" if "in_proj_a" in name else "b" + if bid not in self._pending_ba: + self._pending_ba[bid] = (which, data_torch) + return + prev_which, prev_tensor = self._pending_ba.pop(bid) + assert prev_which != which, f"duplicate in_proj_{which} for layer {bid}" + b_tensor = prev_tensor if prev_which == "b" else data_torch + a_tensor = prev_tensor if prev_which == "a" else data_torch + ba_combined = torch.cat([b_tensor, a_tensor], dim=0) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.SSM_BETA_ALPHA, bid, ".weight"), ba_combined) + return + else: + # Qwen3Next uses .qkvz tensor, so we use the super to get the other functionalities + # (norm correction, A_log to A etc.) 
for free + # Qwen2Moe already does the gate_up conversion properly, just use that + yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3_5MoeForCausalLM", "Qwen3_5MoeTextForCausalLM") +class Qwen3_5MoeModel(Qwen3_5Model): + model_arch = gguf.MODEL_ARCH.QWEN3_5_MOE + + @ModelBase.register("RND1") class RND1Model(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.RND1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3af4fffe95..8a3fab1e1c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -382,6 +382,8 @@ class MODEL_ARCH(IntEnum): QWEN3 = auto() QWEN3MOE = auto() QWEN3NEXT = auto() + QWEN3_5 = auto() + QWEN3_5_MOE = auto() QWEN3VL = auto() QWEN3VLMOE = auto() PHI2 = auto() @@ -812,6 +814,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN3: "qwen3", MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.QWEN3NEXT: "qwen3next", + MODEL_ARCH.QWEN3_5: "qwen3_5", + MODEL_ARCH.QWEN3_5_MOE: "qwen3_5moe", MODEL_ARCH.QWEN3VL: "qwen3vl", MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", MODEL_ARCH.PHI2: "phi2", @@ -1784,6 +1788,61 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_BETA_ALPHA, MODEL_TENSOR.SSM_OUT ], + MODEL_ARCH.QWEN3_5: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_GATE, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_BETA_ALPHA, + MODEL_TENSOR.SSM_OUT, + ], + MODEL_ARCH.QWEN3_5_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_GATE, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_INP_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_BETA_ALPHA, + MODEL_TENSOR.SSM_OUT, + ], MODEL_ARCH.QWEN3VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 167ade7803..43f32c7b52 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -228,6 +228,7 @@ class TensorNameMap: "transformer_encoder.{bid}.qkv", # neobert "layers.{bid}.attn.Wqkv", # modern-bert "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm + "model.layers.{bid}.linear_attn.in_proj_qkv", # qwen3.5 ), # Attention query @@ -358,8 +359,9 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_GATE: ( - "model.layers.{bid}.self_attn.gate_proj", # afmoe - "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate + "model.layers.{bid}.self_attn.gate_proj", # afmoe + "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate + "model.layers.{bid}.linear_attn.in_proj_z", # qwen3.5 ), # Feed-forward norm 
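
A minimal sketch of the layout reasoning behind the gate_up_proj split in the convert_hf_to_gguf.py hunk above (toy shapes, illustrative only, not part of the patch):

    import torch

    n_expert, n_ff, n_embd = 4, 8, 16
    gate_up = torch.randn(n_expert, 2 * n_ff, n_embd)   # HF packed layout [n_expert, 2*n_ff, n_embd]
    gate = gate_up[:, :n_ff, :].contiguous()             # [n_expert, n_ff, n_embd]
    up   = gate_up[:, n_ff:, :].contiguous()             # [n_expert, n_ff, n_embd]
    # gguf writes PyTorch dims in reverse, so GGML reads ne = {n_embd, n_ff, n_expert},
    # which already matches FFN_GATE_EXPS / FFN_UP_EXPS, so no permute is needed.
    assert gate.shape == up.shape == (n_expert, n_ff, n_embd)
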
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2115fc4255..0c164617a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,7 @@ add_library(llama models/deci.cpp models/deepseek.cpp models/deepseek2.cpp + models/delta.cpp models/dots1.cpp models/dream.cpp models/ernie4-5-moe.cpp @@ -122,6 +123,8 @@ add_library(llama models/qwen3vl-moe.cpp models/qwen3moe.cpp models/qwen3next.cpp + models/qwen3-5.cpp + models/qwen3-5moe.cpp models/refact.cpp models/rnd1.cpp models/rwkv6-base.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd78f1e556..fce46772d7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -35,6 +35,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, + { LLM_ARCH_QWEN3_5, "qwen3_5" }, + { LLM_ARCH_QWEN3_5_MOE, "qwen3_5moe" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_PHI2, "phi2" }, @@ -985,6 +987,63 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, }; + case LLM_ARCH_QWEN3_5: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_SSM_A_NOSCAN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_BETA_ALPHA, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; + case LLM_ARCH_QWEN3_5_MOE: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_INP_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_SSM_A_NOSCAN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_BETA_ALPHA, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_NORM, + LLM_TENSOR_SSM_OUT, + }; case LLM_ARCH_QWEN3VL: case LLM_ARCH_CHAMELEON: case LLM_ARCH_HUNYUAN_DENSE: @@ -2674,6 +2733,8 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_KIMI_LINEAR: return true; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index e8263369b8..a392ecce2b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -39,6 +39,8 @@ enum llm_arch { LLM_ARCH_QWEN3, LLM_ARCH_QWEN3MOE, LLM_ARCH_QWEN3NEXT, + LLM_ARCH_QWEN3_5, + LLM_ARCH_QWEN3_5_MOE, LLM_ARCH_QWEN3VL, LLM_ARCH_QWEN3VLMOE, LLM_ARCH_PHI2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a6df893a31..80b9a7d46a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2013,7 +2013,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN3_5 || model.arch == 
LLM_ARCH_QWEN3_5_MOE || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 674d06c891..8fc61aee37 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2412,6 +2412,25 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // Load linear attention (gated delta net) parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + // Mark recurrent layers (linear attention layers) + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); + } + } break; case LLM_ARCH_MISTRAL3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -7094,6 +7113,129 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + // Shared experts + layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); + } + } break; + case LLM_ARCH_QWEN3_5: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + + const int64_t ba_dim = n_v_heads * 2; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + if (!hparams.is_recurrent(i)) { + // Full attention layers + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { 
n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + } else { + // Linear attention (gated delta net) specific tensors + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + } + + // Dense FFN for all layers + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + } + } break; + case LLM_ARCH_QWEN3_5_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; + + // Calculate dimensions from hyperparameters + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t head_v_dim = hparams.ssm_d_state; + const int64_t n_k_heads = hparams.ssm_n_group; + const int64_t n_v_heads = hparams.ssm_dt_rank; + const int64_t key_dim = head_k_dim * n_k_heads; + const int64_t value_dim = head_v_dim * n_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + + const int64_t ba_dim = n_v_heads * 2; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + if (!hparams.is_recurrent(i)) { + // Full attention layers + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + } else { + // Linear attention (gated delta net) specific tensors + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + } + + // MoE FFN + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); @@ -7545,6 +7687,8 @@ void llama_model::print_info() const { arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_QWEN3NEXT || + arch == LLM_ARCH_QWEN3_5 || + arch == LLM_ARCH_QWEN3_5_MOE || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); @@ -8343,6 +8487,14 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_QWEN3_5: + { + llm = std::make_unique(*this, params); + } break; + case LLM_ARCH_QWEN3_5_MOE: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_MISTRAL3: { llm = std::make_unique(*this, params); @@ -8603,6 +8755,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_QWEN3_5: + case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: return LLAMA_ROPE_TYPE_NEOX; diff --git a/src/models/delta.cpp b/src/models/delta.cpp new file mode 100644 index 0000000000..d1d9837d09 --- /dev/null +++ b/src/models/delta.cpp @@ -0,0 +1,618 @@ +#include "models.h" +#include "ggml.h" +#include +#include +#include + +llm_graph_context_delta::llm_graph_context_delta(const llm_graph_params & params) : llm_graph_context_mamba(params) {} + +/** + * Unified Delta Net implementation supporting both GDA and KDA modes. + * + * GDA (Gated Delta Attention): g has shape [H, T, B] in GGML (PyTorch: [B, T, H]) + * - Per-head gating, broadcasts over K dimension + * + * KDA (Key-wise Delta Attention): g has shape [K, H, T, B] in GGML (PyTorch: [B, T, H, K]) + * - Per-key gating + * + * The mode is auto-detected based on g's dimensionality. + * + * Tensor dimension convention: + * GGML: ne[0] is innermost (fastest varying), ne[3] is outermost + * PyTorch: dim 0 is outermost, dim -1 is innermost + * So GGML [A, B, C, D] corresponds to PyTorch [D, C, B, A] + */ + +// Helper to get a slice along dimension 2 (n_chunks dimension) +static ggml_tensor * get_slice_2d(ggml_context * ctx, ggml_tensor * t, int64_t chunk) { + return ggml_view_4d(ctx, t, + t->ne[0], t->ne[1], 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], + chunk * t->nb[2]); +} + +/** + * Unified chunked Delta Net implementation. 
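+ *
+ * Informal sketch of the per-token recurrence being evaluated here (see the
+ * autoregressive variant below for the direct single-token form):
+ *
+ *   S_t = exp(g_t) * S_{t-1} + beta_t * (v_t - S_{t-1} k_t) k_t^T
+ *   o_t = S_t q_t
+ *
+ * where exp(g_t) is a per-head scalar for GDA and a per-key vector for KDA.
+ * The chunked path computes the same recurrence blockwise via the WY
+ * representation and a triangular solve instead of looping token by token.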
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] + * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] + * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] + * @param g Gate tensor: + * GDA: [H_v, n_tokens, n_seqs] + * KDA: [S_k, H_v, n_tokens, n_seqs] + * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + * @param causal_mask Lower triangular mask [chunk_size, chunk_size] + * @param identity Identity matrix [chunk_size, chunk_size] + * @param diag_mask Diagonal mask [chunk_size, chunk_size] + * @param il Layer index (for debugging callbacks) + * @param chunk_size Chunk size for chunked processing + * @param eps_norm Epsilon for L2 normalization + * + * @return Pair of (output_tokens, new_state) + */ +std::pair llm_graph_context_delta::build_delta_net_unified_chunking( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state_reshaped, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + // Detect KDA vs GDA based on g's shape + // GDA: g has shape [H_v, n_tokens, n_seqs] + // KDA: g has shape [S_k, H_v, n_tokens, n_seqs] (4D with ne[0]=S_k) + const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); + + // Validate tensor shapes + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(state_reshaped->ne[0] == S_v && state_reshaped->ne[1] == S_v && state_reshaped->ne[2] == H_v && state_reshaped->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(H_k == H_v); + + if (is_kda) { + // KDA: g shape [S_k, H_v, n_tokens, n_seqs] + GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs); + } else { + // GDA: g shape [H_v, n_tokens, n_seqs] + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + } + + // L2 normalize q and k + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf((float)S_v); + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + // Permute tensors to working format [S, n_tokens, H, n_seqs] + // Input: [S, H, n_tokens, n_seqs] -> permute(0, 2, 1, 3) -> [S, n_tokens, H, n_seqs] + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + if (is_kda) { + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + } else { + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); + } + beta = ggml_cont(ctx0, 
ggml_permute(ctx0, beta, 2, 0, 1, 3)); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(g, "g_perm", il); + cb(state_reshaped, "state_in", il); + + // Padding for chunk processing + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + g = ggml_pad(ctx0, g, pad, 0, 0, 0); + + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(g, "g_pad", il); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); + + // Reshape to chunks + q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); + k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); + k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); + v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); + v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); + + // Reshape g for chunks + ggml_tensor * g_cumsum; + ggml_tensor * g_cumsum_t; + if (is_kda) { + // KDA: g [S_k, n_tokens+pad, H_k, n_seqs] -> [S_k, chunk_size, n_chunks, H_k * n_seqs] + g = ggml_reshape_4d(ctx0, g, S_k, chunk_size, n_chunks, H_k * n_seqs); + // Cumsum along chunk_size dimension (ne[1]) + // GGML cumsum operates on ne[0], so we need to transpose, cumsum, transpose back + g = ggml_cont(ctx0, ggml_transpose(ctx0, g)); // [chunk_size, S_k, n_chunks, H_k * n_seqs] + g_cumsum_t = ggml_cumsum(ctx0, g); + g_cumsum = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum_t)); // [S_k, chunk_size, n_chunks, H_k * n_seqs] + } else { + // GDA: g [n_tokens+pad, 1, H_k, n_seqs] -> [chunk_size, 1, n_chunks, H_k * n_seqs] + g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); + g_cumsum = ggml_cumsum(ctx0, g); + g_cumsum_t = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_k * n_seqs); + } + + cb(g_cumsum, "g_cumsum", il); + + // Build attention matrix A for the WY representation solve + // For GDA: A[j,i] = sum_k(k[j,k] * exp(g[j] - g[i]) * k[i,k]) = (k @ k^T) * exp(g[j] - g[i]) + // For KDA: A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + // KDA uses decay mask with S_k packed into batch to compute exp(g[j,k] - g[i,k]) per-key + + ggml_tensor * k_decay; + ggml_tensor * decay_mask = nullptr; + ggml_tensor * g_exp_pos = nullptr; + + if (is_kda) { + // KDA: Use decay mask with S_k in leading dimension for efficient mul_mat reduction + // A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + // By putting S_k in dim 0, mul_mat implicitly sums over it + + const int64_t CHB = n_chunks * H_k * n_seqs; + + // g_cumsum_t is [chunk_size, S_k, n_chunks, H_k * n_seqs] + // Reshape to [chunk_size, S_k, CHB] then build decay mask + ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); + ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, gcs, 1, chunk_size, S_k, CHB); + + // Build decay mask: [chunk_size, chunk_size, S_k, CHB] + ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); + decay_mask = ggml_sub(ctx0, gcs_j_bc, 
gcs_i); + + cb(decay_mask, "decay_mask_kda", il); + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] for mul_mat reduction over S_k + decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + // Reshape k and k_beta for broadcasting with decay_mask + // k_i: indexed at position i (dim 2 of decay_mask) + // k_beta_j: indexed at position j (dim 1 of decay_mask) + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + ggml_tensor * k_beta_j = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, 1, CHB); + + // decay_k_beta_j[s,j,i,b] = decay[s,j,i,b] * k_beta[s,j,b] + ggml_tensor * decay_k_beta_j = ggml_mul(ctx0, decay_mask, k_beta_j); + + // mul_mat sums over S_k: result[j,1,i,CHB] = sum_s decay_k_beta_j[s,j,i,b] * k_i[s,1,i,b] + k_decay = ggml_mul_mat(ctx0, decay_k_beta_j, k_i); + k_decay = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, k_decay, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); + + // g_exp_pos is still needed for later (kbeta_gexp, etc.) + g_exp_pos = ggml_exp(ctx0, g_cumsum); + } else { + // GDA: Use decay mask approach (g broadcasts over K dimension) + // g_cumsum [chunk_size, 1, n_chunks, H_v * n_seqs] + ggml_tensor * gcs_i = g_cumsum; + ggml_tensor * gcs_j = g_cumsum_t; + g_exp_pos = ggml_exp(ctx0, g_cumsum_t); + ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); + decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); + + cb(decay_mask, "decay_mask", il); + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); + k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); + } + + ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); + + cb(attn, "attn_pre_solve", il); + + // Solve triangular system: (I + L) @ X = I, where L is strictly lower triangular + ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); + attn = ggml_mul(ctx0, lin_solve, causal_mask); + attn = ggml_add(ctx0, attn, identity); + + cb(attn, "attn_solved", il); + + // Compute u = A @ v and w = A @ (g.exp() * k) + v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); + + ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, g_exp_pos); + cb(kbeta_gexp, "kbeta_gexp", il); + + ggml_tensor * k_cumdecay = ggml_cont(ctx0, ggml_transpose(ctx0, + ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); + cb(k_cumdecay, "k_cumdecay", il); + + // Attention scores q @ k^T with decay + // For GDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j] - g[i]) * k[i,k]) + // For KDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) + ggml_tensor * attn_kq; + if (is_kda) { + // KDA: Same approach as k_decay - use decay_mask with S_k in leading dim + const int64_t CHB = n_chunks * H_k * n_seqs; + + // Rebuild decay mask (same structure as k_decay) + ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); + ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, 
gcs, 1, chunk_size, S_k, CHB); + ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); + ggml_tensor * decay_mask_kq = ggml_sub(ctx0, gcs_j_bc, gcs_i); + + decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); + decay_mask_kq = ggml_exp(ctx0, decay_mask_kq); + decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); + + // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] + decay_mask_kq = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask_kq, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + // q_j: indexed at position j, k_i: indexed at position i + ggml_tensor * q_j = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + + // decay_q_j[s,j,i,b] = decay[s,j,i,b] * q[s,j,b] + ggml_tensor * decay_q_j = ggml_mul(ctx0, decay_mask_kq, q_j); + + // mul_mat sums over S_k + attn_kq = ggml_mul_mat(ctx0, decay_q_j, k_i); + attn_kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, attn_kq, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); + } else { + // GDA: Use decay mask + attn_kq = ggml_mul_mat(ctx0, k, q); + attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); + attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); + } + cb(attn_kq, "attn_kq", il); + + // Compute g_last and g_diff for state updates + ggml_tensor * g_last; + ggml_tensor * g_diff_exp; + ggml_tensor * g_last_exp; + + if (is_kda) { + // KDA: g_cumsum [S_k, chunk_size, n_chunks, H_k * n_seqs] + // Get last element along chunk_size dimension (ne[1]) + g_last = ggml_view_4d(ctx0, g_cumsum, + g_cumsum->ne[0], 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[1] - 1) * g_cumsum->nb[1]); + g_last = ggml_cont(ctx0, g_last); + g_last_exp = ggml_exp(ctx0, g_last); + + // g_diff = g_last - g_cumsum + ggml_tensor * g_last_broadcast = ggml_repeat_4d(ctx0, g_last, + g_cumsum->ne[0], g_cumsum->ne[1], g_cumsum->ne[2], g_cumsum->ne[3]); + ggml_tensor * g_diff = ggml_sub(ctx0, g_last_broadcast, g_cumsum); + g_diff_exp = ggml_exp(ctx0, g_diff); + } else { + // GDA: g_cumsum [chunk_size, 1, n_chunks, H_k * n_seqs] + g_last = ggml_view_4d(ctx0, g_cumsum, + 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); + g_last = ggml_cont(ctx0, g_last); + g_last_exp = ggml_exp(ctx0, g_last); + + ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); + g_diff_exp = ggml_exp(ctx0, g_diff); + } + + cb(g_last, "g_last", il); + cb(g_last_exp, "g_last_exp", il); + + ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp); + cb(key_gdiff, "key_gdiff", il); + + // Process chunks + ggml_tensor * new_state = state_reshaped; + ggml_tensor * core_attn_out = nullptr; + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); + ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); + ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); + ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); + ggml_tensor * gexp_chunk = get_slice_2d(ctx0, g_exp_pos, chunk); + + cb(attn_chunk, "attn_chunk", il); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), + S_v, S_v, 1, H_v * n_seqs); + + // v_prime = k_cumdecay @ state + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + cb(v_prime, "v_prime_chunk", il); + + // v_new = v - v_prime + ggml_tensor * v_new = ggml_sub(ctx0, 
ggml_repeat(ctx0, v_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + cb(v_new, "v_new_chunk", il); + + // attn_inter = (q * g.exp()) @ state + ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); + cb(attn_inter, "attn_inter_chunk", il); + + // output = attn_inter + attn @ v_new + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); + cb(v_attn, "v_attn_chunk", il); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + cb(core_attn_out_chunk, "core_attn_out_chunk", il); + + core_attn_out = core_attn_out == nullptr + ? core_attn_out_chunk + : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); + + // State update: state = state * g_last_exp + key_gdiff^T @ v_new + ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk)); + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff))); + + ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); + + if (is_kda) { + // KDA: g_last_exp [S_k, 1, n_chunks, H_k * n_seqs] + // State: [S_v, S_v, H_v, n_seqs] + // Need to reshape g_last_exp to broadcast correctly over V dimension only + gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, + 1, gexp_last_chunk->ne[0], H_v, n_seqs); // [1, S_k, H_v, n_seqs] + // Transpose to [S_k, 1, H_v, n_seqs] then broadcast + gexp_last_chunk = ggml_cont(ctx0, ggml_permute(ctx0, gexp_last_chunk, 1, 0, 2, 3)); + } else { + // GDA: g_last_exp [1, 1, n_chunks, H_k * n_seqs] + // Broadcasts over both K and V dimensions + gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, + gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs); + } + + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, gexp_last_chunk), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + // Truncate padding and permute back + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + + cb(output_tokens, "output_tokens", il); + + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); + + return {output_tokens, new_state}; +} + + +/** + * Unified autoregressive Delta Net implementation (single token processing). + * + * This implementation uses matrix multiplication instead of elementwise operations + summation, + * which is more efficient and mathematically equivalent. See inline comments for equivalences. 
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, 1, n_seqs] + * @param k Key tensor [S_k, H_k, 1, n_seqs] + * @param v Value tensor [S_v, H_v, 1, n_seqs] + * @param g Gate tensor: + * GDA: [H_v, 1, n_seqs] + * KDA: [S_k, H_v, 1, n_seqs] + * @param beta Beta tensor [H_v, 1, 1, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + * @param il Layer index (for debugging callbacks) + * @param eps_norm Epsilon for L2 normalization + * + * @return Pair of (output_tokens, new_state) + */ +std::pair llm_graph_context_delta::build_delta_net_unified_autoregressive( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(n_tokens == 1); // Autoregressive mode is for single token + + // Detect KDA vs GDA based on g's shape + // GDA: g has shape [H_v, 1, n_seqs] or [H_v, n_tokens, n_seqs] + // KDA: g has shape [S_k, H_v, 1, n_seqs] or [S_k, H_v, n_tokens, n_seqs] + const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); + + // Validate shapes + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(H_k == H_v); + + if (is_kda) { + GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v); + } else { + GGML_ASSERT(g->ne[0] == H_v); + } + + // L2 normalize q and k + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf((float)S_v); + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + // Reshape g and beta for broadcasting + ggml_tensor * g_t; + ggml_tensor * beta_t; + + if (is_kda) { + // KDA: g [S_k, H_v, 1, n_seqs] -> [S_k, 1, H_k, n_seqs] + // For state multiplication, need [1, S_k, H_v, n_seqs] to broadcast over V only + g_t = ggml_reshape_4d(ctx0, g, S_k, 1, H_k, n_seqs); + } else { + // GDA: g [H_v, 1, n_seqs] -> [1, 1, H_k, n_seqs] + // For state multiplication, broadcasts over both K and V + g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); + } + + beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); + + // Apply exponential to g_t + g_t = ggml_exp(ctx0, g_t); + + // State decay: state = state * exp(g) + if (is_kda) { + // KDA: g_t [S_k, 1, H_k, n_seqs], state [S_v, S_v, H_v, n_seqs] + // Need to broadcast g_t over V dimension (ne[0] of state) + // Permute g_t to [1, S_k, H_k, n_seqs] for correct broadcasting + ggml_tensor * g_broadcast = ggml_cont(ctx0, ggml_permute(ctx0, g_t, 1, 0, 2, 3)); + state = ggml_mul(ctx0, state, g_broadcast); + } else { + // GDA: g_t [1, 1, H_k, n_seqs] broadcasts over both dimensions + state = ggml_mul(ctx0, state, g_t); + } + + // Equivalence to previous 
version: + // Previous: kv_mem = sum_k(state * k) using elementwise mult + sum_rows + // Current: k_state = state_t @ k_t using matrix multiplication + // These are equivalent because: sum_k(A * B) = A @ B when dimensions align + ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * k_t = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); + ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k_t); + + // v_diff = v - k_state (equivalent to v - kv_mem in previous version) + ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + ggml_tensor * v_diff = ggml_sub(ctx0, v_t, k_state); + ggml_tensor * k_beta = ggml_mul(ctx0, k_t, beta_t); + + // Equivalence to previous version: + // Previous: state += k.unsqueeze(-1) * delta where delta = (v - kv_mem) * beta + // Current: state += v_diff^T @ k_beta^T using matrix multiplication + // These are equivalent because: outer_product(k, v_diff * beta) = v_diff^T @ k^T + state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); + + // Equivalence to previous version: + // Previous: core_attn_out = sum_k(state * q) using elementwise mult + sum_rows + // Current: core_attn_out = state_t @ q using matrix multiplication + // These are equivalent because: sum_k(A * B) = A @ B when dimensions align + q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); + state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); + cb(state, "new_state", il); + + return {core_attn_out, state}; +} + + +/** + * Main entry point that dispatches to chunked or autoregressive based on n_tokens. 
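+ *
+ * Callers build causal_mask / identity / diag_mask once per graph (see
+ * llm_build_qwen3_5::build_graph, which fills CHUNK_SIZE x CHUNK_SIZE tensors
+ * with ggml_tri / ggml_diag) and reuse them across layers; they are only
+ * consumed by the chunked path.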
+ * + * Input tensor format matches qwen3next conventions: + * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] + * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] + * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] + * @param g Gate tensor (GDA: [H_v, n_tokens, n_seqs], KDA: [S_k, H_v, n_tokens, n_seqs]) + * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] + * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] + */ +std::pair llm_graph_context_delta::build_delta_net_unified( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm) { + + // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) + const int64_t n_tokens = q->ne[2]; + + if (n_tokens == 1) { + return build_delta_net_unified_autoregressive( + ctx0, q, k, v, g, beta, state, il, eps_norm); + } + return build_delta_net_unified_chunking( + ctx0, q, k, v, g, beta, state, causal_mask, identity, diag_mask, + il, chunk_size, eps_norm); +} diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 0f037d1a39..d9ee698075 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,5 +1,4 @@ #include "models.h" -#include "ggml.h" #define CHUNK_SIZE 64 diff --git a/src/models/models.h b/src/models/models.h index cfcbb9aaa5..2a750c168e 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -17,6 +17,53 @@ struct llm_graph_context_mamba : public llm_graph_context { }; +struct llm_graph_context_delta : public llm_graph_context_mamba { + llm_graph_context_delta(const llm_graph_params & params); + + virtual ~llm_graph_context_delta() = default; + + std::pair build_delta_net_unified_chunking( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm); + + std::pair build_delta_net_unified_autoregressive( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il, + float eps_norm); + + std::pair build_delta_net_unified( + ggml_context * ctx0, + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il, + int64_t chunk_size, + float eps_norm); +}; + // Base class for RWKV-related models struct llm_build_rwkv6_base : public llm_graph_context { const llama_model & model; @@ -476,7 +523,7 @@ struct llm_build_qwen3vl : public llm_graph_context { struct llm_build_qwen3vlmoe : public llm_graph_context { llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params); }; -struct llm_build_qwen3next : public llm_graph_context_mamba { +struct llm_build_qwen3next : public llm_graph_context_delta { llm_build_qwen3next(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( @@ -534,6 +581,59 @@ private: const llama_model & model; }; +struct llm_build_qwen3_5 : public llm_graph_context_delta { + llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params); + +protected: + // Tag type for subclass constructors that need to call build_graph() 
themselves + // (to ensure virtual dispatch works correctly) + struct defer_graph_build_t {}; + + llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t); + + void build_graph(); + + virtual ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + int il); + + const llama_model & model; + +private: + ggml_tensor * build_layer_attn( + llm_graph_input_attn_kv * inp_attn, + ggml_tensor * cur, + ggml_tensor * inp_pos, + int il); + + ggml_tensor * build_layer_attn_linear( + llm_graph_input_rs * inp, + ggml_tensor * cur, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il); + + ggml_tensor * build_norm_gated( + ggml_tensor * input, + ggml_tensor * weights, + ggml_tensor * gate, + int layer); + + std::pair build_qkvz( + ggml_tensor * input, + int il); +}; + +struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 { + llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params); + +protected: + ggml_tensor * build_layer_ffn( + ggml_tensor * cur, + int il) override; +}; + struct llm_build_qwen : public llm_graph_context { llm_build_qwen(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/qwen3-5.cpp b/src/models/qwen3-5.cpp new file mode 100644 index 0000000000..0947299d73 --- /dev/null +++ b/src/models/qwen3-5.cpp @@ -0,0 +1,421 @@ +#include "models.h" + +#define CHUNK_SIZE 64 + +llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params) : + llm_graph_context_delta(params), model(model) { + build_graph(); +} + +// virtual call in constructor fix +llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t /*tag*/) : + llm_graph_context_delta(params), model(model) { +} + +void llm_build_qwen3_5::build_graph() { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "model.embed_tokens", -1); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * causal_mask = + ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), + GGML_TRI_TYPE_LOWER); + + ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); + + ggml_build_forward_expand(gf, causal_mask); + ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, diag_mask); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + if (hparams.is_recurrent(il)) { + cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il); + } else { + cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "attn_residual", il); + + ggml_tensor * ffn_residual = cur; + + ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(attn_post_norm, "attn_post_norm", il); + + cur = build_layer_ffn(attn_post_norm, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_residual); + cb(cur, "post_ffn", il); + + 
inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +ggml_tensor * llm_build_qwen3_5::build_norm_gated( + ggml_tensor * input, + ggml_tensor * weights, + ggml_tensor * gate, + int layer) { + ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); + ggml_tensor * gated_silu = ggml_silu(ctx0, gate); + + return ggml_mul(ctx0, normalized, gated_silu); +} + +ggml_tensor * llm_build_qwen3_5::build_layer_attn( + llm_graph_input_attn_kv * inp, + ggml_tensor * cur, + ggml_tensor * inp_pos, + int il) { + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] + cb(Qcur_full, "Qcur_full", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); + cb(Qcur, "Qcur_reshaped", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur_full) * n_embd_head * 2, + ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, + ggml_element_size(Qcur_full) * n_embd_head); + gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); + cb(gate, "gate_reshaped", il); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + cur = build_attn(inp, + nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_pregate", il); + + ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); + cb(gate_sigmoid, "gate_sigmoid", il); + + cur = ggml_mul(ctx0, cur, gate_sigmoid); + cb(cur, "attn_gated", il); + + cur = build_lora_mm(model.layers[il].wo, cur); + cb(cur, "attn_output", il); + + return cur; +} + +std::pair llm_build_qwen3_5::build_qkvz( + ggml_tensor * input, + int il) { + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t num_k_heads = hparams.ssm_n_group; + const int64_t num_v_heads = hparams.ssm_dt_rank; + const int64_t head_v_dim = d_inner / num_v_heads; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + if (model.layers[il].wqkv) { + ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); + qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); + cb(qkv_mixed, "linear_attn_qkv_mixed", il); + + ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); + cb(z, "z", il); + + return { qkv_mixed, z }; + + } + // legacy path for combined in_proj_qkvz + ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input); + cb(mixed_qkvz, "linear_attn_mixed_qkvz", il); + + int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); + ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); + + int64_t split_sizes_qkvz[4] = { + head_k_dim, + head_k_dim, + head_v_dim * num_v_heads / num_k_heads, + head_v_dim * num_v_heads / num_k_heads + }; + + ggml_tensor * query = + ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0); + cb(query, "q", il); + + ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped)); + cb(key, "k", il); + + ggml_tensor * value = + ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped)); + cb(value, "v", il); + + ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs, + mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], + (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped)); + z = ggml_cont(ctx0, z); + cb(z, "z", il); + + ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); + cb(query_flat, "query_flat", il); + + ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); + cb(key_flat, "key_flat", il); + + ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); + cb(value_flat, "value_flat", il); + + ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0); + qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 
0); + cb(qkv_mixed, "qkv_mixed", il); + + return { qkv_mixed, z }; +} + +ggml_tensor * llm_build_qwen3_5::build_layer_attn_linear( + llm_graph_input_rs * inp, + ggml_tensor * cur, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il) { + const auto * mctx_cur = inp->mctx; + + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t n_seqs = ubatch.n_seqs; + const int64_t head_k_dim = hparams.ssm_d_state; + const int64_t num_k_heads = hparams.ssm_n_group; + const int64_t num_v_heads = hparams.ssm_dt_rank; + const int64_t head_v_dim = d_inner / num_v_heads; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + const auto kv_head = mctx_cur->get_head(); + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + auto qkvz = build_qkvz(cur, il); + ggml_tensor * qkv_mixed = qkvz.first; + ggml_tensor * z = qkvz.second; + + ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); + cb(mixed_ba, "linear_attn_mixed_ba", il); + + int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; + ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); + + int64_t split_sizes_ba[2] = { + num_v_heads / num_k_heads, + num_v_heads / num_k_heads + }; + + ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs, + mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0); + cb(b, "b", il); + + ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs, + mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], + split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); + cb(a, "a", il); + + ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); + + ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); + + ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); + ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); + cb(alpha_softplus, "a_softplus", il); + ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); + cb(gate, "gate", il); + + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); + + ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); + cb(conv_states, "conv_states", il); + + ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; + const int64_t conv_kernel_size = conv_kernel->ne[0]; + const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; + conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); + cb(conv_states, "conv_states_reshaped", il); + + qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3); + cb(qkv_mixed, "qkv_mixed_permuted", il); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); + cb(conv_input, "conv_input", il); + + ggml_tensor * last_conv_states = + ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], + conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); + cb(last_conv_states, "last_conv_states", il); + + ggml_tensor * state_update_target = + ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs, + kv_head * (conv_kernel_size - 1) * conv_channels * 
ggml_element_size(conv_states_all)); + cb(state_update_target, "state_update_target", il); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + cb(conv_states_all, "conv_states_updated", il); + + ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); + cb(conv_output_proper, "conv_output_raw", il); + + ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); + cb(conv_output_silu, "conv_output_silu", il); + + ggml_tensor * conv_qkv_mix = conv_output_silu; + + int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; + int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); + + ggml_tensor * q_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); + cb(q_conv, "q_conv", il); + ggml_tensor * k_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, + head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); + cb(k_conv, "k_conv", il); + ggml_tensor * v_conv = + ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, + 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); + cb(v_conv, "v_conv", il); + + q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); + k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); + v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); + + ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); + cb(state, "state_predelta", il); + + if (num_k_heads != num_v_heads) { + GGML_ASSERT(num_v_heads % num_k_heads == 0); + int64_t repeat_factor = num_v_heads / num_k_heads; + + ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); + ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); + + ggml_tensor * q_repeated = + ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); + ggml_tensor * k_repeated = + ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); + + q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); + k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); + } + + cb(q_conv, "q_conv_predelta", il); + cb(k_conv, "k_conv_predelta", il); + cb(v_conv, "v_conv_predelta", il); + + std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, + gate, beta, state, causal_mask, identity, diag_mask, + il, CHUNK_SIZE, hparams.f_norm_rms_eps); + + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, + ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); + + ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); + + ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); + + 
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); + cb(final_output, "final_output", il); + + cur = build_lora_mm(model.layers[il].ssm_out, final_output); + cb(cur, "linear_attn_out", il); + + cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); + return cur; +} + +ggml_tensor * llm_build_qwen3_5::build_layer_ffn(ggml_tensor * cur, const int il) { + // Qwen3.5 Dense always uses dense FFN + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + return cur; +} diff --git a/src/models/qwen3-5moe.cpp b/src/models/qwen3-5moe.cpp new file mode 100644 index 0000000000..a488443218 --- /dev/null +++ b/src/models/qwen3-5moe.cpp @@ -0,0 +1,52 @@ +#include "models.h" + +llm_build_qwen3_5_moe::llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params) : + llm_build_qwen3_5(model, params, defer_graph_build_t{}) { + build_graph(); +} + +ggml_tensor * llm_build_qwen3_5_moe::build_layer_ffn(ggml_tensor * cur, const int il) { + // Check if this is an MoE layer + if (model.layers[il].ffn_gate_inp != nullptr) { + // MoE branch + ggml_tensor * moe_out = + build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, LLM_FFN_SILU, + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(moe_out, "ffn_moe_out", il); + + // Add shared experts if present + if (model.layers[il].ffn_up_shexp != nullptr) { + ggml_tensor * ffn_shexp = + build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + // Apply shared expert gating (sigmoid) + ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); + cb(shared_gate, "shared_expert_gate", il); + + shared_gate = ggml_sigmoid(ctx0, shared_gate); + cb(shared_gate, "shared_expert_gate_sigmoid", il); + + ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); + cb(ffn_shexp, "ffn_shexp_gated", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } else { + cur = moe_out; + } + } else { + // Dense FFN branch (fallback) + cur = llm_build_qwen3_5::build_layer_ffn(cur, il); + } + return cur; +} diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 99b1a76a48..0335f5ab76 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -1,10 +1,9 @@ -#include "ggml.h" #include "models.h" #define CHUNK_SIZE 64 llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_mamba(params), model(model) { + llm_graph_context_delta(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; @@ -86,362 +85,6 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_build_forward_expand(gf, cur); } -// utility to get one slice from the third dimension -// input dim: [x, y, c, b] -// output dim: [x, y, 1, b] -static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) { - return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], - t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c); -} - -std::pair llm_build_qwen3next::build_delta_net_chunking( - ggml_tensor 
* q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il) { - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); - - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf(S_v); - - q = ggml_scale(ctx0, q, scale); - - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); - - beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(g, "g_perm", il); - cb(state, "state_in", il); - - GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); - GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); - - // Do padding - const int64_t chunk_size = CHUNK_SIZE; - - const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; - const int64_t n_chunks = (n_tokens + pad) / chunk_size; - - q = ggml_pad(ctx0, q, 0, pad, 0, 0); - k = ggml_pad(ctx0, k, 0, pad, 0, 0); - v = ggml_pad(ctx0, v, 0, pad, 0, 0); - g = ggml_pad(ctx0, g, pad, 0, 0, 0); - beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); - - cb(q, "q_pad", il); - cb(k, "k_pad", il); - cb(v, "v_pad", il); - cb(beta, "beta_pad", il); - cb(g, "g_pad", il); - - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - - cb(v_beta, "v_beta", il); - cb(k_beta, "k_beta", il); - - q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); - k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); - k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); - v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); - v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - - g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * 
n_seqs); - beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); - - ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); - cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); - - ggml_tensor * gcs_j_broadcast = - ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); - - ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); - cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); - - ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); - cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, causal_mask); - attn = ggml_add(ctx0, attn, identity); - cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); - - ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum)); - ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); - - ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); - cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * k_cumdecay = - ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); - cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); - attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); - attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); - cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) - - - // vectorized calculation of key_gdiff - // improved from the chunked version: - // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1) - // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp() - // key_gdiff = key * g_diff.unsqueeze(-1) - // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew - - // get last element in g_cumsum along chunk_size dimension (ne0) - // example: [[x, y, z, ..., last], ...] -> [[last], ...] 
- ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); - g_last = ggml_cont(ctx0, g_last); - cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last); - cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); - cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - - ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); - ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, - 1, chunk_size, n_chunks, g_diff_exp->ne[3]); - - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); - cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) - - ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); - cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) - - - // state to be updated per chunk - ggml_tensor * new_state = state; // ggml_dup(ctx0, state); - cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs) - - // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs) - ggml_tensor * core_attn_out = nullptr; - - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { - // shape: (S_k, chunk_size, 1, H_k * n_seqs) - ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul - - // shape: (S_v, chunk_size, 1, H_v * n_seqs) - ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat - - // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) - ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul - - // shape: (chunk_size, 1, H_v * n_seqs) - ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat - - // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) - // replaced by precomputed attn_kq - ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); - cb(attn_chunk, "attn_chunk", il); - - ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); - - // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); - cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs) - - // v_new = v_i - v_prime - ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); - cb(v_new, "v_new_chunk", il); - - // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state - ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); - cb(attn_inter, "attn_inter_chunk", il); - - // core_attn_out[:, :, i] = attn_inter + attn @ v_new - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); - cb(v_attn, "v_attn_chunk", il); - - ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); - cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs) - - core_attn_out = core_attn_out == nullptr - ? 
core_attn_out_chunk - : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); - - // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk); - //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why? - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t); - - // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew - ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); - new_state = ggml_add(ctx0, - ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), - ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); - } - - // truncate padded tokens - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, - S_v, n_tokens, H_v, n_seqs, - ggml_row_size(core_attn_out->type, S_v), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); - output_tokens = ggml_cont(ctx0, output_tokens); - cb(output_tokens, "output_tokens", il); - - // permute back to (S_v, H_v, n_tokens, n_seqs) - output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); - output_tokens = ggml_cont(ctx0, output_tokens); - - return {output_tokens, new_state}; -} - -std::pair llm_build_qwen3next::build_delta_net_autoregressive( - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il) { - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); - - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - - GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case - - const float eps_norm = hparams.f_norm_rms_eps; - - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf(S_v); - - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); - - ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); - ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - - // Apply exponential to g_t - g_t = ggml_exp(ctx0, g_t); - - // Apply the gated delta rule for the single timestep - // last_recurrent_state = last_recurrent_state * g_t - state = ggml_mul(ctx0, state, g_t); - - // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) - ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs); - ggml_tensor * kv_mem = ggml_mul(ctx0, state, 
k_t_unsqueezed); - // we need to sum over dim=-2, so we transpose, sum, then transpose again - kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem)))); - - // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v) - ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); - // delta = (v_t - kv_mem) * beta_t - ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs] - ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t); - - // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta - ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta); - state = ggml_add(ctx0, state, k_t_delta); - - // Compute the attention output - // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) - ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t - ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed); - // again, since it's over dim = -2, transpose, sum, transpose back - ggml_tensor * core_attn_out = - ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q)))); - - // core_attn_out should be [S_v, 1, H_v, n_seqs] after this - cb(core_attn_out, "output_tokens", il); - cb(state, "new_state", il); - - return {core_attn_out, state}; -} - ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * input, ggml_tensor * weights, @@ -752,7 +395,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); cb(state, "state_predelta", il); // if head keys and value keys are different, repeat to force tensors into matching shapes @@ -781,13 +424,10 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens - std::pair attn_out; // pair of (output, new_state) - if (n_seq_tokens == 1) { - attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); - } else { - attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); - } + std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, + gate, beta, state, causal_mask, identity, diag_mask, + il, CHUNK_SIZE, hparams.f_norm_rms_eps); + ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; cb(output, "attn_output", il); From 1e8924fd65ad349d1d838412a2172292618f3bbf Mon Sep 17 00:00:00 2001 From: Hugo Date: Mon, 9 Feb 2026 06:12:02 +0000 Subject: [PATCH 10/17] cmake : add variable to skip installing tests (#19370) When packaging downstream, there's usually little point in installing test. The default behaviour remains the same. 
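
A minimal packaging sketch using the new option (directory names are illustrative; the default `LLAMA_TESTS_INSTALL=ON` keeps the current behaviour):

```bash
# Configure with test installation disabled; tests can still be built and run
# from the build tree, they are simply skipped by `cmake --install`.
cmake -S . -B build -DLLAMA_TESTS_INSTALL=OFF
cmake --build build
cmake --install build
```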
--- CMakeLists.txt | 1 + tests/CMakeLists.txt | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d4ed67020..55f3d594db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) +option(LLAMA_TESTS_INSTALL "llama: install tests" ON) # 3rd party libs option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c9436c5995..350bffc315 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,7 +11,9 @@ function(llama_build source) add_executable(${TEST_TARGET} ${TEST_SOURCES}) target_link_libraries(${TEST_TARGET} PRIVATE common) - install(TARGETS ${TEST_TARGET} RUNTIME) + if (LLAMA_TESTS_INSTALL) + install(TARGETS ${TEST_TARGET} RUNTIME) + endif() endfunction() function(llama_test target) @@ -100,7 +102,9 @@ function(llama_build_and_test source) endif() add_executable(${TEST_TARGET} ${TEST_SOURCES}) - install(TARGETS ${TEST_TARGET} RUNTIME) + if (LLAMA_TESTS_INSTALL) + install(TARGETS ${TEST_TARGET} RUNTIME) + endif() target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( From f5e7734ff2e1d2e22015f4a9da9a52c70240a064 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 9 Feb 2026 13:15:42 +0100 Subject: [PATCH 11/17] ggml-virtgpu: add backend documentation (#19354) * ggml-virtgpu: add backend documentation Assisted-by-AI: Claude Code * CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget * README: add the link to docs/backend/GGML-VirtGPU/ggml-virt.md * docs/ggml-virt: add link to testing + configuration * Revert "CODEOWNERS: add /docs/backend/GGML-VirtGPU/ -> kpouget" This reverts commit 8ece8e72e24d305f308505c08ebb75804546374e. * drop the ggml- prefix * s/ggerganov/ggml-org * Relocate VirtGPU.md * reorganize the text * turn turn the ascii diagram into a mermaid * README.md: update the link to the main doc --- README.md | 1 + docs/backend/VirtGPU.md | 180 +++++++++++++++++++++ docs/backend/VirtGPU/configuration.md | 174 ++++++++++++++++++++ docs/backend/VirtGPU/development.md | 220 ++++++++++++++++++++++++++ 4 files changed, 575 insertions(+) create mode 100644 docs/backend/VirtGPU.md create mode 100644 docs/backend/VirtGPU/configuration.md create mode 100644 docs/backend/VirtGPU/development.md diff --git a/README.md b/README.md index dac020ad37..5c11f38048 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | | [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon | +| [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | ## Obtaining and quantizing models diff --git a/docs/backend/VirtGPU.md b/docs/backend/VirtGPU.md new file mode 100644 index 0000000000..c81468da13 --- /dev/null +++ b/docs/backend/VirtGPU.md @@ -0,0 +1,180 @@ +# GGML-VirtGPU Backend + +The GGML-VirtGPU backend enables GGML applications to run machine +learning computations on host hardware while the application itself +runs inside a virtual machine. 
It uses host-guest shared memory to
+efficiently share data buffers between the two sides.
+
+This backend relies on virtio-gpu and the VirglRenderer API Remoting
+(APIR) component. The backend is split into two libraries:
+- a GGML implementation (the "remoting frontend"), running in the
+  guest and interacting with the virtgpu device
+- a VirglRenderer APIR-compatible library (the "remoting backend"),
+  running in the host and interacting with Virglrenderer and an actual
+  GGML device backend.
+
+## OS support
+
+| OS | Status | Backend | CI testing | Notes
+| -------- | ----------------- | ----------- | ----------- | -----
+| MacOS 14 | Supported | ggml-metal | X | Working when compiled on MacOS 14
+| MacOS 15 | Supported | ggml-metal | X | Working when compiled on MacOS 14 or MacOS 15
+| MacOS 26 | Not tested | | |
+| Linux | Under development | ggml-vulkan | not working | Working locally, CI running into deadlocks
+
+
+## Architecture Overview
+
+The GGML-VirtGPU backend consists of three main components:
+
+```mermaid
+graph TD
+    %% Nodes
+
+    subgraph GuestVM ["Guest VM - Frontend"]
+        App([GGML Application 
llama.cpp, etc.]) + + direction TB + Interface[GGML Backend Interface] + Comm["GGML-VirtGPU
(hypercalls + shared mem)"] + + App --> Interface + Interface --> Comm + end + + API[virtio-gpu / virglrenderer API] + + subgraph HostSystem [Host System - Backend] + direction TB + Dispatcher[GGML-VirtGPU-Backend] + BackendLib[GGML Backend library
Metal / Vulkan / CPU / ...] + + Dispatcher --> BackendLib + end + + %% Connections + Comm --> API + API --> HostSystem +``` + +### Key Components + +1. **Guest-side Frontend** (`ggml-virtgpu/`): Implements the GGML backend interface and forwards operations to the host +2. **Host-side Backend** (`ggml-virtgpu/backend/`): Receives forwarded operations and executes them on actual hardware backends +3. **Communication Layer**: Uses virtio-gpu hypercalls and shared memory for efficient data transfer + +## Features + +- **Dynamic backend loading** on the host side (CPU, CUDA, Metal, etc.) +- **Zero-copy data transfer** via host-guest shared memory pages + +## Communication Protocol + +### Hypercalls and Shared Memory + +The backend uses two primary communication mechanisms: + +1. **Hypercalls (`DRM_IOCTL_VIRTGPU_EXECBUFFER`)**: Trigger remote execution from guest to host +2. **Shared Memory Pages**: Zero-copy data transfer for tensors and parameters + +#### Shared Memory Layout + +Each connection uses two shared memory buffers: + +- **Data Buffer** (24 MiB): For command/response data and tensor transfers +- **Reply Buffer** (16 KiB): For command replies and status information +- **Data Buffers**: Dynamically allocated host-guest shared buffers + served as GGML buffers. + +### APIR Protocol + +The Virglrender API Remoting protocol defines three command types: + +- `HANDSHAKE`: Protocol version negotiation and capability discovery +- `LOADLIBRARY`: Dynamic loading of backend libraries on the host +- `FORWARD`: API function call forwarding + +### Binary Serialization + +Commands and data are serialized using a custom binary protocol with: + +- Fixed-size encoding for basic types +- Variable-length arrays with size prefixes +- Buffer bounds checking +- Error recovery mechanisms + +## Supported Operations + +### Device Operations +- Device enumeration and capability queries +- Memory information (total/free) +- Backend type detection + +### Buffer Operations +- Buffer allocation and deallocation +- Tensor data transfer (host ↔ guest) +- Memory copying and clearing + +### Computation Operations +- Graph execution forwarding + +## Build Requirements + +### Guest-side Dependencies +- `libdrm` for DRM/virtio-gpu communication +- C++20 compatible compiler +- CMake 3.14+ + +### Host-side Dependencies +- virglrenderer with APIR support (pending upstream review) +- Target backend libraries (libggml-metal, libggml-vulkan, etc.) + +## Configuration + +### Environment Variables + +- `GGML_VIRTGPU_BACKEND_LIBRARY`: Path to the host-side backend library +- `GGML_VIRTGPU_DEBUG`: Enable debug logging + +### Build Options + +- `GGML_VIRTGPU`: Enable the VirtGPU backend (`ON` or `OFF`, default: `OFF`) +- `GGML_VIRTGPU_BACKEND`: Build the host-side backend component (`ON`, `OFF` or `ONLY`, default: `OFF`) + +### System Requirements + +- VM with virtio-gpu support +- VirglRenderer with APIR patches +- Compatible backend libraries on host + +## Limitations + +- **VM-specific**: Only works in virtual machines with virtio-gpu support +- **Host dependency**: Requires properly configured host-side backend +- **Latency**: Small overhead from VM escaping for each operation + + +* This work is pending upstream changes in the VirglRenderer + project. 
+ * The backend can be tested with Virglrenderer compiled from source + using this PR: + https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 +* This work is pending changes in the VMM/hypervisor running the + virtual machine, which need to know how to route the newly + introduced APIR capset. + * The environment variable `VIRGL_ROUTE_VENUS_TO_APIR=1` allows + using the Venus capset, until the relevant hypervisors have been + patched. However, setting this flag breaks the Vulkan/Venus normal + behavior. + * The environment variable `GGML_REMOTING_USE_APIR_CAPSET` tells the + `ggml-virtgpu` backend to use the APIR capset. This will become + the default when the relevant hypervisors have been patched. + +* This work focused on improving the performance of llama.cpp running + on MacOS containers, and is mainly tested on this platform. The + linux support (via `krun`) is in progress. + +## See Also + +- [Development and Testing](VirtGPU/development.md) +- [Backend configuration](VirtGPU/configuration.md) diff --git a/docs/backend/VirtGPU/configuration.md b/docs/backend/VirtGPU/configuration.md new file mode 100644 index 0000000000..597862d5c8 --- /dev/null +++ b/docs/backend/VirtGPU/configuration.md @@ -0,0 +1,174 @@ +# GGML-VirtGPU Backend Configuration + +This document describes the environment variables used by the ggml-virtgpu backend system, covering both the frontend (guest-side) and backend (host-side) components. + +## Environment Variables Overview + +The ggml-virtgpu backend uses environment variables for configuration across three main components: +- **Frontend (Guest)**: GGML applications running in VMs +- **Hypervisor**: Virglrenderer/APIR system +- **Backend (Host)**: Host-side GGML backend integration + +## Frontend (Guest-side) Configuration + +### GGML_REMOTING_USE_APIR_CAPSET +- **Location**: `ggml/src/ggml-virtgpu/virtgpu.cpp` +- **Type**: Boolean flag (presence-based) +- **Purpose**: Controls which virtio-gpu capability set to use for communication +- **Values**: + - Set (any value): Use the APIR capset (long-term setup) + - Unset: Use the Venus capset (easier for testing with an unmodified hypervisor) +- **Default**: Unset (Venus capset) +- **Usage**: + ```bash + export GGML_REMOTING_USE_APIR_CAPSET=1 # Use APIR capset + # or leave unset for Venus capset + ``` + +## Hypervisor (Virglrenderer/APIR) Configuration + +These environment variables are used during the transition phase for +running with an unmodified hypervisor (not supporting the +VirglRenderer APIR component). They will be removed in the future, and +the hypervisor will instead configure VirglRenderer with the APIR +_Configuration Key_. 
+ +### VIRGL_APIR_BACKEND_LIBRARY +- **Location**: `virglrenderer/src/apir/apir-context.c` +- **Configuration Key**: `apir.load_library.path` +- **Type**: File path string +- **Purpose**: Path to the APIR backend library that virglrenderer should dynamically load +- **Required**: Yes +- **Example**: + ```bash + export VIRGL_APIR_BACKEND_LIBRARY="/path/to/libggml-remotingbackend.so" + ``` + +### VIRGL_ROUTE_VENUS_TO_APIR +- **Location**: `virglrenderer/src/apir/apir-renderer.h` +- **Type**: Boolean flag (presence-based) +- **Purpose**: Temporary workaround to route Venus capset calls to APIR during hypervisor transition period +- **Status**: will be removed once hypervisors support APIR natively +- **Warning**: Breaks normal Vulkan/Venus functionality +- **Usage**: + ```bash + export VIRGL_ROUTE_VENUS_TO_APIR=1 # For testing with an unmodified hypervisor + ``` + +### VIRGL_APIR_LOG_TO_FILE +- **Location**: `virglrenderer/src/apir/apir-renderer.c` +- **Environment Variable**: `VIRGL_APIR_LOG_TO_FILE` +- **Type**: File path string +- **Purpose**: Enable debug logging from the VirglRenderer APIR component to specified file +- **Required**: No (optional debugging) +- **Default**: Logging to `stderr` +- **Usage**: + ```bash + export VIRGL_APIR_LOG_TO_FILE="/tmp/apir-debug.log" + ``` + +## Backend (Host-side) Configuration + +These environment variables are used during the transition phase for +running with an unmodified hypervisor (not supporting the +VirglRenderer APIR component). They will be removed in the future, and +the hypervisor will instead configure VirglRenderer with the APIR +_Configuration Key_. + +### APIR_LLAMA_CPP_GGML_LIBRARY_PATH +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp` +- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_PATH` +- **Configuration Key**: `ggml.library.path` +- **Type**: File path string +- **Purpose**: Path to the actual GGML backend library (Metal, CUDA, Vulkan, etc.) 
+- **Required**: **Yes** - backend initialization fails without this +- **Examples**: + ```bash + # macOS with Metal backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib" + + # Linux with CUDA backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-cuda.so" + + # macOS or Linux with Vulkan backend + export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-vulkan.so" + ``` + +### APIR_LLAMA_CPP_GGML_LIBRARY_REG +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp` +- **Environment Variable**: `APIR_LLAMA_CPP_GGML_LIBRARY_REG` +- **Configuration Key**: `ggml.library.reg` +- **Type**: Function symbol name string +- **Purpose**: Name of the backend registration function to call after loading the library +- **Required**: No (defaults to `ggml_backend_init`) +- **Default**: `ggml_backend_init` +- **Examples**: + ```bash + # Metal backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg" + + # CUDA backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_cuda_reg" + + # Vulkan backend + export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_vulkan_reg" + + # Generic fallback (default) + # export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_init" + ``` + +### APIR_LLAMA_CPP_LOG_TO_FILE +- **Location**: `ggml/src/ggml-virtgpu/backend/backend.cpp:62` +- **Environment Variable**: `APIR_LLAMA_CPP_LOG_TO_FILE` +- **Type**: File path string +- **Purpose**: Enable debug logging from the GGML backend to specified file +- **Required**: No (optional debugging) +- **Usage**: + ```bash + export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml-backend-debug.log" + ``` + +## Configuration Flow + +The configuration system works as follows: + +1. **Hypervisor Setup**: Virglrenderer loads the APIR backend library specified by `VIRGL_APIR_BACKEND_LIBRARY` + +2. **Context Creation**: When an APIR context is created, it populates a configuration table with environment variables: + - `apir.load_library.path` ← `VIRGL_APIR_BACKEND_LIBRARY` + - `ggml.library.path` ← `APIR_LLAMA_CPP_GGML_LIBRARY_PATH` + - `ggml.library.reg` ← `APIR_LLAMA_CPP_GGML_LIBRARY_REG` + - this step will eventually be performed by the hypervisor itself, with command-line arguments instead of environment variables. + +3. **Backend Initialization**: The backend queries the configuration via callbacks: + - `virgl_cbs->get_config(ctx_id, "ggml.library.path")` returns the library path + - `virgl_cbs->get_config(ctx_id, "ggml.library.reg")` returns the registration function + +4. 
**Library Loading**: The backend dynamically loads and initializes the specified GGML library + +## Error Messages + +Common error scenarios and their messages: + +- **Missing library path**: `"cannot open the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_PATH' not defined"` +- **Missing registration function**: `"cannot register the GGML library: env var 'APIR_LLAMA_CPP_GGML_LIBRARY_REG' not defined"` + +## Example Complete Configuration + +Here's an example configuration for a macOS host with Metal backend: + +```bash +# Hypervisor environment +export VIRGL_APIR_BACKEND_LIBRARY="/opt/llama.cpp/lib/libggml-virtgpu-backend.dylib" + +# Backend configuration +export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="/opt/llama.cpp/lib/libggml-metal.dylib" +export APIR_LLAMA_CPP_GGML_LIBRARY_REG="ggml_backend_metal_reg" + +# Optional logging +export VIRGL_APIR_LOG_TO_FILE="/tmp/apir.log" +export APIR_LLAMA_CPP_LOG_TO_FILE="/tmp/ggml.log" + +# Guest configuration +export GGML_REMOTING_USE_APIR_CAPSET=1 +``` diff --git a/docs/backend/VirtGPU/development.md b/docs/backend/VirtGPU/development.md new file mode 100644 index 0000000000..ca2e47772a --- /dev/null +++ b/docs/backend/VirtGPU/development.md @@ -0,0 +1,220 @@ +# Development and Testing + +## Development + +### Code Generation + +The backend uses code generation from YAML configuration: + +```bash +# Regenerate protocol code +cd ggml-virtgpu/ +python regenerate_remoting.py +``` + +### Adding New Operations + +1. Add function definition to `ggmlremoting_functions.yaml` +2. Regenerate code with `regenerate_remoting.py` +3. Implement guest-side forwarding in `virtgpu-forward-*.cpp` +4. Implement host-side handling in `backend-dispatched-*.cpp` + +## Testing + +This document provides instructions for building and testing the GGML-VirtGPU backend on macOS with containers. + +### Prerequisites + +The testing setup requires: + +- macOS host system +- Container runtime with `libkrun` provider (podman machine) +- Access to development patchset for VirglRenderer + +### Required Patchsets + +The backend requires patches that are currently under review: + +- **Virglrenderer APIR upstream PR**: https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590 (for reference) +- **MacOS Virglrenderer (for krunkit)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-macos +- **Linux Virglrenderer (for krun)**: https://gitlab.freedesktop.org/kpouget/virglrenderer/-/tree/main-linux + +### Build Instructions + +#### 1. Build ggml-virtgpu-backend (Host-side, macOS) + +```bash +# Build the backend that runs natively on macOS +mkdir llama.cpp +cd llama.cpp +git clone https://github.com/ggml-org/llama.cpp.git src +cd src + +LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend + +cmake -S . -B $LLAMA_MAC_BUILD \ + -DGGML_NATIVE=OFF \ + -DLLAMA_CURL=ON \ + -DGGML_REMOTINGBACKEND=ONLY \ + -DGGML_METAL=ON + +TARGETS="ggml-metal" +cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $TARGETS + +# Build additional tools for native benchmarking +EXTRA_TARGETS="llama-run llama-bench" +cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS +``` + +#### 2. 
Build virglrenderer (Host-side, macOS) + +```bash +# Build virglrenderer with APIR support +mkdir virglrenderer +git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src +cd src + +VIRGL_BUILD_DIR=$PWD/build + +# -Dvenus=true and VIRGL_ROUTE_VENUS_TO_APIR=1 route the APIR requests via the Venus backend, for easier testing without a patched hypervisor + +meson setup $VIRGL_BUILD_DIR \ + -Dvenus=true \ + -Dapir=true + +ninja -C $VIRGL_BUILD_DIR +``` + +#### 3. Build ggml-virtgpu (Guest-side, Linux) + +Option A: Build from a script: + +```bash +# Inside a Linux container +mkdir llama.cpp +git clone https://github.com/ggml-org/llama.cpp.git src +cd src + +LLAMA_LINUX_BUILD=$PWD//build-virtgpu + +cmake -S . -B $LLAMA_LINUX_BUILD \ + -DGGML_VIRTGPU=ON + +ninja -C $LLAMA_LINUX_BUILD +``` + +Option B: Build container image with frontend: + +```bash +cat << EOF > remoting.containerfile +FROM quay.io/fedora/fedora:43 +USER 0 + +WORKDIR /app/remoting + +ARG LLAMA_CPP_REPO="https://github.com/ggml-org/llama.cpp.git" +ARG LLAMA_CPP_VERSION="master" +ARG LLAMA_CPP_CMAKE_FLAGS="-DGGML_VIRTGPU=ON" +ARG LLAMA_CPP_CMAKE_BUILD_FLAGS="--parallel 4" + +RUN dnf install -y git cmake gcc gcc-c++ libcurl-devel libdrm-devel + +RUN git clone "\${LLAMA_CPP_REPO}" src \\ + && git -C src fetch origin \${LLAMA_CPP_VERSION} \\ + && git -C src reset --hard FETCH_HEAD + +RUN mkdir -p build \\ + && cd src \\ + && set -o pipefail \\ + && cmake -S . -B ../build \${LLAMA_CPP_CMAKE_FLAGS} \\ + && cmake --build ../build/ \${LLAMA_CPP_CMAKE_BUILD_FLAGS} + +ENTRYPOINT ["/app/remoting/src/build/bin/llama-server"] +EOF + +mkdir -p empty_dir +podman build -f remoting.containerfile ./empty_dir -t localhost/llama-cpp.virtgpu +``` + +### Environment Setup + +#### Set krunkit Environment Variables + +```bash +# Define the base directories (adapt these paths to your system) +VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build +LLAMA_MAC_BUILD=$HOME/remoting/llama.cpp/build-backend + +# For krunkit to load the custom virglrenderer library +export DYLD_LIBRARY_PATH=$VIRGL_BUILD_DIR/src + +# For Virglrenderer to load the ggml-remotingbackend library +export VIRGL_APIR_BACKEND_LIBRARY="$LLAMA_MAC_BUILD/bin/libggml-virtgpu-backend.dylib" + +# For llama.cpp remotingbackend to load the ggml-metal backend +export APIR_LLAMA_CPP_GGML_LIBRARY_PATH="$LLAMA_MAC_BUILD/bin/libggml-metal.dylib" +export APIR_LLAMA_CPP_GGML_LIBRARY_REG=ggml_backend_metal_reg +``` + +#### Launch Container Environment + +```bash +# Set container provider to libkrun +export CONTAINERS_MACHINE_PROVIDER=libkrun +podman machine start +``` + +#### Verify Environment + +Confirm that krunkit is using the correct virglrenderer library: + +```bash +lsof -c krunkit | grep virglrenderer +# Expected output: +# krunkit 50574 user txt REG 1,14 2273912 10849442 ($VIRGL_BUILD_DIR/src)/libvirglrenderer.1.dylib +``` + +### Running Tests + +#### Launch Test Container + +```bash +# Optional model caching +mkdir -p models +PODMAN_CACHE_ARGS="-v models:/models --user root:root --cgroupns host --security-opt label=disable -w /models" + +podman run $PODMAN_CACHE_ARGS -it --rm --device /dev/dri localhost/llama-cpp.virtgpu +``` + +#### Test llama.cpp in Container + +```bash + +# Run performance benchmark +/app/remoting/build/bin/llama-bench -m ./llama3.2 +``` + +Expected output (performance may vary): +``` +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | -------------------: 
| +| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | pp512 | 991.30 ± 0.66 | +| llama 3B Q4_K - Medium | 1.87 GiB | 3.21 B | ggml-virtgpu | 99 | tg128 | 85.71 ± 0.11 | +``` + +### Troubleshooting + +#### SSH Environment Variable Issues + +⚠️ **Warning**: Setting `DYLD_LIBRARY_PATH` from SSH doesn't work on macOS. Here is a workaround: + +**Workaround 1: Replace system library** +```bash +VIRGL_BUILD_DIR=$HOME/remoting/virglrenderer/build # ⚠️ adapt to your system +BREW_VIRGL_DIR=/opt/homebrew/Cellar/virglrenderer/0.10.4d/lib +VIRGL_LIB=libvirglrenderer.1.dylib + +cd $BREW_VIRGL_DIR +mv $VIRGL_LIB ${VIRGL_LIB}.orig +ln -s $VIRGL_BUILD_DIR/src/$VIRGL_LIB +``` From 972f323e73bf0b28358ccaa3b9aa02779421f260 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Feb 2026 14:57:51 +0200 Subject: [PATCH 12/17] revert : "[Model] Qwen3.5 dense and MoE support (no vision) (#19435)" (#19453) This reverts commit 39bf692af1cba2a1072e4a42425611bf1ec2807d. --- convert_hf_to_gguf.py | 78 ++--- gguf-py/gguf/constants.py | 59 ---- gguf-py/gguf/tensor_mapping.py | 6 +- src/CMakeLists.txt | 3 - src/llama-arch.cpp | 61 ---- src/llama-arch.h | 2 - src/llama-context.cpp | 2 +- src/llama-model.cpp | 154 -------- src/models/delta.cpp | 618 --------------------------------- src/models/kimi-linear.cpp | 1 + src/models/models.h | 102 +----- src/models/qwen3-5.cpp | 421 ---------------------- src/models/qwen3-5moe.cpp | 52 --- src/models/qwen3next.cpp | 372 +++++++++++++++++++- 14 files changed, 399 insertions(+), 1532 deletions(-) delete mode 100644 src/models/delta.cpp delete mode 100644 src/models/qwen3-5.cpp delete mode 100644 src/models/qwen3-5moe.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e64756a74a..843c00a896 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4102,27 +4102,39 @@ class Qwen2MoeModel(TextModel): # process the experts separately name = name.replace("language_model.", "") # InternVL - # handle pre-packed expert tensors (e.g. Qwen3.5 MoE, Qwen3Next) - # HF stores these using nn.Linear convention: [n_expert, out_features, in_features] - # This matches the individual expert stacking path below (which stacks - # per-expert [out, in] weights into [n_expert, out, in]), so no permute is needed. 
+ # handle aggregated expert tensors + # GGUF stores dimensions reversed from PyTorch, so: + # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A} + # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp) + # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"): mapped = f"{name}.weight" if not name.endswith(".weight") else name - # HF: [n_expert, n_embd, n_ff] → GGML: {n_ff, n_embd, n_expert} ✓ - yield from super().modify_tensors(data_torch, mapped, bid) + # Input: (n_expert=128, n_ff_exp=768, n_embd=2048) + # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128} + # Need PyTorch: (128, 2048, 768) [reversed of GGML] + # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) + permuted = data_torch.permute(0, 2, 1).contiguous() + yield from super().modify_tensors(permuted, mapped, bid) return if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): - # HF: [n_expert, 2*n_ff, n_embd] → split on dim=1 - n_ff = data_torch.shape[1] // 2 - gate = data_torch[:, :n_ff, :].contiguous() - up = data_torch[:, n_ff:, :].contiguous() - # gate/up: [n_expert, n_ff, n_embd] → GGML: {n_embd, n_ff, n_expert} ✓ - base_name = name.removesuffix(".weight").removesuffix(".gate_up_proj") - mapped_gate = f"{base_name}.gate_proj.weight" - mapped_up = f"{base_name}.up_proj.weight" - yield from super().modify_tensors(gate, mapped_gate, bid) - yield from super().modify_tensors(up, mapped_up, bid) + if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: + raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}") + split_dim = data_torch.shape[-1] // 2 + gate = data_torch[..., :split_dim].contiguous() + up = data_torch[..., split_dim:].contiguous() + # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768) + # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128} + # Need PyTorch: (128, 768, 2048) [reversed of GGML] + # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048) + base_name = name.removesuffix(".weight") + base = base_name.rsplit('.', 1)[0] + mapped_gate = f"{base}.gate_proj.weight" + mapped_up = f"{base}.up_proj.weight" + perm_gate = gate.permute(0, 2, 1).contiguous() + perm_up = up.permute(0, 2, 1).contiguous() + yield from super().modify_tensors(perm_gate, mapped_gate, bid) + yield from super().modify_tensors(perm_up, mapped_up, bid) return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): @@ -4332,40 +4344,6 @@ class Qwen3NextModel(Qwen2MoeModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Qwen3_5ForCausalLM", "Qwen3_5TextForCausalLM") -class Qwen3_5Model(Qwen3NextModel): - model_arch = gguf.MODEL_ARCH.QWEN3_5 - - # Stores whichever of in_proj_a/in_proj_b is seen first, keyed by layer - _pending_ba: dict[int | None, tuple[str, Tensor]] = {} - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Handle split in_proj_b + in_proj_a → concatenated SSM_BETA_ALPHA - # safetensors sorts alphabetically so in_proj_a arrives before in_proj_b - if "in_proj_a.weight" in name or "in_proj_b.weight" in name: - which = "a" if "in_proj_a" in name else "b" - if bid not in self._pending_ba: - self._pending_ba[bid] = 
(which, data_torch) - return - prev_which, prev_tensor = self._pending_ba.pop(bid) - assert prev_which != which, f"duplicate in_proj_{which} for layer {bid}" - b_tensor = prev_tensor if prev_which == "b" else data_torch - a_tensor = prev_tensor if prev_which == "a" else data_torch - ba_combined = torch.cat([b_tensor, a_tensor], dim=0) - yield (self.format_tensor_name(gguf.MODEL_TENSOR.SSM_BETA_ALPHA, bid, ".weight"), ba_combined) - return - else: - # Qwen3Next uses .qkvz tensor, so we use the super to get the other functionalities - # (norm correction, A_log to A etc.) for free - # Qwen2Moe already does the gate_up conversion properly, just use that - yield from super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("Qwen3_5MoeForCausalLM", "Qwen3_5MoeTextForCausalLM") -class Qwen3_5MoeModel(Qwen3_5Model): - model_arch = gguf.MODEL_ARCH.QWEN3_5_MOE - - @ModelBase.register("RND1") class RND1Model(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.RND1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8a3fab1e1c..3af4fffe95 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -382,8 +382,6 @@ class MODEL_ARCH(IntEnum): QWEN3 = auto() QWEN3MOE = auto() QWEN3NEXT = auto() - QWEN3_5 = auto() - QWEN3_5_MOE = auto() QWEN3VL = auto() QWEN3VLMOE = auto() PHI2 = auto() @@ -814,8 +812,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.QWEN3: "qwen3", MODEL_ARCH.QWEN3MOE: "qwen3moe", MODEL_ARCH.QWEN3NEXT: "qwen3next", - MODEL_ARCH.QWEN3_5: "qwen3_5", - MODEL_ARCH.QWEN3_5_MOE: "qwen3_5moe", MODEL_ARCH.QWEN3VL: "qwen3vl", MODEL_ARCH.QWEN3VLMOE: "qwen3vlmoe", MODEL_ARCH.PHI2: "phi2", @@ -1788,61 +1784,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_BETA_ALPHA, MODEL_TENSOR.SSM_OUT ], - MODEL_ARCH.QWEN3_5: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_POST_NORM, - MODEL_TENSOR.ATTN_GATE, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_BETA_ALPHA, - MODEL_TENSOR.SSM_OUT, - ], - MODEL_ARCH.QWEN3_5_MOE: [ - MODEL_TENSOR.TOKEN_EMBD, - MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_Q_NORM, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_V, - MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_POST_NORM, - MODEL_TENSOR.ATTN_GATE, - MODEL_TENSOR.ATTN_QKV, - MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.FFN_GATE_INP_SHEXP, - MODEL_TENSOR.FFN_UP_SHEXP, - MODEL_TENSOR.FFN_DOWN_SHEXP, - MODEL_TENSOR.FFN_GATE_SHEXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_NORM, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_BETA_ALPHA, - MODEL_TENSOR.SSM_OUT, - ], MODEL_ARCH.QWEN3VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 43f32c7b52..167ade7803 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -228,7 +228,6 @@ class TensorNameMap: "transformer_encoder.{bid}.qkv", # neobert 
"layers.{bid}.attn.Wqkv", # modern-bert "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm - "model.layers.{bid}.linear_attn.in_proj_qkv", # qwen3.5 ), # Attention query @@ -359,9 +358,8 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_GATE: ( - "model.layers.{bid}.self_attn.gate_proj", # afmoe - "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate - "model.layers.{bid}.linear_attn.in_proj_z", # qwen3.5 + "model.layers.{bid}.self_attn.gate_proj", # afmoe + "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate ), # Feed-forward norm diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0c164617a1..2115fc4255 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,7 +57,6 @@ add_library(llama models/deci.cpp models/deepseek.cpp models/deepseek2.cpp - models/delta.cpp models/dots1.cpp models/dream.cpp models/ernie4-5-moe.cpp @@ -123,8 +122,6 @@ add_library(llama models/qwen3vl-moe.cpp models/qwen3moe.cpp models/qwen3next.cpp - models/qwen3-5.cpp - models/qwen3-5moe.cpp models/refact.cpp models/rnd1.cpp models/rwkv6-base.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fce46772d7..bd78f1e556 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -35,8 +35,6 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_QWEN3, "qwen3" }, { LLM_ARCH_QWEN3MOE, "qwen3moe" }, { LLM_ARCH_QWEN3NEXT, "qwen3next" }, - { LLM_ARCH_QWEN3_5, "qwen3_5" }, - { LLM_ARCH_QWEN3_5_MOE, "qwen3_5moe" }, { LLM_ARCH_QWEN3VL, "qwen3vl" }, { LLM_ARCH_QWEN3VLMOE, "qwen3vlmoe" }, { LLM_ARCH_PHI2, "phi2" }, @@ -987,63 +985,6 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, }; - case LLM_ARCH_QWEN3_5: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_POST_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_K_NORM, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_GATE, - LLM_TENSOR_FFN_GATE, - LLM_TENSOR_FFN_DOWN, - LLM_TENSOR_FFN_UP, - LLM_TENSOR_SSM_A_NOSCAN, - LLM_TENSOR_SSM_CONV1D, - LLM_TENSOR_SSM_DT, - LLM_TENSOR_SSM_BETA_ALPHA, - LLM_TENSOR_SSM_IN, - LLM_TENSOR_SSM_NORM, - LLM_TENSOR_SSM_OUT, - }; - case LLM_ARCH_QWEN3_5_MOE: - return { - LLM_TENSOR_TOKEN_EMBD, - LLM_TENSOR_OUTPUT_NORM, - LLM_TENSOR_OUTPUT, - LLM_TENSOR_ATTN_NORM, - LLM_TENSOR_ATTN_POST_NORM, - LLM_TENSOR_ATTN_Q, - LLM_TENSOR_ATTN_Q_NORM, - LLM_TENSOR_ATTN_K, - LLM_TENSOR_ATTN_K_NORM, - LLM_TENSOR_ATTN_V, - LLM_TENSOR_ATTN_OUT, - LLM_TENSOR_ATTN_QKV, - LLM_TENSOR_ATTN_GATE, - LLM_TENSOR_FFN_GATE_INP, - LLM_TENSOR_FFN_GATE_EXPS, - LLM_TENSOR_FFN_DOWN_EXPS, - LLM_TENSOR_FFN_UP_EXPS, - LLM_TENSOR_FFN_GATE_INP_SHEXP, - LLM_TENSOR_FFN_GATE_SHEXP, - LLM_TENSOR_FFN_DOWN_SHEXP, - LLM_TENSOR_FFN_UP_SHEXP, - LLM_TENSOR_SSM_A_NOSCAN, - LLM_TENSOR_SSM_CONV1D, - LLM_TENSOR_SSM_DT, - LLM_TENSOR_SSM_BETA_ALPHA, - LLM_TENSOR_SSM_IN, - LLM_TENSOR_SSM_NORM, - LLM_TENSOR_SSM_OUT, - }; case LLM_ARCH_QWEN3VL: case LLM_ARCH_CHAMELEON: case LLM_ARCH_HUNYUAN_DENSE: @@ -2733,8 +2674,6 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_KIMI_LINEAR: return true; default: diff --git a/src/llama-arch.h b/src/llama-arch.h index a392ecce2b..e8263369b8 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -39,8 +39,6 @@ enum llm_arch { LLM_ARCH_QWEN3, 
LLM_ARCH_QWEN3MOE, LLM_ARCH_QWEN3NEXT, - LLM_ARCH_QWEN3_5, - LLM_ARCH_QWEN3_5_MOE, LLM_ARCH_QWEN3VL, LLM_ARCH_QWEN3VLMOE, LLM_ARCH_PHI2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 80b9a7d46a..a6df893a31 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2013,7 +2013,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN3_5 || model.arch == LLM_ARCH_QWEN3_5_MOE || model.arch == LLM_ARCH_KIMI_LINEAR) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8fc61aee37..674d06c891 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2412,25 +2412,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: - { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - // Load linear attention (gated delta net) parameters - ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); - ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); - ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); - ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); - ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - - // Mark recurrent layers (linear attention layers) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); - } - } break; case LLM_ARCH_MISTRAL3: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -7113,129 +7094,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - // Shared experts - layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); - } - } break; - case LLM_ARCH_QWEN3_5: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - - // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - 
const int64_t conv_dim = key_dim * 2 + value_dim; - - const int64_t ba_dim = n_v_heads * 2; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - if (!hparams.is_recurrent(i)) { - // Full attention layers - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - } else { - // Linear attention (gated delta net) specific tensors - layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); - } - - // Dense FFN for all layers - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); - } - } break; - case LLM_ARCH_QWEN3_5_MOE: - { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); - - // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - - if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); - } - - const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - - // Calculate dimensions from hyperparameters - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t head_v_dim = hparams.ssm_d_state; - const int64_t n_k_heads = hparams.ssm_n_group; - const int64_t n_v_heads = hparams.ssm_dt_rank; - const int64_t key_dim = head_k_dim * n_k_heads; - const int64_t value_dim = head_v_dim * n_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; - - const int64_t ba_dim = n_v_heads * 2; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - - if (!hparams.is_recurrent(i)) { - // Full attention layers - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); - } else { - // Linear attention (gated delta net) specific tensors - layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, key_dim * 2 + value_dim * 2 }, TENSOR_NOT_REQUIRED); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0); - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); - } - - // MoE FFN - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); - // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0); @@ -7687,8 +7545,6 @@ void llama_model::print_info() const { arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_QWEN3NEXT || - arch == LLM_ARCH_QWEN3_5 || - arch == LLM_ARCH_QWEN3_5_MOE || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); @@ -8487,14 +8343,6 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; - case LLM_ARCH_QWEN3_5: - { - llm = std::make_unique(*this, params); - } break; - case LLM_ARCH_QWEN3_5_MOE: - { - llm = std::make_unique(*this, params); - } break; case LLM_ARCH_MISTRAL3: { llm = std::make_unique(*this, params); @@ -8755,8 +8603,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN3_5: - case LLM_ARCH_QWEN3_5_MOE: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: return LLAMA_ROPE_TYPE_NEOX; diff --git a/src/models/delta.cpp b/src/models/delta.cpp deleted file mode 100644 index d1d9837d09..0000000000 --- a/src/models/delta.cpp +++ /dev/null @@ -1,618 +0,0 @@ -#include "models.h" -#include "ggml.h" -#include -#include -#include - -llm_graph_context_delta::llm_graph_context_delta(const llm_graph_params & params) : llm_graph_context_mamba(params) {} - -/** - * Unified Delta Net implementation supporting both GDA and KDA modes. - * - * GDA (Gated Delta Attention): g has shape [H, T, B] in GGML (PyTorch: [B, T, H]) - * - Per-head gating, broadcasts over K dimension - * - * KDA (Key-wise Delta Attention): g has shape [K, H, T, B] in GGML (PyTorch: [B, T, H, K]) - * - Per-key gating - * - * The mode is auto-detected based on g's dimensionality. - * - * Tensor dimension convention: - * GGML: ne[0] is innermost (fastest varying), ne[3] is outermost - * PyTorch: dim 0 is outermost, dim -1 is innermost - * So GGML [A, B, C, D] corresponds to PyTorch [D, C, B, A] - */ - -// Helper to get a slice along dimension 2 (n_chunks dimension) -static ggml_tensor * get_slice_2d(ggml_context * ctx, ggml_tensor * t, int64_t chunk) { - return ggml_view_4d(ctx, t, - t->ne[0], t->ne[1], 1, t->ne[3], - t->nb[1], t->nb[2], t->nb[3], - chunk * t->nb[2]); -} - -/** - * Unified chunked Delta Net implementation. 
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] - * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] - * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] - * @param g Gate tensor: - * GDA: [H_v, n_tokens, n_seqs] - * KDA: [S_k, H_v, n_tokens, n_seqs] - * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - * @param causal_mask Lower triangular mask [chunk_size, chunk_size] - * @param identity Identity matrix [chunk_size, chunk_size] - * @param diag_mask Diagonal mask [chunk_size, chunk_size] - * @param il Layer index (for debugging callbacks) - * @param chunk_size Chunk size for chunked processing - * @param eps_norm Epsilon for L2 normalization - * - * @return Pair of (output_tokens, new_state) - */ -std::pair llm_graph_context_delta::build_delta_net_unified_chunking( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state_reshaped, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - // Detect KDA vs GDA based on g's shape - // GDA: g has shape [H_v, n_tokens, n_seqs] - // KDA: g has shape [S_k, H_v, n_tokens, n_seqs] (4D with ne[0]=S_k) - const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); - - // Validate tensor shapes - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(state_reshaped->ne[0] == S_v && state_reshaped->ne[1] == S_v && state_reshaped->ne[2] == H_v && state_reshaped->ne[3] == n_seqs); - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(H_k == H_v); - - if (is_kda) { - // KDA: g shape [S_k, H_v, n_tokens, n_seqs] - GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs); - } else { - // GDA: g shape [H_v, n_tokens, n_seqs] - GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); - } - - // L2 normalize q and k - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf((float)S_v); - q = ggml_scale(ctx0, q, scale); - - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - // Permute tensors to working format [S, n_tokens, H, n_seqs] - // Input: [S, H, n_tokens, n_seqs] -> permute(0, 2, 1, 3) -> [S, n_tokens, H, n_seqs] - q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - if (is_kda) { - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); - } else { - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); - } - beta = ggml_cont(ctx0, 
ggml_permute(ctx0, beta, 2, 0, 1, 3)); - - cb(q, "q_perm", il); - cb(k, "k_perm", il); - cb(v, "v_perm", il); - cb(beta, "beta_perm", il); - cb(g, "g_perm", il); - cb(state_reshaped, "state_in", il); - - // Padding for chunk processing - const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; - const int64_t n_chunks = (n_tokens + pad) / chunk_size; - - q = ggml_pad(ctx0, q, 0, pad, 0, 0); - k = ggml_pad(ctx0, k, 0, pad, 0, 0); - v = ggml_pad(ctx0, v, 0, pad, 0, 0); - beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); - g = ggml_pad(ctx0, g, pad, 0, 0, 0); - - - cb(q, "q_pad", il); - cb(k, "k_pad", il); - cb(v, "v_pad", il); - cb(beta, "beta_pad", il); - cb(g, "g_pad", il); - - ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); - ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); - - cb(v_beta, "v_beta", il); - cb(k_beta, "k_beta", il); - - // Reshape to chunks - q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); - k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); - k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); - v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); - v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); - beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); - - // Reshape g for chunks - ggml_tensor * g_cumsum; - ggml_tensor * g_cumsum_t; - if (is_kda) { - // KDA: g [S_k, n_tokens+pad, H_k, n_seqs] -> [S_k, chunk_size, n_chunks, H_k * n_seqs] - g = ggml_reshape_4d(ctx0, g, S_k, chunk_size, n_chunks, H_k * n_seqs); - // Cumsum along chunk_size dimension (ne[1]) - // GGML cumsum operates on ne[0], so we need to transpose, cumsum, transpose back - g = ggml_cont(ctx0, ggml_transpose(ctx0, g)); // [chunk_size, S_k, n_chunks, H_k * n_seqs] - g_cumsum_t = ggml_cumsum(ctx0, g); - g_cumsum = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum_t)); // [S_k, chunk_size, n_chunks, H_k * n_seqs] - } else { - // GDA: g [n_tokens+pad, 1, H_k, n_seqs] -> [chunk_size, 1, n_chunks, H_k * n_seqs] - g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs); - g_cumsum = ggml_cumsum(ctx0, g); - g_cumsum_t = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_k * n_seqs); - } - - cb(g_cumsum, "g_cumsum", il); - - // Build attention matrix A for the WY representation solve - // For GDA: A[j,i] = sum_k(k[j,k] * exp(g[j] - g[i]) * k[i,k]) = (k @ k^T) * exp(g[j] - g[i]) - // For KDA: A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - // KDA uses decay mask with S_k packed into batch to compute exp(g[j,k] - g[i,k]) per-key - - ggml_tensor * k_decay; - ggml_tensor * decay_mask = nullptr; - ggml_tensor * g_exp_pos = nullptr; - - if (is_kda) { - // KDA: Use decay mask with S_k in leading dimension for efficient mul_mat reduction - // A[j,i] = sum_k(k_beta[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - // By putting S_k in dim 0, mul_mat implicitly sums over it - - const int64_t CHB = n_chunks * H_k * n_seqs; - - // g_cumsum_t is [chunk_size, S_k, n_chunks, H_k * n_seqs] - // Reshape to [chunk_size, S_k, CHB] then build decay mask - ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); - ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, gcs, 1, chunk_size, S_k, CHB); - - // Build decay mask: [chunk_size, chunk_size, S_k, CHB] - ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); - decay_mask = ggml_sub(ctx0, gcs_j_bc, 
gcs_i); - - cb(decay_mask, "decay_mask_kda", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] for mul_mat reduction over S_k - decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - - // Reshape k and k_beta for broadcasting with decay_mask - // k_i: indexed at position i (dim 2 of decay_mask) - // k_beta_j: indexed at position j (dim 1 of decay_mask) - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); - ggml_tensor * k_beta_j = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, 1, CHB); - - // decay_k_beta_j[s,j,i,b] = decay[s,j,i,b] * k_beta[s,j,b] - ggml_tensor * decay_k_beta_j = ggml_mul(ctx0, decay_mask, k_beta_j); - - // mul_mat sums over S_k: result[j,1,i,CHB] = sum_s decay_k_beta_j[s,j,i,b] * k_i[s,1,i,b] - k_decay = ggml_mul_mat(ctx0, decay_k_beta_j, k_i); - k_decay = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, k_decay, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); - - // g_exp_pos is still needed for later (kbeta_gexp, etc.) - g_exp_pos = ggml_exp(ctx0, g_cumsum); - } else { - // GDA: Use decay mask approach (g broadcasts over K dimension) - // g_cumsum [chunk_size, 1, n_chunks, H_v * n_seqs] - ggml_tensor * gcs_i = g_cumsum; - ggml_tensor * gcs_j = g_cumsum_t; - g_exp_pos = ggml_exp(ctx0, g_cumsum_t); - ggml_tensor * gcs_j_broadcast = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); - decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); - - cb(decay_mask, "decay_mask", il); - - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - decay_mask = ggml_exp(ctx0, decay_mask); - decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); - - ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); - k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); - } - - ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); - - cb(attn, "attn_pre_solve", il); - - // Solve triangular system: (I + L) @ X = I, where L is strictly lower triangular - ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); - ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); - ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); - attn = ggml_mul(ctx0, lin_solve, causal_mask); - attn = ggml_add(ctx0, attn, identity); - - cb(attn, "attn_solved", il); - - // Compute u = A @ v and w = A @ (g.exp() * k) - v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); - - ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, g_exp_pos); - cb(kbeta_gexp, "kbeta_gexp", il); - - ggml_tensor * k_cumdecay = ggml_cont(ctx0, ggml_transpose(ctx0, - ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); - cb(k_cumdecay, "k_cumdecay", il); - - // Attention scores q @ k^T with decay - // For GDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j] - g[i]) * k[i,k]) - // For KDA: attn_kq[j,i] = sum_k(q[j,k] * exp(g[j,k] - g[i,k]) * k[i,k]) - ggml_tensor * attn_kq; - if (is_kda) { - // KDA: Same approach as k_decay - use decay_mask with S_k in leading dim - const int64_t CHB = n_chunks * H_k * n_seqs; - - // Rebuild decay mask (same structure as k_decay) - ggml_tensor * gcs = ggml_reshape_3d(ctx0, g_cumsum_t, chunk_size, S_k, CHB); - ggml_tensor * gcs_i = ggml_reshape_4d(ctx0, gcs, chunk_size, 1, S_k, CHB); - ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, 
gcs, 1, chunk_size, S_k, CHB); - ggml_tensor * gcs_j_bc = ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, S_k, CHB); - ggml_tensor * decay_mask_kq = ggml_sub(ctx0, gcs_j_bc, gcs_i); - - decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); - decay_mask_kq = ggml_exp(ctx0, decay_mask_kq); - decay_mask_kq = ggml_mul(ctx0, decay_mask_kq, diag_mask); - - // Permute to [S_k, chunk_size_j, chunk_size_i, CHB] - decay_mask_kq = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask_kq, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); - - // q_j: indexed at position j, k_i: indexed at position i - ggml_tensor * q_j = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); - ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); - - // decay_q_j[s,j,i,b] = decay[s,j,i,b] * q[s,j,b] - ggml_tensor * decay_q_j = ggml_mul(ctx0, decay_mask_kq, q_j); - - // mul_mat sums over S_k - attn_kq = ggml_mul_mat(ctx0, decay_q_j, k_i); - attn_kq = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, attn_kq, chunk_size, chunk_size, n_chunks, H_k * n_seqs))); - } else { - // GDA: Use decay mask - attn_kq = ggml_mul_mat(ctx0, k, q); - attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); - attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); - } - cb(attn_kq, "attn_kq", il); - - // Compute g_last and g_diff for state updates - ggml_tensor * g_last; - ggml_tensor * g_diff_exp; - ggml_tensor * g_last_exp; - - if (is_kda) { - // KDA: g_cumsum [S_k, chunk_size, n_chunks, H_k * n_seqs] - // Get last element along chunk_size dimension (ne[1]) - g_last = ggml_view_4d(ctx0, g_cumsum, - g_cumsum->ne[0], 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[1] - 1) * g_cumsum->nb[1]); - g_last = ggml_cont(ctx0, g_last); - g_last_exp = ggml_exp(ctx0, g_last); - - // g_diff = g_last - g_cumsum - ggml_tensor * g_last_broadcast = ggml_repeat_4d(ctx0, g_last, - g_cumsum->ne[0], g_cumsum->ne[1], g_cumsum->ne[2], g_cumsum->ne[3]); - ggml_tensor * g_diff = ggml_sub(ctx0, g_last_broadcast, g_cumsum); - g_diff_exp = ggml_exp(ctx0, g_diff); - } else { - // GDA: g_cumsum [chunk_size, 1, n_chunks, H_k * n_seqs] - g_last = ggml_view_4d(ctx0, g_cumsum, - 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], - g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], - (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); - g_last = ggml_cont(ctx0, g_last); - g_last_exp = ggml_exp(ctx0, g_last); - - ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); - g_diff_exp = ggml_exp(ctx0, g_diff); - } - - cb(g_last, "g_last", il); - cb(g_last_exp, "g_last_exp", il); - - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp); - cb(key_gdiff, "key_gdiff", il); - - // Process chunks - ggml_tensor * new_state = state_reshaped; - ggml_tensor * core_attn_out = nullptr; - - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { - ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); - ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); - ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); - ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); - ggml_tensor * gexp_chunk = get_slice_2d(ctx0, g_exp_pos, chunk); - - cb(attn_chunk, "attn_chunk", il); - - ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), - S_v, S_v, 1, H_v * n_seqs); - - // v_prime = k_cumdecay @ state - ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); - cb(v_prime, "v_prime_chunk", il); - - // v_new = v - v_prime - ggml_tensor * v_new = ggml_sub(ctx0, 
ggml_repeat(ctx0, v_chunk, v_prime), v_prime); - ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); - cb(v_new, "v_new_chunk", il); - - // attn_inter = (q * g.exp()) @ state - ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); - ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); - cb(attn_inter, "attn_inter_chunk", il); - - // output = attn_inter + attn @ v_new - ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); - cb(v_attn, "v_attn_chunk", il); - - ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); - cb(core_attn_out_chunk, "core_attn_out_chunk", il); - - core_attn_out = core_attn_out == nullptr - ? core_attn_out_chunk - : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); - - // State update: state = state * g_last_exp + key_gdiff^T @ v_new - ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk)); - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff))); - - ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); - - if (is_kda) { - // KDA: g_last_exp [S_k, 1, n_chunks, H_k * n_seqs] - // State: [S_v, S_v, H_v, n_seqs] - // Need to reshape g_last_exp to broadcast correctly over V dimension only - gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, - 1, gexp_last_chunk->ne[0], H_v, n_seqs); // [1, S_k, H_v, n_seqs] - // Transpose to [S_k, 1, H_v, n_seqs] then broadcast - gexp_last_chunk = ggml_cont(ctx0, ggml_permute(ctx0, gexp_last_chunk, 1, 0, 2, 3)); - } else { - // GDA: g_last_exp [1, 1, n_chunks, H_k * n_seqs] - // Broadcasts over both K and V dimensions - gexp_last_chunk = ggml_reshape_4d(ctx0, gexp_last_chunk, - gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs); - } - - new_state = ggml_add(ctx0, - ggml_mul(ctx0, new_state, gexp_last_chunk), - ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); - } - - // Truncate padding and permute back - ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, - S_v, n_tokens, H_v, n_seqs, - ggml_row_size(core_attn_out->type, S_v), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), - ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); - output_tokens = ggml_cont(ctx0, output_tokens); - - cb(output_tokens, "output_tokens", il); - - output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); - output_tokens = ggml_cont(ctx0, output_tokens); - - return {output_tokens, new_state}; -} - - -/** - * Unified autoregressive Delta Net implementation (single token processing). - * - * This implementation uses matrix multiplication instead of elementwise operations + summation, - * which is more efficient and mathematically equivalent. See inline comments for equivalences. 
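[Editor's note, not part of the patch] The equivalence this comment (and the inline "Equivalence to previous version" comments further down) relies on is simply that an elementwise product followed by a sum over the shared axis is a matrix-vector product. A minimal NumPy check, with illustrative sizes and the per-head state oriented as `(d_k, d_v)`:

```python
import numpy as np

d_k, d_v = 8, 16
rng = np.random.default_rng(0)

state = rng.standard_normal((d_k, d_v))     # per-head recurrent state
k = rng.standard_normal(d_k)                # current key

# "elementwise multiply, then sum over k" formulation
kv_mem_elementwise = (state * k[:, None]).sum(axis=0)

# matrix-multiplication formulation used by the code
kv_mem_matmul = state.T @ k

assert np.allclose(kv_mem_elementwise, kv_mem_matmul)
```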
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, 1, n_seqs] - * @param k Key tensor [S_k, H_k, 1, n_seqs] - * @param v Value tensor [S_v, H_v, 1, n_seqs] - * @param g Gate tensor: - * GDA: [H_v, 1, n_seqs] - * KDA: [S_k, H_v, 1, n_seqs] - * @param beta Beta tensor [H_v, 1, 1, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - * @param il Layer index (for debugging callbacks) - * @param eps_norm Epsilon for L2 normalization - * - * @return Pair of (output_tokens, new_state) - */ -std::pair llm_graph_context_delta::build_delta_net_unified_autoregressive( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t S_k = q->ne[0]; - const int64_t H_k = q->ne[1]; - const int64_t n_tokens = q->ne[2]; - const int64_t n_seqs = q->ne[3]; - - const int64_t S_v = v->ne[0]; - const int64_t H_v = v->ne[1]; - - GGML_ASSERT(n_tokens == 1); // Autoregressive mode is for single token - - // Detect KDA vs GDA based on g's shape - // GDA: g has shape [H_v, 1, n_seqs] or [H_v, n_tokens, n_seqs] - // KDA: g has shape [S_k, H_v, 1, n_seqs] or [S_k, H_v, n_tokens, n_seqs] - const bool is_kda = (g->ne[0] == S_k && g->ne[1] == H_v); - - // Validate shapes - GGML_ASSERT(v->ne[2] == n_tokens); - GGML_ASSERT(k->ne[2] == n_tokens); - GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); - GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); - GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); - GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); - GGML_ASSERT(H_k == H_v); - - if (is_kda) { - GGML_ASSERT(g->ne[0] == S_k && g->ne[1] == H_v); - } else { - GGML_ASSERT(g->ne[0] == H_v); - } - - // L2 normalize q and k - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf((float)S_v); - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); - - cb(q, "q_in", il); - cb(k, "k_in", il); - cb(v, "v_in", il); - cb(beta, "beta_in", il); - cb(g, "g_in", il); - - // Reshape g and beta for broadcasting - ggml_tensor * g_t; - ggml_tensor * beta_t; - - if (is_kda) { - // KDA: g [S_k, H_v, 1, n_seqs] -> [S_k, 1, H_k, n_seqs] - // For state multiplication, need [1, S_k, H_v, n_seqs] to broadcast over V only - g_t = ggml_reshape_4d(ctx0, g, S_k, 1, H_k, n_seqs); - } else { - // GDA: g [H_v, 1, n_seqs] -> [1, 1, H_k, n_seqs] - // For state multiplication, broadcasts over both K and V - g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); - } - - beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); - - // Apply exponential to g_t - g_t = ggml_exp(ctx0, g_t); - - // State decay: state = state * exp(g) - if (is_kda) { - // KDA: g_t [S_k, 1, H_k, n_seqs], state [S_v, S_v, H_v, n_seqs] - // Need to broadcast g_t over V dimension (ne[0] of state) - // Permute g_t to [1, S_k, H_k, n_seqs] for correct broadcasting - ggml_tensor * g_broadcast = ggml_cont(ctx0, ggml_permute(ctx0, g_t, 1, 0, 2, 3)); - state = ggml_mul(ctx0, state, g_broadcast); - } else { - // GDA: g_t [1, 1, H_k, n_seqs] broadcasts over both dimensions - state = ggml_mul(ctx0, state, g_t); - } - - // Equivalence to previous 
version: - // Previous: kv_mem = sum_k(state * k) using elementwise mult + sum_rows - // Current: k_state = state_t @ k_t using matrix multiplication - // These are equivalent because: sum_k(A * B) = A @ B when dimensions align - ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - ggml_tensor * k_t = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); - ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k_t); - - // v_diff = v - k_state (equivalent to v - kv_mem in previous version) - ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); - ggml_tensor * v_diff = ggml_sub(ctx0, v_t, k_state); - ggml_tensor * k_beta = ggml_mul(ctx0, k_t, beta_t); - - // Equivalence to previous version: - // Previous: state += k.unsqueeze(-1) * delta where delta = (v - kv_mem) * beta - // Current: state += v_diff^T @ k_beta^T using matrix multiplication - // These are equivalent because: outer_product(k, v_diff * beta) = v_diff^T @ k^T - state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); - - // Equivalence to previous version: - // Previous: core_attn_out = sum_k(state * q) using elementwise mult + sum_rows - // Current: core_attn_out = state_t @ q using matrix multiplication - // These are equivalent because: sum_k(A * B) = A @ B when dimensions align - q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); - state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); - ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); - // core_attn_out should be [S_v, 1, H_v, n_seqs] after this - cb(core_attn_out, "output_tokens", il); - cb(state, "new_state", il); - - return {core_attn_out, state}; -} - - -/** - * Main entry point that dispatches to chunked or autoregressive based on n_tokens. 
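[Editor's note, not part of the patch] Stripped of the ggml plumbing, the per-token recurrence implemented by the autoregressive path above is the gated delta rule: decay the state, predict the value for the incoming key, write back a β-scaled correction, then read out with the query. A minimal NumPy sketch for a single GDA head, assuming the state is oriented as `(d_k, d_v)` as in the inline comments; names and ε handling are illustrative:

```python
import numpy as np

def gated_delta_step(state, q, k, v, g, beta, eps=1e-6):
    """One token of the gated delta rule for one head (GDA: scalar gate per head).

    state : (d_k, d_v) recurrent fast-weight matrix
    q, k  : (d_k,)     query / key for this token
    v     : (d_v,)     value for this token
    g     : scalar     log-decay; the state is scaled by exp(g)
    beta  : scalar     write strength in (0, 1), i.e. sigmoid of the raw beta
    """
    q = q / (np.linalg.norm(q) + eps) / np.sqrt(len(v))   # L2-normalize, then scale
    k = k / (np.linalg.norm(k) + eps)

    state = np.exp(g) * state                             # decay the old memory
    v_pred = state.T @ k                                  # value the memory predicts for k
    state = state + np.outer(k, beta * (v - v_pred))      # delta-rule correction
    out = state.T @ q                                     # read out with the query
    return out, state
```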
- * - * Input tensor format matches qwen3next conventions: - * @param q Query tensor [S_k, H_k, n_tokens, n_seqs] - * @param k Key tensor [S_k, H_k, n_tokens, n_seqs] - * @param v Value tensor [S_v, H_v, n_tokens, n_seqs] - * @param g Gate tensor (GDA: [H_v, n_tokens, n_seqs], KDA: [S_k, H_v, n_tokens, n_seqs]) - * @param beta Beta tensor [H_v, 1, n_tokens, n_seqs] - * @param state State tensor [S_v, S_v * H_v, 1, n_seqs] - */ -std::pair llm_graph_context_delta::build_delta_net_unified( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm) { - - // Input format: [S, H, n_tokens, n_seqs] (matching qwen3next convention) - const int64_t n_tokens = q->ne[2]; - - if (n_tokens == 1) { - return build_delta_net_unified_autoregressive( - ctx0, q, k, v, g, beta, state, il, eps_norm); - } - return build_delta_net_unified_chunking( - ctx0, q, k, v, g, beta, state, causal_mask, identity, diag_mask, - il, chunk_size, eps_norm); -} diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index d9ee698075..0f037d1a39 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -1,4 +1,5 @@ #include "models.h" +#include "ggml.h" #define CHUNK_SIZE 64 diff --git a/src/models/models.h b/src/models/models.h index 2a750c168e..cfcbb9aaa5 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -17,53 +17,6 @@ struct llm_graph_context_mamba : public llm_graph_context { }; -struct llm_graph_context_delta : public llm_graph_context_mamba { - llm_graph_context_delta(const llm_graph_params & params); - - virtual ~llm_graph_context_delta() = default; - - std::pair build_delta_net_unified_chunking( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm); - - std::pair build_delta_net_unified_autoregressive( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - int il, - float eps_norm); - - std::pair build_delta_net_unified( - ggml_context * ctx0, - ggml_tensor * q, - ggml_tensor * k, - ggml_tensor * v, - ggml_tensor * g, - ggml_tensor * beta, - ggml_tensor * state, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il, - int64_t chunk_size, - float eps_norm); -}; - // Base class for RWKV-related models struct llm_build_rwkv6_base : public llm_graph_context { const llama_model & model; @@ -523,7 +476,7 @@ struct llm_build_qwen3vl : public llm_graph_context { struct llm_build_qwen3vlmoe : public llm_graph_context { llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params); }; -struct llm_build_qwen3next : public llm_graph_context_delta { +struct llm_build_qwen3next : public llm_graph_context_mamba { llm_build_qwen3next(const llama_model & model, const llm_graph_params & params); private: ggml_tensor * build_layer_attn( @@ -581,59 +534,6 @@ private: const llama_model & model; }; -struct llm_build_qwen3_5 : public llm_graph_context_delta { - llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params); - -protected: - // Tag type for subclass constructors that need to call build_graph() 
themselves - // (to ensure virtual dispatch works correctly) - struct defer_graph_build_t {}; - - llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t); - - void build_graph(); - - virtual ggml_tensor * build_layer_ffn( - ggml_tensor * cur, - int il); - - const llama_model & model; - -private: - ggml_tensor * build_layer_attn( - llm_graph_input_attn_kv * inp_attn, - ggml_tensor * cur, - ggml_tensor * inp_pos, - int il); - - ggml_tensor * build_layer_attn_linear( - llm_graph_input_rs * inp, - ggml_tensor * cur, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il); - - ggml_tensor * build_norm_gated( - ggml_tensor * input, - ggml_tensor * weights, - ggml_tensor * gate, - int layer); - - std::pair build_qkvz( - ggml_tensor * input, - int il); -}; - -struct llm_build_qwen3_5_moe : public llm_build_qwen3_5 { - llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params); - -protected: - ggml_tensor * build_layer_ffn( - ggml_tensor * cur, - int il) override; -}; - struct llm_build_qwen : public llm_graph_context { llm_build_qwen(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/qwen3-5.cpp b/src/models/qwen3-5.cpp deleted file mode 100644 index 0947299d73..0000000000 --- a/src/models/qwen3-5.cpp +++ /dev/null @@ -1,421 +0,0 @@ -#include "models.h" - -#define CHUNK_SIZE 64 - -llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_delta(params), model(model) { - build_graph(); -} - -// virtual call in constructor fix -llm_build_qwen3_5::llm_build_qwen3_5(const llama_model & model, const llm_graph_params & params, defer_graph_build_t /*tag*/) : - llm_graph_context_delta(params), model(model) { -} - -void llm_build_qwen3_5::build_graph() { - ggml_tensor * cur; - ggml_tensor * inpL; - - inpL = build_inp_embd(model.tok_embd); - cb(inpL, "model.embed_tokens", -1); - - auto * inp = build_inp_mem_hybrid(); - - ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = build_inp_out_ids(); - - ggml_tensor * causal_mask = - ggml_tri(ctx0, ggml_fill(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), - GGML_TRI_TYPE_LOWER); - - ggml_tensor * identity = ggml_diag(ctx0, ggml_fill(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); - ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity); - - ggml_build_forward_expand(gf, causal_mask); - ggml_build_forward_expand(gf, identity); - ggml_build_forward_expand(gf, diag_mask); - - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - - cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); - - if (hparams.is_recurrent(il)) { - cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il); - } else { - cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il); - } - - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "attn_residual", il); - - ggml_tensor * ffn_residual = cur; - - ggml_tensor * attn_post_norm = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); - cb(attn_post_norm, "attn_post_norm", il); - - cur = build_layer_ffn(attn_post_norm, il); - cb(cur, "ffn_out", il); - - cur = ggml_add(ctx0, cur, ffn_residual); - cb(cur, "post_ffn", il); 
- - inpL = cur; - } - cur = inpL; - - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - - cb(cur, "result_norm", -1); - res->t_embd = cur; - - cur = build_lora_mm(model.output, cur); - - cb(cur, "result_output", -1); - res->t_logits = cur; - - ggml_build_forward_expand(gf, cur); -} - -ggml_tensor * llm_build_qwen3_5::build_norm_gated( - ggml_tensor * input, - ggml_tensor * weights, - ggml_tensor * gate, - int layer) { - ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); - ggml_tensor * gated_silu = ggml_silu(ctx0, gate); - - return ggml_mul(ctx0, normalized, gated_silu); -} - -ggml_tensor * llm_build_qwen3_5::build_layer_attn( - llm_graph_input_attn_kv * inp, - ggml_tensor * cur, - ggml_tensor * inp_pos, - int il) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - ggml_tensor * Qcur_full = build_lora_mm(model.layers[il].wq, cur); // [ (n_embd_head * 2) * n_head, n_tokens ] - cb(Qcur_full, "Qcur_full", il); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, 0); - cb(Qcur, "Qcur_reshaped", il); - - Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "Qcur_normed", il); - - ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - - ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "Kcur_normed", il); - - ggml_tensor * gate = ggml_view_3d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, - ggml_element_size(Qcur_full) * n_embd_head * 2, - ggml_element_size(Qcur_full) * n_embd_head * 2 * n_head, - ggml_element_size(Qcur_full) * n_embd_head); - gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens); - cb(gate, "gate_reshaped", il); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale; - - cur = build_attn(inp, - nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(cur, "attn_pregate", il); - - ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate); - cb(gate_sigmoid, "gate_sigmoid", il); - - cur = ggml_mul(ctx0, cur, gate_sigmoid); - cb(cur, "attn_gated", il); - - cur = build_lora_mm(model.layers[il].wo, cur); - cb(cur, "attn_output", il); - - return cur; -} - -std::pair llm_build_qwen3_5::build_qkvz( - ggml_tensor * input, - int il) { - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = d_inner / num_v_heads; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - if (model.layers[il].wqkv) { - ggml_tensor * qkv_mixed = build_lora_mm(model.layers[il].wqkv, input); - qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_seq_tokens, n_seqs); - cb(qkv_mixed, "linear_attn_qkv_mixed", il); - - ggml_tensor * z = build_lora_mm(model.layers[il].wqkv_gate, input); - cb(z, "z", il); - - return { qkv_mixed, z }; - - } - // legacy path for combined in_proj_qkvz - ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input); - cb(mixed_qkvz, "linear_attn_mixed_qkvz", il); - - int64_t qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); - ggml_tensor * mixed_qkvz_reshaped = ggml_reshape_4d(ctx0, mixed_qkvz, qkvz_new_dim, num_k_heads, n_seq_tokens, n_seqs); - - int64_t split_sizes_qkvz[4] = { - head_k_dim, - head_k_dim, - head_v_dim * num_v_heads / num_k_heads, - head_v_dim * num_v_heads / num_k_heads - }; - - ggml_tensor * query = - ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[0], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], 0); - cb(query, "q", il); - - ggml_tensor * key = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[1], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - split_sizes_qkvz[0] * ggml_element_size(mixed_qkvz_reshaped)); - cb(key, "k", il); - - ggml_tensor * value = - ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[2], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - (split_sizes_qkvz[0] + split_sizes_qkvz[1]) * ggml_element_size(mixed_qkvz_reshaped)); - cb(value, "v", il); - - ggml_tensor * z = ggml_view_4d(ctx0, mixed_qkvz_reshaped, split_sizes_qkvz[3], num_k_heads, n_seq_tokens, n_seqs, - mixed_qkvz_reshaped->nb[1], mixed_qkvz_reshaped->nb[2], mixed_qkvz_reshaped->nb[3], - (split_sizes_qkvz[0] + split_sizes_qkvz[1] + split_sizes_qkvz[2]) * ggml_element_size(mixed_qkvz_reshaped)); - z = ggml_cont(ctx0, z); - cb(z, "z", il); - - ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); - cb(query_flat, "query_flat", il); - - ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_seq_tokens, n_seqs); - cb(key_flat, "key_flat", il); - - ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); - cb(value_flat, "value_flat", il); - - ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0); - qkv_mixed = ggml_concat(ctx0, qkv_mixed, value_flat, 
0); - cb(qkv_mixed, "qkv_mixed", il); - - return { qkv_mixed, z }; -} - -ggml_tensor * llm_build_qwen3_5::build_layer_attn_linear( - llm_graph_input_rs * inp, - ggml_tensor * cur, - ggml_tensor * causal_mask, - ggml_tensor * identity, - ggml_tensor * diag_mask, - int il) { - const auto * mctx_cur = inp->mctx; - - const int64_t d_inner = hparams.ssm_d_inner; - const int64_t n_seqs = ubatch.n_seqs; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = d_inner / num_v_heads; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - - const auto kv_head = mctx_cur->get_head(); - - GGML_ASSERT(n_seqs != 0); - GGML_ASSERT(ubatch.equal_seqs()); - GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); - - auto qkvz = build_qkvz(cur, il); - ggml_tensor * qkv_mixed = qkvz.first; - ggml_tensor * z = qkvz.second; - - ggml_tensor * mixed_ba = build_lora_mm(model.layers[il].ssm_beta_alpha, cur); - cb(mixed_ba, "linear_attn_mixed_ba", il); - - int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; - ggml_tensor * mixed_ba_reshaped = ggml_reshape_4d(ctx0, mixed_ba, ba_new_dim, num_k_heads, n_seq_tokens, n_seqs); - - int64_t split_sizes_ba[2] = { - num_v_heads / num_k_heads, - num_v_heads / num_k_heads - }; - - ggml_tensor * b = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[0], num_k_heads, n_seq_tokens, n_seqs, - mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], 0); - cb(b, "b", il); - - ggml_tensor * a = ggml_view_4d(ctx0, mixed_ba_reshaped, split_sizes_ba[1], num_k_heads, n_seq_tokens, n_seqs, - mixed_ba_reshaped->nb[1], mixed_ba_reshaped->nb[2], mixed_ba_reshaped->nb[3], - split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped)); - cb(a, "a", il); - - ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs); - - ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs); - - ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); - ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); - cb(alpha_softplus, "a_softplus", il); - ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); - cb(gate, "gate", il); - - ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); - ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); - - ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); - cb(conv_states, "conv_states", il); - - ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d; - const int64_t conv_kernel_size = conv_kernel->ne[0]; - const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state; - conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs); - cb(conv_states, "conv_states_reshaped", il); - - qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3); - cb(qkv_mixed, "qkv_mixed_permuted", il); - - ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); - cb(conv_input, "conv_input", il); - - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); - - ggml_tensor * state_update_target = - ggml_view_1d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels * n_seqs, - kv_head * (conv_kernel_size - 1) * conv_channels * 
ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); - cb(conv_states_all, "conv_states_updated", il); - - ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel); - cb(conv_output_proper, "conv_output_raw", il); - - ggml_tensor * conv_output_silu = ggml_silu(ctx0, conv_output_proper); - cb(conv_output_silu, "conv_output_silu", il); - - ggml_tensor * conv_qkv_mix = conv_output_silu; - - int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads; - int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim); - - ggml_tensor * q_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0); - cb(q_conv, "q_conv", il); - ggml_tensor * k_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, - head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); - cb(k_conv, "k_conv", il); - ggml_tensor * v_conv = - ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv, - 2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix)); - cb(v_conv, "v_conv", il); - - q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); - k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs); - v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); - - ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); - cb(state, "state_predelta", il); - - if (num_k_heads != num_v_heads) { - GGML_ASSERT(num_v_heads % num_k_heads == 0); - int64_t repeat_factor = num_v_heads / num_k_heads; - - ggml_tensor * q_reshaped = ggml_reshape_3d(ctx0, q_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); - ggml_tensor * k_reshaped = ggml_reshape_3d(ctx0, k_conv, head_k_dim, 1, num_k_heads * n_seq_tokens * n_seqs); - - ggml_tensor * q_repeated = - ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); - ggml_tensor * k_repeated = - ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_seq_tokens * n_seqs, 1); - - q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); - k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_seq_tokens, n_seqs); - } - - cb(q_conv, "q_conv_predelta", il); - cb(k_conv, "k_conv_predelta", il); - cb(v_conv, "v_conv_predelta", il); - - std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, - gate, beta, state, causal_mask, identity, diag_mask, - il, CHUNK_SIZE, hparams.f_norm_rms_eps); - - ggml_tensor * output = attn_out.first; - ggml_tensor * new_state = attn_out.second; - cb(output, "attn_output", il); - cb(new_state, "new_state", il); - - ggml_build_forward_expand(gf, - ggml_cpy(ctx0, new_state, - ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, - kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); - - ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); - - ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs); - - ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il); - - 
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs); - cb(final_output, "final_output", il); - - cur = build_lora_mm(model.layers[il].ssm_out, final_output); - cb(cur, "linear_attn_out", il); - - cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs); - return cur; -} - -ggml_tensor * llm_build_qwen3_5::build_layer_ffn(ggml_tensor * cur, const int il) { - // Qwen3.5 Dense always uses dense FFN - cur = build_ffn(cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); - return cur; -} diff --git a/src/models/qwen3-5moe.cpp b/src/models/qwen3-5moe.cpp deleted file mode 100644 index a488443218..0000000000 --- a/src/models/qwen3-5moe.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "models.h" - -llm_build_qwen3_5_moe::llm_build_qwen3_5_moe(const llama_model & model, const llm_graph_params & params) : - llm_build_qwen3_5(model, params, defer_graph_build_t{}) { - build_graph(); -} - -ggml_tensor * llm_build_qwen3_5_moe::build_layer_ffn(ggml_tensor * cur, const int il) { - // Check if this is an MoE layer - if (model.layers[il].ffn_gate_inp != nullptr) { - // MoE branch - ggml_tensor * moe_out = - build_moe_ffn(cur, - model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); - cb(moe_out, "ffn_moe_out", il); - - // Add shared experts if present - if (model.layers[il].ffn_up_shexp != nullptr) { - ggml_tensor * ffn_shexp = - build_ffn(cur, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(ffn_shexp, "ffn_shexp", il); - - // Apply shared expert gating (sigmoid) - ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur); - cb(shared_gate, "shared_expert_gate", il); - - shared_gate = ggml_sigmoid(ctx0, shared_gate); - cb(shared_gate, "shared_expert_gate_sigmoid", il); - - ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate); - cb(ffn_shexp, "ffn_shexp_gated", il); - - cur = ggml_add(ctx0, moe_out, ffn_shexp); - cb(cur, "ffn_out", il); - } else { - cur = moe_out; - } - } else { - // Dense FFN branch (fallback) - cur = llm_build_qwen3_5::build_layer_ffn(cur, il); - } - return cur; -} diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 0335f5ab76..99b1a76a48 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -1,9 +1,10 @@ +#include "ggml.h" #include "models.h" #define CHUNK_SIZE 64 llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_graph_params & params) : - llm_graph_context_delta(params), model(model) { + llm_graph_context_mamba(params), model(model) { ggml_tensor * cur; ggml_tensor * inpL; @@ -85,6 +86,362 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr ggml_build_forward_expand(gf, cur); } +// utility to get one slice from the third dimension +// input dim: [x, y, c, b] +// output dim: [x, y, 1, b] +static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) { + return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * c); +} + +std::pair llm_build_qwen3next::build_delta_net_chunking( + 
ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il) { + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(g, "g_perm", il); + cb(state, "state_in", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + // Do padding + const int64_t chunk_size = CHUNK_SIZE; + + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + g = ggml_pad(ctx0, g, pad, 0, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(g, "g_pad", il); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); + + q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs); + k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs); + k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs); + v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs); + v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs); + + g = ggml_reshape_4d(ctx0, g, chunk_size, 1, 
n_chunks, H_k * n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs); + + ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g); + cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs); + ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs); + + ggml_tensor * gcs_j_broadcast = + ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs); + + ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i); + cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta); + + ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask); + ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask)); + cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false); + attn = ggml_mul(ctx0, lin_solve, causal_mask); + attn = ggml_add(ctx0, attn, identity); + cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn); + + ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum)); + ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t); + + ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp); + cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * k_cumdecay = + ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp))))); + cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q); + attn_kq = ggml_mul(ctx0, attn_kq, decay_mask); + attn_kq = ggml_mul(ctx0, attn_kq, diag_mask); + cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs) + + + // vectorized calculation of key_gdiff + // improved from the chunked version: + // g_last = torch.clamp(g_cum[:, :, -1], max=50.0).exp().unsqueeze(-1).unsqueeze(-1) + // g_diff = torch.clamp(g_cum[:, :, -1:] - g_cum, max=50.0).exp() + // key_gdiff = key * g_diff.unsqueeze(-1) + // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new + // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew + + // get last element in g_cumsum along chunk_size dimension (ne0) + // example: [[x, y, z, ..., last], ...] -> [[last], ...] 
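+    // (in equation form, the per-chunk state update assembled below is
+    //    S_new = exp(g_last) * S_old + (K .* exp(g_last - g_cum))^T @ V_new
+    //  with g_cum the within-chunk cumulative sum of the log-gates and
+    //  g_last its final entry; the clamping from the reference is omitted here)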
+ ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3], + g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3], + (g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum)); + g_last = ggml_cont(ctx0, g_last); + cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last); + cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last)); + cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + + ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); + ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, + 1, chunk_size, n_chunks, g_diff_exp->ne[3]); + + ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); + cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) + + ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); + cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) + + + // state to be updated per chunk + ggml_tensor * new_state = state; // ggml_dup(ctx0, state); + cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs) + + // shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs) + ggml_tensor * core_attn_out = nullptr; + + for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + // shape: (S_k, chunk_size, 1, H_k * n_seqs) + ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul + + // shape: (S_v, chunk_size, 1, H_v * n_seqs) + ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat + + // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) + ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul + + // shape: (chunk_size, 1, H_v * n_seqs) + ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat + + // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0) + // replaced by precomputed attn_kq + ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk); + cb(attn_chunk, "attn_chunk", il); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); + + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs) + + // v_new = v_i - v_prime + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + cb(v_new, "v_new_chunk", il); + + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp); + cb(attn_inter, "attn_inter_chunk", il); + + // core_attn_out[:, :, i] = attn_inter + attn @ v_new + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk); + cb(v_attn, "v_attn_chunk", il); + + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs) + + core_attn_out = core_attn_out == nullptr + ? 
core_attn_out_chunk + : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); + + // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new + ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk); + //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why? + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t); + + // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew + ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + // truncate padded tokens + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + cb(output_tokens, "output_tokens", il); + + // permute back to (S_v, H_v, n_tokens, n_seqs) + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); + + return {output_tokens, new_state}; +} + +std::pair llm_build_qwen3next::build_delta_net_autoregressive( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * g, + ggml_tensor * beta, + ggml_tensor * state, + int il) { + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(g, "g_in", il); + + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); + + // Apply exponential to g_t + g_t = ggml_exp(ctx0, g_t); + + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * g_t + state = ggml_mul(ctx0, state, g_t); + + // kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs); + ggml_tensor * kv_mem = ggml_mul(ctx0, state, 
k_t_unsqueezed); + // we need to sum over dim=-2, so we transpose, sum, then transpose again + kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem)))); + + // v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v) + ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + // delta = (v_t - kv_mem) * beta_t + ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs] + ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t); + + // last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta + ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta); + state = ggml_add(ctx0, state, k_t_delta); + + // Compute the attention output + // core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) + ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t + ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed); + // again, since it's over dim = -2, transpose, sum, transpose back + ggml_tensor * core_attn_out = + ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q)))); + + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); + cb(state, "new_state", il); + + return {core_attn_out, state}; +} + ggml_tensor * llm_build_qwen3next::build_norm_gated( ggml_tensor * input, ggml_tensor * weights, @@ -395,7 +752,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs); - state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs); + state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs); cb(state, "state_predelta", il); // if head keys and value keys are different, repeat to force tensors into matching shapes @@ -424,10 +781,13 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear( cb(k_conv, "k_conv_predelta", il); cb(v_conv, "v_conv_predelta", il); - std::pair attn_out = build_delta_net_unified(ctx0, q_conv, k_conv, v_conv, - gate, beta, state, causal_mask, identity, diag_mask, - il, CHUNK_SIZE, hparams.f_norm_rms_eps); - + // Choose between build_delta_net_chunking, build_delta_net_recurrent, and build_delta_net_autoregressive based on n_tokens + std::pair attn_out; // pair of (output, new_state) + if (n_seq_tokens == 1) { + attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il); + } else { + attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il); + } ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; cb(output, "attn_output", il); From 81ddc60cb3b980a4503a9a0177b079dfa562c60e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Feb 2026 15:09:30 +0200 Subject: [PATCH 13/17] ci : add metal server workflows (#19293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ci : add metal server workflows * cont : try fix python init * cont : move to a separate workflow that runs only on master * cont : fix num jobs Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- .github/workflows/server-metal.yml | 73 ++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create 
mode 100644 .github/workflows/server-metal.yml diff --git a/.github/workflows/server-metal.yml b/.github/workflows/server-metal.yml new file mode 100644 index 0000000000..1d707bef44 --- /dev/null +++ b/.github/workflows/server-metal.yml @@ -0,0 +1,73 @@ +name: Server-Metal + +on: + workflow_dispatch: # allows manual triggering + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + slow_tests: + description: 'Run slow tests' + required: true + type: boolean + push: + branches: + - master + paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*'] + +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + server-metal: + runs-on: [self-hosted, macOS, ARM64] + + name: server-metal (${{ matrix.wf_name }}) + strategy: + matrix: + build_type: [Release] + wf_name: ["GPUx1"] + include: + - build_type: Release + extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx1, backend-sampling" + - build_type: Release + extra_args: "GGML_METAL_DEVICES=2" + wf_name: "GPUx2" + - build_type: Release + extra_args: "GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1" + wf_name: "GPUx2, backend-sampling" + fail-fast: false + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Build + id: cmake_build + run: | + cmake -B build -DGGML_SCHED_NO_REALLOC=ON + cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server + + - name: Tests + id: server_integration_tests + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} + run: | + cd tools/server/tests + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + export ${{ matrix.extra_args }} + pytest -v -x -m "not slow" From 292f6908cdc6abb5c38581e34fa141973e5aba82 Mon Sep 17 00:00:00 2001 From: Sascha Rogmann <59577610+srogmann@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:30:50 +0100 Subject: [PATCH 14/17] spec : remove check rate (#19377) * spec: remove parameter spec-ngram-check-rate * spec : renamed statistics vars * spec : add n_call_begin, n_call_accept * spec : don't enable key-map-stats --- common/arg.cpp | 10 ------- common/common.h | 1 - common/ngram-map.cpp | 7 ++--- common/ngram-map.h | 8 ++---- common/speculative.cpp | 55 ++++++++++++++++-------------------- docs/speculative.md | 13 ++++----- tools/server/server-task.cpp | 4 --- 7 files changed, 36 insertions(+), 62 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5fbc9022c0..9c85696ebd 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3437,16 +3437,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.ngram_size_m = value; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(common_arg( - {"--spec-ngram-check-rate"}, "N", - string_format("ngram check rate for ngram-simple/ngram-map speculative decoding (default: %d)", params.speculative.ngram_check_rate), - [](common_params & params, int value) { - if (value < 1) { - throw std::invalid_argument("ngram check rate must be at least 1"); - } - params.speculative.ngram_check_rate 
= value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spec-ngram-min-hits"}, "N", string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), diff --git a/common/common.h b/common/common.h index 398ebb0960..b284244530 100644 --- a/common/common.h +++ b/common/common.h @@ -269,7 +269,6 @@ struct common_params_speculative { uint16_t ngram_size_n = 12; // ngram size for lookup uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed std::shared_ptr ngram_mod; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index c5b8fc75ed..2b876a6e99 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -231,10 +231,9 @@ void common_ngram_map_draft(common_ngram_map & map, GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len); } - // Only check every check_rate tokens to save compute - // i.e., perform check if (cur_len - idx_last_check) >= check_rate - if (map.idx_last_check + map.check_rate > cur_len) { - return; + if (map.idx_last_check > cur_len) { + // Should not happen because of common_ngram_map_begin(). + GGML_ABORT("%s: map.idx_last_check > cur_len: %zu > %zu", __func__, map.idx_last_check, cur_len); } map.idx_last_check = cur_len; diff --git a/common/ngram-map.h b/common/ngram-map.h index 9668bd5a7c..41b9530449 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -24,7 +24,6 @@ struct common_ngram_simple_config { uint16_t size_ngram; // size of n-grams to lookup in self-mode uint16_t size_mgram; // size of m-grams to draft in self-mode - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; // Searches for a n-gram in the history and checks whether a draft sequence should be generated. @@ -66,15 +65,14 @@ struct common_ngram_map { bool key_only; // true if only key n-grams are used, no values. std::vector keys; // key n-grams which occur several times in token-history - uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token uint16_t min_hits; // minimum number of key hits to consider a draft - bool show_key_map_stats = false; // true, if statitics of the key_map should be printed. + bool show_key_map_stats = false; // true, if statistics of the key_map should be printed. common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys, - uint16_t check_rate, uint16_t min_hits) + uint16_t min_hits) : size_key(sz_key), size_value(sz_value), key_only(only_keys), - check_rate(check_rate), min_hits(min_hits) { + min_hits(min_hits) { key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used } diff --git a/common/speculative.cpp b/common/speculative.cpp index 84d2556ceb..3e68c38e49 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -113,13 +113,14 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; - // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens - // TODO: add n_call_begin, n_call_accept - size_t drafts_call_count = 0; // number of times this implementation was called. - size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. 
- size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. - size_t drafts_generated_tokens = 0; // number of tokens generated by this implementation. - size_t drafts_accepted_tokens = 0; // number of tokens accepted by the target model. + size_t n_call_begin = 0; // number of times this implementation was called for refresh. + size_t n_call_draft = 0; // number of times this implementation was called for generation. + size_t n_call_accept = 0; // number of times this implementation was called for accumulation. + + size_t n_gen_drafts = 0; // number of times a draft or part was generated by this implementation. + size_t n_acc_drafts = 0; // number of times a draft or part was accepted by the target model. + size_t n_gen_tokens = 0; // number of tokens generated by this implementation. + size_t n_acc_tokens = 0; // number of tokens accepted by the target model. // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. @@ -465,8 +466,6 @@ struct common_speculative_state_eagle3 : public common_speculative_state { struct common_speculative_state_ngram_simple : public common_speculative_state { common_ngram_simple_config config; - uint16_t check_id = 0; // used to control the frequency of generating drafts - common_speculative_state_ngram_simple( enum common_speculative_type type, common_ngram_simple_config config) @@ -481,11 +480,6 @@ struct common_speculative_state_ngram_simple : public common_speculative_state { const llama_tokens & prompt_tgt, llama_token id_last, llama_tokens & result) override { - ++check_id; - if (check_id < config.check_rate) { - return; - } - check_id = 0; result = common_ngram_simple_draft(config, prompt_tgt, id_last); GGML_UNUSED(params); @@ -752,10 +746,9 @@ static common_ngram_map get_common_ngram_map(const common_speculative_config & c uint16_t size_key = config.params.ngram_size_n; uint16_t size_value = config.params.ngram_size_m; bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); - uint16_t check_rate = config.params.ngram_check_rate; uint16_t min_hits = config.params.ngram_min_hits; - return common_ngram_map(size_key, size_value, key_only, check_rate, min_hits); + return common_ngram_map(size_key, size_value, key_only, min_hits); } static common_speculative_state_ngram_cache create_state_ngram_cache( @@ -931,12 +924,10 @@ common_speculative * common_speculative_init( uint16_t ngram_size_key = ngram_map.size_key; uint16_t mgram_size_value = ngram_map.size_value; - uint16_t check_rate = ngram_map.check_rate; auto config_simple = common_ngram_simple_config { /* .size_ngram = */ ngram_size_key, - /* .size_mgram = */ mgram_size_value, - /* .check_rate = */ check_rate + /* .size_mgram = */ mgram_size_value }; auto state = std::make_unique( /* .type = */ config.type, @@ -997,6 +988,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr for (auto & impl : spec->impls) { common_time_meas tm(impl->t_begin_us, !impl->gen_perf); impl->begin(prompt); + impl->n_call_begin++; } } @@ -1013,17 +1005,17 @@ llama_tokens common_speculative_draft( { common_time_meas tm(impl->t_draft_us, !impl->gen_perf); impl->draft(params, prompt_tgt, id_last, result); - impl->drafts_call_count++; + impl->n_call_draft++; } if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), - impl.get()->drafts_call_count, 
result.size()); + impl.get()->n_call_draft, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats - impl->drafts_generated_count++; - impl->drafts_generated_tokens += result.size(); + impl->n_gen_drafts++; + impl->n_gen_tokens += result.size(); break; // We have a draft, so break out of the loop and return it. } @@ -1044,11 +1036,12 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) { { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { - impl->drafts_accepted_count++; - impl->drafts_accepted_tokens += n_accepted; + impl->n_acc_drafts++; + impl->n_acc_tokens += n_accepted; } impl->accept(n_accepted); + impl->n_call_accept++; } } @@ -1069,13 +1062,13 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), - impl->drafts_call_count, - impl->drafts_generated_count, - impl->drafts_accepted_count, - impl->drafts_generated_tokens, - impl->drafts_accepted_tokens, + impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, + impl->n_gen_drafts, + impl->n_acc_drafts, + impl->n_gen_tokens, + impl->n_acc_tokens, str_perf.c_str()); } } diff --git a/docs/speculative.md b/docs/speculative.md index 03afab5b41..29da332875 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -119,8 +119,6 @@ If a draft model is combined with a draftless decoding the draftless decoding ha of lookup n-gram (default: 12) --spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) ---spec-ngram-check-rate N ngram check rate for ngram-simple/ngram-map speculative decoding - (default: 1) --spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1) ``` @@ -153,10 +151,6 @@ Sets the size M of the draft m-gram for n-gram map based speculative decoding. The m-gram size determines how many tokens to draft when a match is found. Larger values can provide more speedup but may reduce acceptance rate. -### `--spec-ngram-check-rate R` - -This option aims at performance if the n-gram lookup in history is to costly. A lookup will be executed at every R tokens (default is 1, every token). - ### `--spec-ngram-min-hits H` This option defines how often a key has to appear in the token history to be used as a draft (default is 1). 
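
To make the lookup behind these options concrete: the draftless decoder treats the last N generated tokens as a key, counts how often that key already occurred in the context, and proposes the M tokens that followed its most recent earlier occurrence once the count reaches the min-hits threshold. The sketch below is an illustration of that idea only; the real `common_ngram_map` hashes keys into a fixed-size map rather than scanning linearly, and the helper name here is invented.

```
// Illustration only: not the actual common_ngram_map implementation.
#include <algorithm>
#include <cstdint>
#include <vector>

using llama_token = int32_t;

// Propose up to size_m tokens if the trailing size_n-gram of `history`
// has at least min_hits earlier occurrences; otherwise return nothing.
static std::vector<llama_token> draft_from_history(
        const std::vector<llama_token> & history,
        size_t size_n, size_t size_m, size_t min_hits) {
    if (history.size() <= size_n) {
        return {};
    }
    const auto key_begin = history.end() - size_n;

    size_t hits     = 0;
    size_t cont_pos = 0; // position right after the most recent earlier match
    for (size_t i = 0; i + size_n < history.size(); ++i) {
        if (std::equal(key_begin, history.end(), history.begin() + i)) {
            hits++;
            cont_pos = i + size_n;
        }
    }
    if (hits < min_hits) {
        return {};
    }
    const size_t n_draft = std::min(size_m, history.size() - cont_pos);
    return std::vector<llama_token>(history.begin() + cont_pos,
                                    history.begin() + cont_pos + n_draft);
}
```

With the defaults (N = 12, M = 48, min hits = 1), a single earlier repetition of the last 12 tokens is enough to propose up to 48 draft tokens.
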
@@ -175,7 +169,12 @@ draft acceptance rate = 0.70312 ( 90 accepted / 128 generated) statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms ``` -- `#calls`: number of calls of this implementations +``` +statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts = 26, #gen tokens = 1248, #acc tokens = 968, dur(b,g,a) = 2.234, 1.427, 0.016 ms +``` + + +- `#calls(b,g,a)`: number of calls of begin (new prompt), generation and accumulation of this implementations - `#gen drafts`: number of drafts generated by this implementation - `#acc drafts`: number of drafts accepted (partially) by the main model - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 2d25db63b7..a137427c69 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -80,7 +80,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -144,7 +143,6 @@ json task_params::to_json(bool only_metrics) const { {"speculative.type", common_speculative_type_to_str(speculative.type)}, {"speculative.ngram_size_n", speculative.ngram_size_n}, {"speculative.ngram_size_m", speculative.ngram_size_m}, - {"speculative.ngram_c_rate", speculative.ngram_check_rate}, {"speculative.ngram_m_hits", speculative.ngram_min_hits}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, @@ -257,12 +255,10 @@ task_params server_task::params_from_json_cmpl( params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); - params.speculative.ngram_check_rate = json_value(data, "speculative.ngram_c_rate", defaults.speculative.ngram_check_rate); params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024); params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024); - params.speculative.ngram_check_rate = std::max(std::min(1, (int) params.speculative.ngram_check_rate), 1024); params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024); // Use OpenAI API logprobs only if n_probs wasn't provided From 820ebfa6f45347c77e9ddfcc670e44109d6df43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=86=90=ED=9D=AC=EC=A4=80?= Date: Tue, 10 Feb 2026 00:22:57 +0900 Subject: [PATCH 15/17] Server: log when converting requests to chat completions format (#19457) * Log converting requests * Print as debug instead of info [no ci] --------- Co-authored-by: openingnow <> --- tools/server/server-context.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 8ec8451339..ceafcac179 100644 --- a/tools/server/server-context.cpp +++ 
b/tools/server/server-context.cpp @@ -3584,6 +3584,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_responses_to_chatcmpl(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: OpenAI Responses -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, @@ -3600,6 +3602,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_anthropic_to_oai(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, @@ -3616,6 +3620,8 @@ void server_routes::init_routes() { auto res = create_response(); std::vector files; json body = convert_anthropic_to_oai(json::parse(req.body)); + SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions"); + SRV_DBG("converted request: %s\n", body.dump().c_str()); json body_parsed = oaicompat_chat_params_parse( body, meta->chat_params, From 262364e31d1da43596fe84244fba44e94a0de64e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Mon, 9 Feb 2026 17:30:32 +0100 Subject: [PATCH 16/17] mtmd: Implement tiling for LFM2-VL (#19454) --- tools/mtmd/clip.cpp | 136 ++++++++++++++++++++++++++++++++++++++++++-- tools/mtmd/mtmd.cpp | 19 ++++++- 2 files changed, 147 insertions(+), 8 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9fa5afc390..614fe66fde 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -10,6 +10,7 @@ #include "ggml-backend.h" #include "gguf.h" +#include #include #include #include @@ -1116,9 +1117,8 @@ struct clip_model_loader { case PROJECTOR_TYPE_LFM2: { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); - // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json - // config above specifies number of tokens after downsampling, while here it is before, relax lowerbound to 64 - hparams.set_limit_image_tokens(64, 1024); + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + hparams.set_limit_image_tokens(64, 256); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: @@ -2807,6 +2807,119 @@ private: } }; +// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. 
grid layout) +struct lfm2_vl_image_processor { + // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 10; + static constexpr float max_pixels_tolerance = 2.0f; + static constexpr int tile_size = 512; + + static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { + llava_uhd::slice_instructions inst; + const auto & params = ctx->model.hparams; + const int align_size = params.patch_size * params.n_merge; + + inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + inst.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; + inst.overview_size = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels); + + // tile if either dimension exceeds tile_size with tolerance + const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance; + + if (!needs_tiling) { + inst.refined_size = clip_image_size{0, 0}; + inst.grid_size = clip_image_size{0, 0}; + return inst; + } + + const clip_image_size grid = get_grid_layout(original_size.height, original_size.width); + + inst.grid_size = grid; + inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height}; + + LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", + __func__, + original_size.width, original_size.height, + inst.overview_size.width, inst.overview_size.height, + inst.refined_size.width, inst.refined_size.height, + grid.width, grid.height); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + llava_uhd::slice_coordinates slice; + slice.x = col * tile_size; + slice.y = row * tile_size; + slice.size = clip_image_size{tile_size, tile_size}; + inst.slices.push_back(slice); + LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n", + __func__, (int)inst.slices.size() - 1, + slice.x, slice.y, slice.size.width, slice.size.height); + } + } + + return inst; + } + +private: + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = {1, 1}; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; + } + + static std::vector get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h >= min_tiles && w * h <= max_tiles) { + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({w, h}); + } + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + 
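+        // with min_tiles = 2 and max_tiles = 10 this yields every (w, h) grid with
+        // 2 <= w*h <= 10 (1x2, 2x1, 1x3, 3x1, 2x2, ...), ordered by total tile count;
+        // ties between grids with the same tile count are left in unspecified order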
return ratios; + } + + static clip_image_size get_grid_layout(int height, int width) { + const float aspect_ratio = static_cast(width) / height; + const auto ratios = get_target_ratios(); + return find_closest_aspect_ratio(aspect_ratio, ratios, width, height); + } +}; + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { @@ -3021,6 +3134,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } break; case PROJECTOR_TYPE_LFM2: + { + auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size); + std::vector imgs = llava_uhd::slice_image(img, inst); + + for (size_t i = 0; i < imgs.size(); ++i) { + clip_image_f32_ptr res(clip_image_f32_init()); + normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(res)); + } + + res_imgs->grid_x = inst.grid_size.width; + res_imgs->grid_y = inst.grid_size.height; + } break; + case PROJECTOR_TYPE_KIMIVL: { GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0); @@ -3032,8 +3159,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2); - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d037e834f3..b7636279cb 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -85,6 +85,7 @@ enum mtmd_slice_tmpl { MTMD_SLICE_TMPL_MINICPMV_2_6, MTMD_SLICE_TMPL_LLAMA4, MTMD_SLICE_TMPL_IDEFICS3, + MTMD_SLICE_TMPL_LFM2, }; const char * mtmd_default_marker() { @@ -307,9 +308,19 @@ struct mtmd_context { img_end = "<|im_end|>"; } else if (proj == PROJECTOR_TYPE_LFM2) { - img_beg = "<|image_start|>"; - img_end = "<|image_end|>"; - + // multi-tile: + // <|image_start|> + // <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ... 
+ // <|img_thumbnail|> (thumbnail) + // <|image_end|> + // single-tile: + // <|image_start|> (image) <|image_end|> + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + slice_tmpl = MTMD_SLICE_TMPL_LFM2; + sli_img_start_tmpl = "<|img_row_%d_col_%d|>"; + tok_ov_img_start = {lookup_token("<|img_thumbnail|>")}; + ov_img_first = false; } else if (proj == PROJECTOR_TYPE_GLM4V) { img_beg = "<|begin_of_image|>"; img_end = "<|end_of_image|>"; @@ -562,11 +573,13 @@ struct mtmd_tokenizer { } // handle llava-uhd style preprocessing + const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0; if ( ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 + || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) ) { const int n_col = batch_f32.grid_x; const int n_row = batch_f32.grid_y; From 98e57ca422c5adb33663a6406c2f2d5b7d255da7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 9 Feb 2026 22:14:12 +0100 Subject: [PATCH 17/17] chat: fix case where template accepts type content only (#19419) * chat: fix case where template accepts type content only * rm stray log * reuse render_message_to_json --- common/chat.cpp | 39 +++++++++++++++++++++++++++++++++++---- common/chat.h | 2 ++ common/jinja/caps.cpp | 13 +++++++++---- common/jinja/caps.h | 4 +++- common/jinja/runtime.cpp | 6 ++++++ 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 2bf4632669..47a34d5822 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -380,15 +380,46 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa return msgs; } -json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { +static json render_message_to_json(const std::vector & msgs, const jinja::caps & c) { + if (!c.supports_string_content && !c.supports_typed_content) { + LOG_WRN("%s: Neither string content nor typed content is supported by the template. 
This is unexpected and may lead to issues.\n", __func__); + } + + bool only_string_accepted = c.supports_string_content && !c.supports_typed_content; + bool only_typed_accepted = !c.supports_string_content && c.supports_typed_content; + json messages = json::array(); for (const auto & msg : msgs) { - json jmsg = msg.to_json_oaicompat(concat_typed_text); - messages.push_back(jmsg); + if (only_string_accepted) { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ true); + messages.push_back(jmsg); + } else if (only_typed_accepted) { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false); + if (jmsg.at("content").is_string()) { + jmsg["content"] = json::array({ + json{ + {"type", "text"}, + {"text", jmsg.at("content").get()}, + } + }); + } + messages.push_back(jmsg); + } else { + json jmsg = msg.to_json_oaicompat(/* concat_typed_text= */ false); + messages.push_back(jmsg); + } } return messages; } +// DEPRECATED: only used in tests +json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { + jinja::caps c; + c.supports_string_content = true; + c.supports_typed_content = !concat_typed_text; + return render_message_to_json(msgs, c); +} + std::vector common_chat_tools_parse_oaicompat(const json & tools) { std::vector result; @@ -3020,7 +3051,7 @@ static common_chat_params common_chat_templates_apply_jinja( : *tmpls->template_default; const auto & src = tmpl.source(); const auto & caps = tmpl.original_caps(); - params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); + params.messages = render_message_to_json(inputs.messages, tmpl.original_caps()); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; params.reasoning_format = inputs.reasoning_format; diff --git a/common/chat.h b/common/chat.h index 24aa4aab5c..1bf43f7261 100644 --- a/common/chat.h +++ b/common/chat.h @@ -240,6 +240,8 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates * // Parses a JSON array of messages in OpenAI's chat completion API format. 
std::vector common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages); + +// DEPRECATED: only used in tests nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text = false); std::vector common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools); diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp index f27490f1fb..dbaaed500a 100644 --- a/common/jinja/caps.cpp +++ b/common/jinja/caps.cpp @@ -63,7 +63,8 @@ static void caps_print_stats(value & v, const std::string & path) { std::map caps::to_map() const { return { - {"requires_typed_content", requires_typed_content}, + {"supports_string_content", supports_string_content}, + {"supports_typed_content", supports_typed_content}, {"supports_tools", supports_tools}, {"supports_tool_calls", supports_tool_calls}, {"supports_parallel_tool_calls", supports_parallel_tool_calls}, @@ -89,7 +90,7 @@ caps caps_get(jinja::program & prog) { return v->stats.ops.find(op_name) != v->stats.ops.end(); }; - // case: typed content requirement + // case: typed content support caps_try_execute( prog, [&]() { @@ -105,12 +106,16 @@ caps caps_get(jinja::program & prog) { // tools return json{nullptr}; }, - [&](bool, value & messages, value &) { + [&](bool success, value & messages, value &) { auto & content = messages->at(0)->at("content"); caps_print_stats(content, "messages[0].content"); if (has_op(content, "selectattr") || has_op(content, "array_access")) { // accessed as an array - result.requires_typed_content = true; + result.supports_typed_content = true; + } + if (!success) { + // failed to execute with content as string + result.supports_string_content = false; } } ); diff --git a/common/jinja/caps.h b/common/jinja/caps.h index 77df117baa..e694e7bfaa 100644 --- a/common/jinja/caps.h +++ b/common/jinja/caps.h @@ -14,7 +14,9 @@ struct caps { bool supports_parallel_tool_calls = true; bool supports_preserve_reasoning = false; // support assistant message with reasoning_content - bool requires_typed_content = false; // default: use string content + // one of the 2 content capabilities must be true + bool supports_string_content = true; + bool supports_typed_content = false; // for reporting on server std::map to_map() const; diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index 4453d86e6d..cc012c892f 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -446,6 +446,12 @@ value for_statement::execute_impl(context & ctx) { value iterable_val = iter_expr->execute(scope); + // mark the variable being iterated as used for stats + if (ctx.is_get_stats) { + iterable_val->stats.used = true; + iterable_val->stats.ops.insert("array_access"); + } + if (iterable_val->is_undefined()) { JJ_DEBUG("%s", "For loop iterable is undefined, skipping loop"); iterable_val = mk_val();
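
To make the two capability flags concrete, here is a standalone sketch (plain nlohmann::json, not the `common_chat_msg` helpers) of the two message shapes that `render_message_to_json` chooses between, depending on what the probed template accepts:

```
// Standalone sketch; assumes the vendored nlohmann::json, not common_chat_msg.
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // accepted when supports_string_content is set
    json msg_string = {
        {"role",    "user"},
        {"content", "What is in this image?"}
    };

    // accepted when supports_typed_content is set; render_message_to_json
    // wraps a plain string like the one above into this shape when the
    // template only iterates over content parts
    json msg_typed = {
        {"role",    "user"},
        {"content", json::array({
            {{"type", "text"}, {"text", "What is in this image?"}}
        })}
    };

    (void) msg_string;
    (void) msg_typed;
    return 0;
}
```

The detection itself renders a probe conversation with string content: if the template applies `selectattr` to `messages[0].content` or iterates over it (now recorded as `array_access` by the for-loop change in runtime.cpp), typed content is marked as supported, and a failed render clears `supports_string_content`.
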