From c7358ddf6458ffaf45dc1d9ca447b354c919fcff Mon Sep 17 00:00:00 2001 From: RachelMantel Date: Fri, 30 Jan 2026 06:00:49 +0200 Subject: [PATCH 01/17] sycl: implement GGML_OP_TRI (#19089) * sycl: implement GGML_OP_TRI * docs: update ops.md for SYCL TRI * docs: regenerate ops.md * docs: update SYCL support for GGML_OP_TRI --- docs/ops.md | 2 +- docs/ops/SYCL.csv | 8 ++-- ggml/src/ggml-sycl/ggml-sycl.cpp | 69 ++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 5 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index c066ab5a85..b8e0347803 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -114,7 +114,7 @@ Legend: | TANH | ❌ | βœ… | βœ… | 🟑 | 🟑 | βœ… | βœ… | 🟑 | βœ… | ❌ | ❌ | | TIMESTEP_EMBEDDING | ❌ | βœ… | βœ… | βœ… | βœ… | βœ… | βœ… | βœ… | ❌ | ❌ | ❌ | | TOP_K | ❌ | ❌ | βœ… | ❌ | βœ… | ❌ | ❌ | 🟑 | βœ… | ❌ | ❌ | -| TRI | ❌ | ❌ | βœ… | βœ… | βœ… | ❌ | ❌ | βœ… | ❌ | ❌ | ❌ | +| TRI | ❌ | ❌ | βœ… | βœ… | βœ… | ❌ | βœ… | βœ… | ❌ | ❌ | ❌ | | TRUNC | ❌ | ❌ | βœ… | 🟑 | ❌ | ❌ | 🟑 | 🟑 | βœ… | ❌ | ❌ | | UPSCALE | ❌ | 🟑 | βœ… | βœ… | 🟑 | 🟑 | 🟑 | 🟑 | ❌ | ❌ | ❌ | | XIELU | ❌ | ❌ | βœ… | ❌ | ❌ | ❌ | ❌ | ❌ | βœ… | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 091a5caed7..255b4ef473 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -10052,10 +10052,10 @@ "SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL" "SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL" "SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL" "SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL" "SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL" "SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL" diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3a4c092af5..d20b7ec57d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2263,6 +2263,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); } +static void tri_f32_sycl( + const float * src, + float * dst, + const int64_t ne0, + const int64_t ne1, + const int64_t ne2, + const int64_t ne3, + const ggml_tri_type ttype, + dpct::queue_ptr main_stream +) { + const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3; + + main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) { + const int64_t idx = (int64_t) tid[0]; + + const int64_t i0 = idx % ne0; + const int64_t t1 = idx / ne0; + const int64_t i1 = t1 % ne1; + + bool keep = false; + switch (ttype) { + case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break; + case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break; + case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break; + case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break; + default: keep = 
false; break; + } + + dst[idx] = keep ? src[idx] : 0.0f; + }); +} + +static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(src0); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + const float * src0_dd = static_cast(src0->data); + float * dst_dd = static_cast(dst->data); + + const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0); + + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream); +} + + inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -3912,6 +3971,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_TRANSPOSE: GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__); break; + case GGML_OP_TRI: + ggml_sycl_op_tri(ctx, dst); + break; case GGML_OP_DIAG_MASK_INF: ggml_sycl_diag_mask_inf(ctx, dst); break; @@ -4616,6 +4678,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; + case GGML_OP_TRI: + { + const ggml_tensor * src0 = op->src[0]; + return src0 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(src0); + } case GGML_OP_DIAG_MASK_INF: return true; case GGML_OP_SOFT_MAX: From 1025fd2c09b1d4f9aa3fb2f7349084eecd04f958 Mon Sep 17 00:00:00 2001 From: s8322 Date: Fri, 30 Jan 2026 06:01:38 +0200 Subject: [PATCH 02/17] sycl: implement GGML_UNARY_OP_SOFTPLUS (#19114) * sycl: add softplus unary op implementation * sycl: add softplus unary op implementation * docs(ops): mark SYCL SOFTPLUS as supported * docs: update SYCL status for SOFTPLUS --- docs/ops.md | 2 +- docs/ops/SYCL.csv | 16 ++++++++-------- ggml/src/ggml-sycl/element_wise.cpp | 20 ++++++++++++++++++++ ggml/src/ggml-sycl/element_wise.hpp | 2 ++ ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++++ 5 files changed, 35 insertions(+), 9 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index b8e0347803..2c7c60dcca 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -97,7 +97,7 @@ Legend: | SILU | ❌ | βœ… | βœ… | 🟑 | 🟑 | 🟑 | βœ… | 🟑 | βœ… | ❌ | ❌ | | SILU_BACK | ❌ | ❌ | βœ… | βœ… | ❌ | ❌ | ❌ | βœ… | ❌ | ❌ | ❌ | | SIN | ❌ | βœ… | βœ… | βœ… | 🟑 | ❌ | βœ… | 🟑 | ❌ | ❌ | ❌ | -| SOFTPLUS | ❌ | ❌ | βœ… | 🟑 | 🟑 | ❌ | ❌ | 🟑 | βœ… | ❌ | ❌ | +| SOFTPLUS | ❌ | ❌ | βœ… | 🟑 | 🟑 | ❌ | βœ… | 🟑 | βœ… | ❌ | ❌ | | SOFT_MAX | ❌ | 🟑 | βœ… | βœ… | βœ… | βœ… | βœ… | βœ… | βœ… | ❌ | ❌ | | SOFT_MAX_BACK | ❌ | ❌ | 🟑 | 🟑 | ❌ | ❌ | 🟑 | βœ… | ❌ | ❌ | ❌ | | SOLVE_TRI | ❌ | ❌ | βœ… | 🟑 | ❌ | ❌ | ❌ | 🟑 | ❌ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 255b4ef473..c1d22e65d4 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -29,8 +29,8 @@ "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" 
-"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -71,8 +71,8 @@ "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" @@ -113,8 +113,8 @@ "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -155,8 +155,8 @@ "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 8d83b2446b..651b875b63 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) { return sycl::log(x); } +template +static __dpct_inline__ T op_softplus(T x) { + const float xf = (float) x; + const float ax = sycl::fabs(xf); + const float m = sycl::fmax(xf, 0.0f); + const float y = m + sycl::log1p(sycl::exp(-ax)); + return (T) y; +} + template static __dpct_inline__ T op_neg(T x) { return -x; @@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor }); } +static inline void 
ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_softplus(x); + }); +} + static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { return op_neg(x); @@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_log(ctx, dst); } +void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_softplus(ctx, dst); +} + void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_neg(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 0913a2e529..7c71974687 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d20b7ec57d..74b4ed91cc 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3845,6 +3845,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_sycl_exp(ctx, dst); break; + case GGML_UNARY_OP_SOFTPLUS: + ggml_sycl_softplus(ctx, dst); + break; case GGML_UNARY_OP_SGN: ggml_sycl_sgn(ctx, dst); break; @@ -4466,6 +4469,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_SOFTPLUS: case GGML_UNARY_OP_ELU: return true; case GGML_UNARY_OP_FLOOR: From ecbf01d44186fe78aa69d22fd6ee7790e2b87dc5 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Thu, 29 Jan 2026 23:57:52 -0500 Subject: [PATCH 03/17] add tensor type checking as part of cuda graph properties (#19186) --- ggml/src/ggml-cuda/common.cuh | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 43280644e4..a3256d59dd 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1124,6 +1124,7 @@ struct ggml_tensor_extra_gpu { struct ggml_cuda_graph_node_properties { void * node_data; ggml_op node_op; + enum ggml_type node_type; int32_t flags; int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index cfcffde8a2..e9e9592eba 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2920,6 +2920,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); props->node_data = node->data; props->node_op = node->op; + props->node_type = node->type; props->flags = node->flags; for (int i = 0; i < GGML_MAX_DIMS; i++) { props->ne[i] = node->ne[i]; @@ -2944,6 +2945,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, 
ggml_cuda_ return false; } + if (node->type != props->node_type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { if (node->ne[i] != props->ne[i]) { return false; From b316895ff95b39cc5b7494427e9dfca9e231b23e Mon Sep 17 00:00:00 2001 From: Antonis Makropoulos Date: Fri, 30 Jan 2026 08:54:28 +0200 Subject: [PATCH 04/17] docs: Add LlamaLib to UI projects (#19181) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0783e43e5c..dac020ad37 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) - [LARS](https://github.com/abgulati/LARS) (AGPL) - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0) - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LMStudio](https://lmstudio.ai/) (proprietary) From 83bcdf7217dc06ac67ff5f7322bdd89f46664c04 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 30 Jan 2026 10:37:06 +0100 Subject: [PATCH 05/17] memory : remove unused tmp_buf (#19199) This commit removes the unused tmp_buf variable from llama-kv-cache.cpp and llama-memory-recurrent.cpp. The tmp_buf variable was declared but never used but since it has a non-trivial constructor/desctuctor we don't get an unused variable warning about it. --- src/llama-kv-cache.cpp | 8 +++----- src/llama-memory-recurrent.cpp | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index f3c9b49f30..c35cd6761b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1772,8 +1772,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t io.write(&v_trans, sizeof(v_trans)); io.write(&n_layer, sizeof(n_layer)); - std::vector tmp_buf; - // Iterate and write all the keys first, each row is a cell // Get whole range at a time for (const auto & layer : layers) { @@ -1791,7 +1789,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa); io.write(&k_size_row, sizeof(k_size_row)); - // Read each range of cells of k_size length each into tmp_buf and write out + // Read each range of cells of k_size length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; @@ -1818,7 +1816,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa); io.write(&v_size_row, sizeof(v_size_row)); - // Read each range of cells of v_size length each into tmp_buf and write out + // Read each range of cells of v_size length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; @@ -1852,7 +1850,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out + // Read each range of cells of v_size_el length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - 
range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 812bf25304..c8361b1812 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -785,8 +785,6 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); - std::vector tmp_buf; - // Iterate and write all the keys first, each row is a cell // Get whole range at a time for (uint32_t il = 0; il < n_layer; ++il) { @@ -801,7 +799,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r()); io.write(&r_size_row, sizeof(r_size_row)); - // Read each range of cells of k_size length each into tmp_buf and write out + // Read each range of cells of k_size length and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * r_size_row; @@ -822,7 +820,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); io.write(&s_size_row, sizeof(s_size_row)); - // Read each range of cells of s_size length each into tmp_buf and write out + // Read each range of cells of s_size length and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * s_size_row; @@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_s; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out + // Read each range of cells of v_size_el length and write out for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * mem_size) * s_size_el; From 05625031547d95f7335abd012cad4cd07986819b Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 30 Jan 2026 11:12:53 +0100 Subject: [PATCH 06/17] convert : add missing return statement for GraniteMoeModel (#19202) This commit adds a missing return statement to the GraniteMoeModel class to fix an issue in the model conversion process. 
Resolves: https://github.com/ggml-org/llama.cpp/issues/19201 --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a391717e32..eb43520f98 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel): gate, up = data_torch.split(ffn_dim, dim=-2) yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) + return has_experts = bool(self.hparams.get('num_local_experts')) From c3b87cebff7dfdd73b850b8a194cd0a0c6bfcf49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 13:52:57 +0200 Subject: [PATCH 07/17] tests : add GQA=20 FA test (#19095) --- tests/test-backend-ops.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d4c1f525c6..411467e968 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8213,11 +8213,13 @@ static std::vector> make_test_cases_eval() { if (!mask && max_bias > 0.0f) continue; for (float logit_softcap : {0.0f, 10.0f}) { if (hsk != 128 && logit_softcap != 0.0f) continue; - for (int nh : { 4, }) { + for (int nh : { 1, 4 }) { + if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash for (int nr3 : { 1, 3, }) { if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes - for (int nr2 : { 1, 4, 12 }) { + for (int nr2 : { 1, 4, 12, 20 }) { if (nr2 == 12 && hsk != 128) continue; + if (nr2 == 20 && (nh != 1 || hsk != 576)) continue; //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) { for (int kv : { 113, 512, 1024, }) { if (nr2 != 1 && kv != 512) continue; From f3bc98890c2b07f66c0ee20cd539abbfe9ea7639 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 30 Jan 2026 15:18:41 +0100 Subject: [PATCH 08/17] memory : clarify comments for r_l and s_l tensors [no ci] (#19203) This commit updates the comments in state_write_data to clarify that it is handling the R and S tensors and not Key and Value tensors. 
--- src/llama-memory-recurrent.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index c8361b1812..f0038036dc 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -785,21 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); - // Iterate and write all the keys first, each row is a cell + // Iterate and write all the R tensors first, each row is a cell // Get whole range at a time for (uint32_t il = 0; il < n_layer; ++il) { // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) if (r_l[il] == nullptr) continue; - // Write key type + // Write R tensor type const int32_t r_type_i = (int32_t)r_l[il]->type; io.write(&r_type_i, sizeof(r_type_i)); - // Write row size of key + // Write row size of R tensor const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r()); io.write(&r_size_row, sizeof(r_size_row)); - // Read each range of cells of k_size length and write out + // Write each range of cells of r_size_row length for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * r_size_row; @@ -812,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) if (s_l[il] == nullptr) continue; - // Write value type + // Write S tensor type const int32_t s_type_i = (int32_t)s_l[il]->type; io.write(&s_type_i, sizeof(s_type_i)); - // Write row size of value + // Write row size of S tensor const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); io.write(&s_size_row, sizeof(s_size_row)); - // Read each range of cells of s_size length and write out + // Write each range of S tensor rows for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * s_size_row; @@ -828,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: } } } else { - // When v is transposed, we also need the element size and get the element ranges from each row + // When S tensor is transposed, we also need the element size and get the element ranges from each row const uint32_t mem_size = size; for (uint32_t il = 0; il < n_layer; ++il) { // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) @@ -836,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint32_t n_embd_s = hparams.n_embd_s(); - // Write value type + // Write S tensor type const int32_t s_type_i = (int32_t)s_l[il]->type; io.write(&s_type_i, sizeof(s_type_i)); @@ -849,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_s; ++j) { - // Read each range of cells of v_size_el length and write out + // Write each range of cells of s_size_el length for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * mem_size) * s_size_el; From 2e916f996a965a911eb79168b5fb959c1ea0c57b Mon Sep 17 00:00:00 2001 From: Marcello Seri Date: Fri, 30 Jan 2026 16:09:44 +0100 Subject: [PATCH 09/17] jinja : add 
unordered_map include to value.h [no ci] (#19205) On macos Sequoia 15.7.3, x86_64, the build has recently started failing with ``` In file included from .../code/cpp/llama.cpp/common/jinja/string.cpp:2: .../code/cpp/llama.cpp/common/./jinja/value.h:478:10: error: no template named 'unordered_map' in namespace 'std' 478 | std::unordered_map unordered; | ~~~~~^ In file included from .../code/cpp/llama.cpp/common/jinja/caps.cpp:1: .../code/cpp/llama.cpp/common/jinja/value.h:478:10: error: no template named 'unordered_map' in namespace 'std' 478 | std::unordered_map unordered; | ~~~~~^ In file included from .../code/cpp/llama.cpp/common/jinja/value.cpp:1: In file included from .../code/cpp/llama.cpp/common/jinja/runtime.h:4: .../code/cpp/llama.cpp/common/jinja/value.h:478:10: error: no template named 'unordered_map' in namespace 'std' 478 | std::unordered_map unordered; [...] ``` After a bit of digging to make sure all the appropriate flags were used, I notifced that the necessary header was not included. This fixes the build for me and should not affect negatively other builds that for some reasons were already succeeding --- common/jinja/value.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/jinja/value.h b/common/jinja/value.h index a2f92d2c69..1c04760a08 100644 --- a/common/jinja/value.h +++ b/common/jinja/value.h @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace jinja { From dabaa2e77a9d94ee1466ce55f9ff47d5c1daae43 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 18:21:48 +0200 Subject: [PATCH 10/17] spec : add ngram-mod (#19164) * spec : add ngram-mod * cont : simplify + keep track of occupancy * cont : cleanup * cont : move initialization to common/speculative * cont : cleanup * cont : cleanup * cont : fix --- common/CMakeLists.txt | 2 + common/arg.cpp | 4 +- common/common.h | 5 + common/ngram-map.cpp | 35 +++---- common/ngram-map.h | 1 + common/ngram-mod.cpp | 60 ++++++++++++ common/ngram-mod.h | 37 +++++++ common/speculative.cpp | 164 +++++++++++++++++++++++++++++++- common/speculative.h | 4 +- tools/server/server-context.cpp | 9 +- 10 files changed, 292 insertions(+), 29 deletions(-) create mode 100644 common/ngram-mod.cpp create mode 100644 common/ngram-mod.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3bc7bc6210..295ae9ea25 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -75,6 +75,8 @@ add_library(${TARGET} STATIC ngram-cache.h ngram-map.cpp ngram-map.h + ngram-mod.cpp + ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 218418f070..5fbc9022c0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]", + {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", common_speculative_type_to_str(params.speculative.type).c_str()), [](common_params & params, const std::string & value) { @@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K; } else if (value == "ngram-map-k4v") { params.speculative.type = 
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; + } else if (value == "ngram-mod") { + params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } diff --git a/common/common.h b/common/common.h index fd3ab8cd18..398ebb0960 100644 --- a/common/common.h +++ b/common/common.h @@ -171,6 +171,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type }; @@ -252,6 +253,8 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; +struct common_ngram_mod; + struct common_params_speculative { common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding @@ -269,6 +272,8 @@ struct common_params_speculative { uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed + std::shared_ptr ngram_mod; + std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 930e7a3c10..84fd761367 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -7,6 +7,21 @@ #include #include +// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. +static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { + std::ostringstream oss; + oss << '['; + for (size_t i = 0; i < length; ++i) { + if (i > 0) { + oss << ", "; + } + oss << inp[start + i]; + } + oss << ']'; + return oss.str(); +} + + // n-gram simple // @@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft( // maximum number of counted values of a ngram map value. #define COMMON_NGRAM_MAX_VALUE_COUNT 16380 -static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length); - void common_ngram_map_draft(common_ngram_map & map, const llama_tokens & inp, llama_token sampled, llama_tokens & draft) { @@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) { n_accepted, curr_value.n_accepted); curr_value.n_accepted = n_accepted; } - -// Helper functions. -// - -// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. 
-std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { - std::ostringstream oss; - oss << '['; - for (size_t i = 0; i < length; ++i) { - if (i > 0) { - oss << ", "; - } - oss << inp[start + i]; - } - oss << ']'; - return oss.str(); -} - diff --git a/common/ngram-map.h b/common/ngram-map.h index bf91883f0c..b365034ac5 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -11,6 +11,7 @@ // #include "llama.h" +#include "common.h" #include diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp new file mode 100644 index 0000000000..76f7257f61 --- /dev/null +++ b/common/ngram-mod.cpp @@ -0,0 +1,60 @@ +#include "ngram-mod.h" + +// +// common_ngram_mod +// + +common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) { + entries.resize(size); + + reset(); +} + +size_t common_ngram_mod::idx(const entry_t * tokens) const { + size_t res = 0; + + for (size_t i = 0; i < n; ++i) { + res = res*6364136223846793005ULL + tokens[i]; + } + + res = res % entries.size(); + + return res; +} + +void common_ngram_mod::add(const entry_t * tokens) { + const size_t i = idx(tokens); + + if (entries[i] == EMPTY) { + used++; + } + + entries[i] = tokens[n]; +} + +common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const { + const size_t i = idx(tokens); + + return entries[i]; +} + +void common_ngram_mod::reset() { + std::fill(entries.begin(), entries.end(), EMPTY); + used = 0; +} + +size_t common_ngram_mod::get_n() const { + return n; +} + +size_t common_ngram_mod::get_used() const { + return used; +} + +size_t common_ngram_mod::size() const { + return entries.size(); +} + +size_t common_ngram_mod::size_bytes() const { + return entries.size() * sizeof(entries[0]); +} diff --git a/common/ngram-mod.h b/common/ngram-mod.h new file mode 100644 index 0000000000..cf3c89c915 --- /dev/null +++ b/common/ngram-mod.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +// +// common_ngram_mod +// ref: https://github.com/ggml-org/llama.cpp/pull/19164 +// + +// basic n-gram hasher +struct common_ngram_mod { + using entry_t = int32_t; + + static constexpr entry_t EMPTY = -1; + + common_ngram_mod(uint16_t n, size_t size); + + size_t idx(const entry_t * tokens) const; + void add(const entry_t * tokens); + entry_t get(const entry_t * tokens) const; // return -1 if not found + + void reset(); + + size_t get_n() const; + size_t get_used() const; + + size_t size() const; + size_t size_bytes() const; + +private: + size_t n; // ngram size to hash + + size_t used; + + std::vector entries; +}; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3f314b5d57..a1a3b51c13 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -6,6 +6,7 @@ #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" +#include "ngram-mod.h" #include "sampling.h" #include @@ -23,6 +24,7 @@ const std::vector common_speculative_types = { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE }; @@ -33,6 +35,7 @@ const std::map common_speculative_typ {"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, + {"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD}, {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; @@ -110,6 +113,8 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const 
enum common_speculative_type type; + // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens + // TODO: add n_call_begin, n_call_accept size_t drafts_call_count = 0; // number of times this implementation was called. size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. @@ -119,6 +124,8 @@ struct common_speculative_state { // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. + // TODO: rename to t_draft_us + // TODO: add t_begin_us, t_accept_us int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds. common_speculative_state(enum common_speculative_type type) : type(type) {} @@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state { } }; +struct common_speculative_state_ngram_mod : public common_speculative_state { + common_ngram_mod & mod; + + // the last position in the prompt that was added to the ngram container + size_t i_last = 0; + + // length of the last drafted n‑gram (number of tokens returned by draft) + size_t n_draft_last = 0; + + // consecutive accept rounds with low acceptance fraction (< 0.5) + int n_low = 0; + + // enable trace logging if LLAMA_TRACE is set + const bool verbose; + + common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod) + : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) { + static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); + } + + void begin(const llama_tokens & prompt) override { + i_last = 0; + + n_draft_last = 0; + + const size_t n = mod.get_n(); + + if (prompt.size() < n) { + return; + } + + for (size_t i = 0; i < prompt.size() - n; ++i) { + mod.add(prompt.data() + i); + } + + i_last = prompt.size() - n; + + const double f = (double)mod.get_used() / (double)mod.size(); + LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f); + + constexpr double f_thold = 0.25; + if (f > f_thold) { + LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold); + + mod.reset(); + } + } + + void draft( + const common_params_speculative & params, + const llama_tokens & prompt_tgt, + llama_token id_last, + llama_tokens & result) override { + GGML_UNUSED(params); + + n_draft_last = 0; + + const size_t cur_len = prompt_tgt.size(); + if (cur_len < mod.get_n()) { + return; + } + + const size_t n = mod.get_n(); + + // add new ngrams in chunks + if (i_last + 32 < cur_len) { + for (size_t i = i_last; i < cur_len - n; ++i) { + mod.add(prompt_tgt.data() + i); + } + + i_last = cur_len - n; + } + + result.resize(n + params.n_max); + for (size_t i = 0; i < n - 1; ++i) { + result[i] = prompt_tgt[cur_len - n + 1 + i]; + } + result[n - 1] = id_last; + + for (int i = 0; i < params.n_max; ++i) { + const llama_token token = mod.get(result.data() + i); + if (token == common_ngram_mod::EMPTY) { + if (i < params.n_min) { + result.clear(); + return; + } + + result.resize(n + i); + break; + } + result[n + i] = token; + } + + // only return the m tokens that were drafted + for (size_t i = 0; n + i < result.size(); ++i) { + result[i] = result[n + i]; + } + result.resize(result.size() - n); + + // store length of drafted n‑gram for later acceptance analysis + n_draft_last = result.size(); + } 
+ + void accept(uint16_t n_accepted) override { + if (verbose) { + LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last); + } + + // compute acceptance fraction if we have a recorded draft length + if (n_draft_last > 0) { + const double f_acc = (double)n_accepted / (double)n_draft_last; + if (f_acc < 0.5) { + n_low++; + if (n_low >= 3) { + LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + + mod.reset(); + n_low = 0; + } + } else { + n_low = 0; + } + } + } +}; + struct common_speculative_state_ngram_cache : public common_speculative_state { uint16_t n_draft; bool save_dynamic; @@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v"; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod"; case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache"; default: return "unknown"; } @@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // initialization of the speculative decoding system // common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt) { + common_params_speculative & params, + llama_context * ctx_tgt) { llama_context * ctx_dft = nullptr; if (params.model_dft) { ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft); @@ -687,6 +821,7 @@ common_speculative * common_speculative_init( bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD); // In a more complex implementation we could use the same implementation but with different parameters. // This was initially used in PR-18471 but removed to simplify the code. @@ -701,6 +836,22 @@ common_speculative * common_speculative_init( // This implementation can guess tokens with high acceptance rate but is more expensive. 
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); } + if (has_ngram_mod) { + // shared instance for all speculative decoding contexts + if (!params.ngram_mod) { + params.ngram_mod = std::make_shared(params.ngram_size_n, 4*1024*1024); + + LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__, + params.ngram_size_n, params.ngram_mod->size(), + (float)(params.ngram_mod->size_bytes())/1024/1024); + + if (params.ngram_size_n < 16) { + LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n); + } + } + + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params)); + } if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } @@ -758,6 +909,11 @@ common_speculative * common_speculative_init( )); break; } + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { + GGML_ASSERT(config.params.ngram_mod); + impls.push_back(std::make_unique(config.type, *config.params.ngram_mod)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: { auto state = create_state_ngram_cache( params.lookup_cache_static, params.lookup_cache_dynamic, config); @@ -822,8 +978,7 @@ llama_tokens common_speculative_draft( if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, - common_speculative_type_to_str(impl.get()->type).c_str(), - prompt_tgt.size(), + common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), impl.get()->drafts_call_count, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats @@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } + // TODO: report time for begin() and accept() LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->drafts_call_count, diff --git a/common/speculative.h b/common/speculative.h index 9e1888e4be..76fe6bb7bc 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string std::string common_speculative_type_to_str(enum common_speculative_type type); common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt); + common_params_speculative & params, + llama_context * ctx_tgt); void common_speculative_free(common_speculative * spec); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 1ca4e3cc0e..6f26fc9a9b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -705,6 +705,11 @@ private: params_base.n_cache_reuse = 0; SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } + + if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { + params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; + SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled"); + } } if (!llama_memory_can_shift(llama_get_memory(ctx))) { @@ -754,9 +759,9 @@ private: SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); return false; } - SRV_WRN("%s", "speculative decoding context initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context initialized\n"); } else { - SRV_WRN("%s", 
"speculative decoding context not initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context not initialized\n"); } } From 13f3ebfae118e54716658d44ca9884e726c749c2 Mon Sep 17 00:00:00 2001 From: Simon Redman Date: Fri, 30 Jan 2026 11:27:16 -0500 Subject: [PATCH 11/17] Correctly fetch q8_1 quantize pipeline in test as needed by 8a3519b (#19194) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 3852867c29..a99375c088 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -11956,7 +11956,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } if (mmq) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it); + vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it); } ggml_pipeline_allocate_descriptor_sets(ctx); From bbada8bfb9bce17eafb0af5ded22bf96ba57f778 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 19:46:10 +0200 Subject: [PATCH 12/17] server : wrap around the "id_slot" parameter (#19207) * server : wrap around the "id_slot" parameter * cont : minor --- tools/server/server-context.cpp | 41 ++++++++++++++++++--------------- tools/server/server-task.h | 2 +- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6f26fc9a9b..7f9c3c566b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -155,7 +155,7 @@ struct server_slot { double t_prompt_processing; // ms double t_token_generation; // ms - std::function callback_on_release; + std::function callback_on_release; // Speculative decoding stats int32_t n_draft_total = 0; // Total draft tokens generated @@ -767,8 +767,8 @@ private: SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); - slot.callback_on_release = [this](int slot_id) { - queue_tasks.pop_deferred_task(slot_id); + slot.callback_on_release = [this](int id_slot) { + queue_tasks.pop_deferred_task(id_slot); }; slot.reset(); @@ -896,6 +896,9 @@ private: } server_slot * get_slot_by_id(int id_slot) { + // note: allow id_slot to be out of bounds (wrap around) + id_slot = id_slot % slots.size(); + for (server_slot & slot : slots) { if (slot.id == id_slot) { return &slot; @@ -1765,7 +1768,7 @@ private: break; } - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -1803,7 +1806,7 @@ private: case SERVER_TASK_TYPE_SLOT_RESTORE: { if (!check_no_mtmd(task.id)) break; - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -1852,7 +1855,7 @@ private: if (!check_no_mtmd(task.id)) { break; } - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -3317,7 +3320,7 @@ void server_routes::init_routes() { } // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); + auto * 
res_task = dynamic_cast(result.get()); GGML_ASSERT(res_task != nullptr); // optionally return "fail_on_no_slot" error @@ -3340,8 +3343,8 @@ void server_routes::init_routes() { } std::string id_slot_str = req.get_param("id_slot"); - int id_slot; + int id_slot; try { id_slot = std::stoi(id_slot_str); } catch (const std::exception &) { @@ -3353,14 +3356,16 @@ void server_routes::init_routes() { if (action == "save") { return handle_slots_save(req, id_slot); - } else if (action == "restore") { - return handle_slots_restore(req, id_slot); - } else if (action == "erase") { - return handle_slots_erase(req, id_slot); - } else { - res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); - return res; } + if (action == "restore") { + return handle_slots_restore(req, id_slot); + } + if (action == "erase") { + return handle_slots_erase(req, id_slot); + } + + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; }; this->get_props = [this](const server_http_req &) { @@ -3903,7 +3908,7 @@ std::unique_ptr server_routes::handle_slots_save(const ser { server_task task(SERVER_TASK_TYPE_SLOT_SAVE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; task.slot_action.filename = filename; task.slot_action.filepath = filepath; rd.post_task(std::move(task)); @@ -3939,7 +3944,7 @@ std::unique_ptr server_routes::handle_slots_restore(const { server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; task.slot_action.filename = filename; task.slot_action.filepath = filepath; rd.post_task(std::move(task)); @@ -3968,7 +3973,7 @@ std::unique_ptr server_routes::handle_slots_erase(const se { server_task task(SERVER_TASK_TYPE_SLOT_ERASE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; rd.post_task(std::move(task)); } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 244470596b..a69e8f1a3d 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -153,7 +153,7 @@ struct server_task { // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE struct slot_action { - int slot_id; + int id_slot; std::string filename; std::string filepath; }; From dfd6106c84045f819760c86cc21a27804d2faf32 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 15:56:15 +0200 Subject: [PATCH 13/17] cuda : fix compile warnings (whisper/0) --- ggml/src/ggml-cuda/ggml-cuda.cu | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e9e9592eba..08383edb40 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3910,14 +3910,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud // Launch graph CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream())); #else + GGML_UNUSED(graph_key); graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH } } -static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) { - #ifdef USE_CUDA_GRAPH +static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) { ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->graph == nullptr) { @@ -3930,12 +3930,8 @@ static bool 
ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co } return graph->is_enabled(); -#else - GGML_UNUSED(cuda_ctx); - GGML_UNUSED(graph_key); - return false; -#endif // USE_CUDA_GRAPH } +#endif // USE_CUDA_GRAPH static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; From d9a2a4bcaa071d730bb1ab4fb411a9c93b50dd13 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 16:27:14 +0200 Subject: [PATCH 14/17] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index c838276158..81e79a9470 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -ebc3a0f4a56be1c9424a89fbec09962ac34fde85 +a8db410a252c8c8f2d120c6f2e7133ebe032f35d From 971facc38e2544fcf2cc09368de5d1a68e33c10f Mon Sep 17 00:00:00 2001 From: shaofeiqi Date: Fri, 30 Jan 2026 10:19:27 -0800 Subject: [PATCH 15/17] opencl: add optimized q8_0 mm kernel for adreno (#18871) * Add Q8_0 OpenCL kernel Co-authored-by: yunjie * opencl: fix build for non-adreno * opencl: refactor q8_0 * opencl: enforce subgroup size of 64 for adreno for q8_0 * For A750 and older generations, subgroup size can be 64 or 128. This kernel assumes subgroup size 64. * opencl: suppress warning when adreno kernels are disabled --------- Co-authored-by: yunjie Co-authored-by: Li He --- ggml/src/ggml-opencl/CMakeLists.txt | 2 + ggml/src/ggml-opencl/ggml-opencl.cpp | 464 +++++++++++++++++- ggml/src/ggml-opencl/kernels/cvt.cl | 31 ++ .../gemv_noshuffle_general_q8_0_f32.cl | 195 ++++++++ .../kernels/mul_mm_q8_0_f32_8x4.cl | 129 +++++ 5 files changed, 819 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl create mode 100644 ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 0259474b6e..fa5fadd112 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS mul_mm_f32_f32_l4_lm mul_mm_f16_f32_l4_lm mul_mm_q8_0_f32_l4_lm + mul_mm_q8_0_f32_8x4 + gemv_noshuffle_general_q8_0_f32 mul norm relu diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 678e40965a..4850c11d14 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { return ADRENO_GPU_GEN::A7X; } - if (strstr(device_name, "830")) { + if (strstr(device_name, "830") || + strstr(device_name, "840")) { return ADRENO_GPU_GEN::A8X; } @@ -529,7 +530,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; - cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0; + cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; cl_kernel kernel_convert_block_q4_0_noshuffle; cl_kernel kernel_restore_block_q4_0_noshuffle; @@ -696,6 +697,8 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; cl_kernel 
CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; + cl_kernel kernel_mul_mm_q8_0_f32_8x4; + cl_kernel CL_mul_mat_vec_q8_0_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS void free() { @@ -894,6 +897,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); GGML_LOG_CONT("."); @@ -2290,6 +2294,46 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mm_q8_0_f32_8x4 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_q8_8x4_gemm { + #include "mul_mm_q8_0_f32_8x4.cl.h" + }; +#else + const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl"); +#endif + backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err)); + GGML_LOG_CONT("."); + } + + // gemv_noshuffle_general_q8_0_f32 + { + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); + if (backend_ctx->has_vector_subgroup_broadcast) { + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + } + +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_CL_gemv_general { + #include "gemv_noshuffle_general_q8_0_f32.cl.h" + }; +#else + const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl"); +#endif + + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); + + CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std + " -cl-mad-enable " " -cl-fast-relaxed-math"; @@ -3745,6 +3789,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0); } +inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) { + + bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor); + + size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; + + return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27 +} + static void 
ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); @@ -4159,6 +4212,130 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, tensor->extra = extra; + // Transpose the weights and scales +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (enable_adreno_trans_weight(backend_ctx, tensor)) { + + int M = tensor->ne[1]; // ne01 + int K = tensor->ne[0]; // ne00 + + GGML_ASSERT(K % 32 == 0); + GGML_ASSERT(M % 4 == 0); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); + + // Transpose weights + size_t q_size_bytes = K * M / 4 * sizeof(float); + cl_buffer_region region; + region.origin = 0; + region.size = q_size_bytes; + cl_mem qT_d = clCreateSubBuffer( + backend_ctx->prealloc_quant_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &err); + CL_CHECK(err); + + cl_mem q_d_image1D; + cl_mem qT_d_image1D; + + cl_image_format img_fmt_1d; + cl_image_desc img_desc_1d; + + img_fmt_1d = { CL_RGBA, CL_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4 / 4; + img_desc_1d.buffer = extra->q; + q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + img_fmt_1d = { CL_RGBA, CL_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4 / 4; + img_desc_1d.buffer = qT_d; + qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + int height_q = M / 4; + int width_q = K / 4 / 4; + kernel = backend_ctx->kernel_transpose_32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q)); + + size_t local_size_q[3] = {4, 16, 1}; + size_t global_size_q[3] = {static_cast(width_q), static_cast(height_q), 1}; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + // Transpose scales + size_t d_size_bytes = M * (K / 32) * 2; + region.origin = 0; + region.size = d_size_bytes; + cl_mem dT_d = clCreateSubBuffer( + backend_ctx->prealloc_scales_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &err); + CL_CHECK(err); + + cl_mem d_d_image1D; + cl_mem dT_d_image1D; + + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_fmt_1d = { CL_R, CL_HALF_FLOAT }; + img_desc_1d.image_width = M * K / 32; + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.buffer = extra->d; + d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 32 / 4; + img_desc_1d.buffer = dT_d; + dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + int height_s = M / 4; + int width_s = K / 32; + + kernel = backend_ctx->kernel_transpose_16_4x1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D)); + 
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s)); + + size_t local_size_s[3] = {4, 16, 1}; + size_t global_size_s[3] = {static_cast(width_s), static_cast(height_s), 1}; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + // copy transposed buffer contents to original buffers + CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clReleaseMemObject(qT_d)); + CL_CHECK(clReleaseMemObject(dT_d)); + + CL_CHECK(clReleaseMemObject(q_d_image1D)); + CL_CHECK(clReleaseMemObject(d_d_image1D)); + CL_CHECK(clReleaseMemObject(qT_d_image1D)); + CL_CHECK(clReleaseMemObject(dT_d_image1D)); + } // end transpose +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + return; } if (tensor->type == GGML_TYPE_Q6_K) { @@ -4448,6 +4625,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, ggml_nbytes(tensor), NULL, &err); CL_CHECK(err); +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (enable_adreno_trans_weight(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + GGML_ASSERT(tensor->ne[2] == 1); // ??? + GGML_ASSERT(tensor->ne[3] == 1); // ??? + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), 1, 1}; + size_t local_work_size[3] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } +#endif cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); @@ -7947,6 +8154,252 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten CL_CHECK(clReleaseMemObject(D_sub_buffer)); } +static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const enum ggml_type src0t = src0->type; + const enum ggml_type src1t = src1->type; + + GGML_ASSERT(src0t == GGML_TYPE_Q8_0); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + + GGML_ASSERT(src1->view_offs == 0); + GGML_ASSERT(dst->view_offs == 
0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + + const int ne10 = src1->ne[0]; + const int ne12 = src1->ne[2]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 == ne10); + GGML_ASSERT((ne00 % 32) == 0); + GGML_ASSERT(ne0 == ne01); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + // init CL objects + cl_int status; + cl_image_format img_fmt_1d; + cl_image_desc img_desc_1d; + cl_buffer_region region; + cl_mem A_image1d; + cl_mem B_image1d; + cl_mem B_sub_buffer; + cl_mem S_image1d; + + cl_mem D_image1d; + cl_mem D_sub_buffer; + + int M = ne01; + int N = ne1; + int K = ne00; + + // create an image for A + img_fmt_1d = { CL_R, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float + img_desc_1d.buffer = extra0_q8_0->q; + A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // create an image for Scale + img_fmt_1d = { CL_R, CL_HALF_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 32; // Block size is 32 + img_desc_1d.buffer = extra0_q8_0->d; + S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // create a sub_buffer for B + region.origin = (extra1->offset); // + src1->view_offs); + region.size = K * N * sizeof(float); + B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create an image for B from sub_buffer: RGBA (OCL) + img_fmt_1d = {CL_RGBA, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = K * N / 4; + img_desc_1d.buffer = B_sub_buffer; + B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // Create subbuffer and image1d_buffer for dst + region.origin = (extrad->offset); // + dst->view_offs; + region.size = M * N * sizeof(float); + D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + img_fmt_1d = {CL_R, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * N; + img_desc_1d.buffer = D_sub_buffer; + D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + size_t local_work_size[3] = {1, 1, 1}; + size_t global_work_size[3] = {1, 1, 1}; + + if (N == 1) { + kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32; + + int r2 = 1; + int r3 = 1; + cl_uint k_arg = 0; + + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); + 
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3)); + + size_t wavesize = backend_ctx->adreno_wave_size; + local_work_size[0] = wavesize; + local_work_size[1] = 4; // reduce factor + local_work_size[2] = 1; + + global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize; + global_work_size[1] = 4; // reduce factor + global_work_size[2] = 1; + } else { + cl_ulong offsetd = extrad->offset + dst->view_offs; + cl_mem B_image1d_trans = nullptr; + // for B transpose + cl_mem B_d = nullptr; + int padding; + + //how many extra elements beyond multiple of 8 + int extra_elements = N % 8; + + //how much padding to add + padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // Specify the starting offset (in bytes) + region.origin = 0; + // Specify the size of the sub-buffer (divide by 2 for FP16) + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + B_d = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16) + cl_image_desc image_desc_B_d_output = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(K * (N + padding)/4), + 0, 0, 0, 0, 0, 0, 0, { B_d } + }; + B_image1d_trans = clCreateImage( + context, + 0, + &image_format_B_d_output, + &image_desc_B_d_output, + NULL, + &status); + CL_CHECK(status); + + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_size_t[2] = { 1, 16 }; + size_t global_size_t[2] = { + static_cast(width_B), + static_cast(padded_height_B) + }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); + + kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4; + + int N_with_padding = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd)); + + global_work_size[0] = (size_t)(N + 7) / 8; + global_work_size[1] = (size_t)(M + 3) / 4; + global_work_size[2] = 1; + + local_work_size[0] = 2; + local_work_size[1] = 128; + local_work_size[2] = 1; + } + + // enqueue kernel with profiling + 
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(A_image1d)); + CL_CHECK(clReleaseMemObject(B_sub_buffer)); + CL_CHECK(clReleaseMemObject(B_image1d)); + CL_CHECK(clReleaseMemObject(S_image1d)); + CL_CHECK(clReleaseMemObject(D_sub_buffer)); + CL_CHECK(clReleaseMemObject(D_image1d)); +#else + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -8064,6 +8517,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int padding; // <--------------------------------------------> // + // q8_0 x fp32 + if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 && + enable_adreno_trans_weight(backend_ctx, src0)) { + ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst); + return; + } + // q4_0 x fp32 if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { // TODO: remove duplicate definitions of image description + format -- move to top diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index adf576a839..9fb434713d 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0( } } +kernel void kernel_restore_block_q8_0_trans( + global uchar * src_q, + global half * src_d, + global block_q8_0 * dst, + uint ne00, + uint ne01 +){ + uint num_blk_per_row = ne00 / QK8_0; + + global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row; + global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed + global half * d = (global half *) src_d + get_global_id(0); + + for (uint blk = 0; blk < num_blk_per_row; blk++) { + b->d = *d; + + for (uint i = 0; i < QK8_0; i+=4) { + b->qs[i] = q[0]; + b->qs[i+1] = q[1]; + b->qs[i+2] = q[2]; + b->qs[i+3] = q[3]; + + q += 4 * ne01; // M stride + } + + d += ne01; + + b++; + } +} + //------------------------------------------------------------------------------ // kernel_convert_block_q6_K // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). 
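For readers tracing the Adreno q8_0 path: kernel_restore_block_q8_0_trans above undoes a column-interleaved weight layout (4-byte quant groups strided by 4*ne01, per-block scales strided by ne01). The host-side sketch below is inferred from that kernel's indexing only; the helper name pack_q8_0_transposed and its signature are illustrative and not part of this patch.

#include <cstdint>
#include <cstring>

// Sketch: pack row-major Q8_0 data (quant bytes + per-block half scales)
// into the transposed layout that kernel_restore_block_q8_0_trans reads back.
// M = ne01 (rows), K = ne00 (columns), block size QK8_0 = 32.
static void pack_q8_0_transposed(const int8_t   * qs_rowmajor,  // M*K quant bytes
                                 const uint16_t * d_rowmajor,   // M*(K/32) half scales
                                 int8_t         * q_trans,
                                 uint16_t       * d_trans,
                                 int M, int K) {
    const int nblk = K / 32;
    for (int r = 0; r < M; ++r) {
        for (int blk = 0; blk < nblk; ++blk) {
            // scales: one per block, strided by M in the transposed buffer
            d_trans[blk * M + r] = d_rowmajor[r * nblk + blk];
            // quants: eight 4-byte groups per block, each group strided by 4*M
            for (int g = 0; g < 8; ++g) {
                const int src = r * K + blk * 32 + g * 4;
                const int dst = (blk * 8 + g) * 4 * M + r * 4;
                memcpy(q_trans + dst, qs_rowmajor + src, 4);
            }
        }
    }
}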
diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl new file mode 100644 index 0000000000..f944ef3a99 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl @@ -0,0 +1,195 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#endif + +#define QK8_0 32 +#define N_SIMDGROUP 4 + +#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \ + float shared_y; \ + char elem; \ + \ + shared_y = sub_group_broadcast(y.s0, 0); \ + elem = (char)(bits8.s0 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 0); \ + elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 0); \ + elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 0); \ + elem = (char)((bits8.s0 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 0); \ + elem = (char)(bits8.s1 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 0); \ + elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 0); \ + elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 0); \ + elem = (char)((bits8.s1 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 1); \ + elem = (char)(bits8.s2 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 1); \ + elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 1); \ + elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 1); \ + elem = (char)((bits8.s2 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 1); \ + elem = (char)(bits8.s3 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 1); \ + elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 1); \ + elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 1); \ + elem = (char)((bits8.s3 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 2); \ + elem = (char)(bits8.s4 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 2); \ + elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 2); \ + elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \ + 
total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 2); \ + elem = (char)((bits8.s4 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 2); \ + elem = (char)(bits8.s5 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 2); \ + elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 2); \ + elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 2); \ + elem = (char)((bits8.s5 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 3); \ + elem = (char)(bits8.s6 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 3); \ + elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 3); \ + elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 3); \ + elem = (char)((bits8.s6 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 3); \ + elem = (char)(bits8.s7 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 3); \ + elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 3); \ + elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 3); \ + elem = (char)((bits8.s7 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +__kernel void kernel_gemv_noshuffle( + __read_only image1d_buffer_t src0_q, // quantized A + global half * src0_d, // A scales + __read_only image1d_buffer_t src1, // B + ulong offset1, // offset to B (0) + global float * dst, // C + ulong offsetd, // offset to C + int ne00, // K + int ne01, // M + int ne02, // 1 + int ne10, // K + int ne12, // 1 + int ne0, // M + int ne1, // N + int r2, // 1 + int r3) +{ + uint groupId = get_local_id(1); + uint gid = get_global_id(0); + ushort slid = get_sub_group_local_id(); + + uint K = ne00; + uint M = ne01; + + uint LINE_STRIDE_A = M; + uint BLOCK_STRIDE_A = 8 * M; // 32 / 4 = 8 + + __private uint8 regA; + __private half regS; + __private float8 regB; + + __private float totalSum = (float)(0.0f); + + // loop along K in block granularity, skip 4 blocks every iter + #pragma unroll 1 /* tell compiler not to unroll */ + for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) { + regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of one rows + // first 4 fibers in each wave load 8 B values to its private scope + if (slid < 4) { + regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); + regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); + } + + // load weights for one block in consecutive rows + regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; + regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; + regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + 
LINE_STRIDE_A * 2)).x; + regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; + regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; + regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; + regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; + regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; + + dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB); + } + + // reduction in local memory, assumes #wave=4 + __local float reduceLM[SIMDGROUP_WIDTH * 3]; + if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum; + if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum; + if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum; + barrier(CLK_LOCAL_MEM_FENCE); + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 outputs per fiber in wave 0 + if (groupId == 0) { + dst = (global float*)((global char*)dst + offsetd); + dst[gid] = totalSum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl new file mode 100644 index 0000000000..51ce2121ce --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl @@ -0,0 +1,129 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_128 +#endif + +kernel void kernel_mul_mm_q8_0_f32_8x4( + global const uint * src0_q, + global const half * src0_d, + __read_only image1d_buffer_t src1, + global float * dst, + int k, + int m, + int n, + int n_no_padding, + ulong offsetd +) { + + int m_4 = m >> 2; + int n_4 = n >> 2; + + int gy = get_global_id(0); + int gx = get_global_id(1); + int gx_2 = gx << 2; + dst = (global float *)((global char*)dst + offsetd); + + + half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + half8 B; + half4 deq; + + __global const uint* wptr = src0_q + gx_2; + __global const half* sptr = src0_d + gx_2; + + for (int i = 0; i < k; i += 4) { + uint4 pack4 = vload4(0, wptr + (i / 4) * m); + half4 scale = vload4(0, sptr + (i / 32) * m); + + char4 p0 = as_char4(pack4.s0); + char4 p1 = as_char4(pack4.s1); + char4 p2 = as_char4(pack4.s2); + char4 p3 = as_char4(pack4.s3); + + // ------------------- j = 0 (k = i+0) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1); + + half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale; + + c0 += B * wj0.s0; + c1 += B * wj0.s1; + c2 += B * wj0.s2; + c3 += B * wj0.s3; + + // ------------------- j = 1 (k = i+1) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1); + + half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale; + + c0 += B * wj1.s0; + c1 += B * wj1.s1; + c2 += B * wj1.s2; + c3 += B * wj1.s3; + + // ------------------- j = 2 (k = i+2) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1); + + half4 wj2 = 
convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale; + + c0 += B * wj2.s0; + c1 += B * wj2.s1; + c2 += B * wj2.s2; + c3 += B * wj2.s3; + + // ------------------- j = 3 (k = i+3) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1); + + half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale; + + c0 += B * wj3.s0; + c1 += B * wj3.s1; + c2 += B * wj3.s2; + c3 += B * wj3.s3; + } + + int idx = (gy << 3) * m + (gx << 2); + + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); + } +} From 49277958109f738eb5377242e46734788ba21be8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 30 Jan 2026 21:27:27 +0200 Subject: [PATCH 16/17] ngram-mod : fix build [no ci] (#19216) --- common/ngram-mod.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common/ngram-mod.h b/common/ngram-mod.h index cf3c89c915..7af92e9dde 100644 --- a/common/ngram-mod.h +++ b/common/ngram-mod.h @@ -2,6 +2,7 @@ #include #include +#include // // common_ngram_mod From 1488339138d609139c4400d1b80f8a5b1a9a203c Mon Sep 17 00:00:00 2001 From: Daniele Pinna <72076821+pestopoppa@users.noreply.github.com> Date: Fri, 30 Jan 2026 21:10:24 +0100 Subject: [PATCH 17/17] lookup, lookahead: fix crash when n_ctx not specified (#18729) * lookup, lookahead: fix crash when n_ctx not specified Since PR #16653 (Dec 15, 2025), the default n_ctx is 0 to enable automatic GPU memory fitting. This causes llama-lookup and llama-lookahead to crash when run without explicit -c flag: GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded") Root cause: Both examples use params.n_ctx directly for batch initialization, but params.n_ctx remains 0 even after the context is properly initialized to n_ctx_train internally. Bug history: - Nov 2023: lookahead.cpp created (PR #4207) with params.n_ctx pattern - Dec 2023: lookup.cpp created (PR #4484) with same pattern - Nov 2024: default n_ctx changed to 4096 (PR #10136) - bug dormant - Dec 2025: default n_ctx changed to 0 (PR #16653) - bug activated The bug was dormant for 2+ years because params.n_ctx defaulted to 512, then 4096. PR #16653 changed it to 0 for GPU auto-fitting, triggering the crash. Fix: Use llama_n_ctx(ctx) to get the actual runtime context size, matching the pattern already used elsewhere in lookup.cpp (line 72) and in speculative.cpp/speculative-simple.cpp. Tested: llama-lookup now works without -c flag (12.5% acceptance on Gemma-3-1B). Note: llama-lookahead has a separate pre-existing issue with sequence initialization (n_seq_max=1 vs W+G+1 needed) that is unrelated to this fix. 
* lookahead: fix n_seq_max and kv_unified configuration Lookahead decoding requires: - W + G + 1 = 31 sequences for parallel Jacobi decoding - Unified KV cache for coupled sequences in batch splitting These requirements were broken after PR #14482 changed validation logic. Consolidates fix from PR #18730 per maintainer request. Commit message drafted with Claude. --- examples/lookahead/lookahead.cpp | 8 +++++++- examples/lookup/lookup.cpp | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index f54cfdd77f..aa6efa62b3 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -50,6 +50,12 @@ int main(int argc, char ** argv) { const int N = 5; // n-gram size const int G = 15; // max verification n-grams + // lookahead requires W + G + 1 sequences for parallel Jacobi decoding + params.n_parallel = W + G + 1; + + // unified KV cache is required for coupled sequences in batch splitting + params.kv_unified = true; + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -115,7 +121,7 @@ int main(int argc, char ** argv) { // seq_id == 0 : the current input token // seq_id [1, W] : tokens from the past N - 1 Jacobi iterations // seq_id [W + 1, W + G] : verification n-grams - llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); + llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1); // target model sampling context struct common_sampler * smpl = common_sampler_init(model, params.sampling); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8e73138a5f..c7552ddde1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -106,7 +106,7 @@ int main(int argc, char ** argv){ std::vector draft; - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1); const auto t_dec_start = ggml_time_us();
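A minimal sketch of the batch-sizing pattern this final patch establishes, for readers skimming the diffs: make_example_batch is a hypothetical helper (not in the patch), assuming the standard llama.cpp C API used by the examples above.

#include "llama.h"

// params.n_ctx can be 0 when -c is not given, while the context itself is
// always sized internally (e.g. to n_ctx_train), so size the batch from the
// runtime context rather than from the CLI parameter.
static llama_batch make_example_batch(llama_context * ctx) {
    const uint32_t n_ctx = llama_n_ctx(ctx);
    return llama_batch_init(n_ctx, /*embd =*/ 0, /*n_seq_max =*/ 1);
}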