From 0a16bf52e6874369cb4fd2bdc4863ef984b688e7 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 9 Sep 2025 01:23:46 +0800 Subject: [PATCH 01/26] CUDA: generate_cu_files.py - add missing mxfp4 (#15880) --- ggml/src/ggml-cuda/template-instances/generate_cu_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index 3428113dc8..a92a9e52cf 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -24,7 +24,7 @@ TYPES_MMQ = [ "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K", "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S", - "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS" + "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4" ] SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. From e68aa10d8f3d26fdad5b912540362d79de5460e3 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 8 Sep 2025 13:10:07 -0500 Subject: [PATCH 02/26] vulkan: sort graph to allow more parallel execution (#15850) * vulkan: sort graph to allow more parallel execution Add a backend proc to allow the backend to modify the graph. The vulkan implementation looks at which nodes depend on each other and greedily reorders them to group together nodes that don't depend on each other. It only reorders the nodes, doesn't change the contents of any of them. With #15489, this reduces the number of synchronizations needed. 
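As a rough illustration of the greedy grouping described above — placeholder names only (`Node`, `depends_on`, `greedy_reorder` are not the real ggml types or functions), and the actual implementation with view-node handling and fusion exceptions is in the ggml-vulkan.cpp hunk further down:

```cpp
// Minimal sketch: repeatedly take the first unscheduled node, then greedily
// pull in nearby nodes that do not depend on any still-unscheduled node, so
// the nodes in each group are mutually independent and can run in parallel.
#include <algorithm>
#include <vector>

struct Node {
    std::vector<int> src; // indices of the nodes this node reads from
};

// true if node `a` reads the output of node `b`
static bool depends_on(const std::vector<Node> & nodes, int a, int b) {
    const std::vector<int> & src = nodes[a].src;
    return std::find(src.begin(), src.end(), b) != src.end();
}

// Returns a permutation of 0..n-1; node contents are never modified.
static std::vector<int> greedy_reorder(const std::vector<Node> & nodes, int window = 20) {
    const int n = (int) nodes.size();
    std::vector<int>  order;
    std::vector<bool> used(n, false);

    int first_unused = 0;
    while (first_unused < n) {
        std::vector<int> group = { first_unused };

        // scan a small window ahead for nodes whose inputs are all already scheduled
        for (int j = first_unused + 1; j < std::min(first_unused + window, n); ++j) {
            if (used[j]) {
                continue;
            }
            bool ok = true;
            for (int c = first_unused; c < j && ok; ++c) {
                ok = used[c] || !depends_on(nodes, j, c);
            }
            if (ok) {
                group.push_back(j);
            }
        }

        for (int idx : group) {
            order.push_back(idx);
            used[idx] = true;
        }
        while (first_unused < n && used[first_unused]) {
            first_unused++;
        }
    }
    return order;
}
```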
* call optimize_graph per-split --- ggml/src/ggml-backend-impl.h | 3 + ggml/src/ggml-backend.cpp | 11 +++ ggml/src/ggml-blas/ggml-blas.cpp | 1 + ggml/src/ggml-cann/ggml-cann.cpp | 1 + ggml/src/ggml-cpu/ggml-cpu.cpp | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 1 + ggml/src/ggml-metal/ggml-metal.m | 1 + ggml/src/ggml-opencl/ggml-opencl.cpp | 1 + ggml/src/ggml-rpc/ggml-rpc.cpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 1 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 130 +++++++++++++++++++++++++++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 + ggml/src/ggml-zdnn/ggml-zdnn.cpp | 1 + 13 files changed, 154 insertions(+) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index c36c12d657..2db5c4e0f8 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -114,6 +114,9 @@ extern "C" { void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); // wait for an event on on a different stream void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + + // (optional) sort/optimize the nodes in the graph + void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph); }; struct ggml_backend { diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index f615ab4bee..7646f3f134 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) backend->iface.event_wait(backend, event); } +static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + GGML_ASSERT(backend); + if (backend->iface.optimize_graph != NULL) { + backend->iface.optimize_graph(backend, cgraph); + } +} + // Backend device const char * ggml_backend_dev_name(ggml_backend_dev_t device) { @@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); + // Optimize this split of the graph. This needs to happen before we make graph_copy, + // so they are in sync. 
+ ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph); + // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { assert(graph_copy->size > (graph_copy->n_nodes + 1)); diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index aeac2e5744..cdfc5a9bc2 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = { /* .graph_compute = */ ggml_backend_blas_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_blas_guid(void) { diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 2f9f373f54..2e47ad90af 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2690,6 +2690,7 @@ static const ggml_backend_i ggml_backend_cann_interface = { /* .graph_compute = */ ggml_backend_cann_graph_compute, /* .event_record = */ ggml_backend_cann_event_record, /* .event_wait = */ ggml_backend_cann_event_wait, + /* .optimize_graph = */ NULL, }; /** diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 0c15165f1b..2b81f8b9af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = { /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_cpu_guid(void) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0c6bd36396..efca2c775a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3135,6 +3135,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_cuda_guid() { diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index eeb6c9d4b7..e76fb71263 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -6275,6 +6275,7 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_metal_guid(void) { diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 727163b7fd..b188c5af34 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -2838,6 +2838,7 @@ static ggml_backend_i ggml_backend_opencl_i = { /* .graph_compute = */ ggml_backend_opencl_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; ggml_backend_t ggml_backend_opencl_init(void) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index e84ff93efc..d4833068d0 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -795,6 +795,7 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; 
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 877fbf7e86..619ccaefc0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4063,6 +4063,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .event_record = */ ggml_backend_sycl_event_record, /* .event_wait = */ ggml_backend_sycl_event_wait, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_sycl_guid() { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e6245d0cfc..f0fa9e668c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -583,6 +583,7 @@ struct vk_device_struct { bool disable_fusion; bool disable_host_visible_vidmem; bool allow_sysmem_fallback; + bool disable_optimize_graph; #ifdef GGML_VULKAN_MEMORY_DEBUG std::unique_ptr memory_logger; @@ -3592,6 +3593,9 @@ static vk_device ggml_vk_get_device(size_t idx) { const char* GGML_VK_ALLOW_SYSMEM_FALLBACK = getenv("GGML_VK_ALLOW_SYSMEM_FALLBACK"); device->allow_sysmem_fallback = GGML_VK_ALLOW_SYSMEM_FALLBACK != nullptr; + const char* GGML_VK_DISABLE_OPTIMIZE_GRAPH = getenv("GGML_VK_DISABLE_OPTIMIZE_GRAPH"); + device->disable_optimize_graph = GGML_VK_DISABLE_OPTIMIZE_GRAPH != nullptr; + bool fp16_storage = false; bool fp16_compute = false; bool maintenance4_support = false; @@ -11853,6 +11857,131 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg UNUSED(backend); } +// Sort the graph for improved parallelism. +static void ggml_vk_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * graph) +{ + VK_LOG_DEBUG("ggml_vk_optimize_graph(" << graph->n_nodes << " nodes)"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + if (ctx->device->disable_optimize_graph) { + return; + } + + auto const &is_empty = [](ggml_tensor * node) -> bool { + return node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; + }; + + auto const &is_src_of = [](const ggml_tensor *dst, const ggml_tensor *src) -> bool { + for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { + if (dst->src[s] == src) { + return true; + } + } + // implicit dependency if they view the same tensor + const ggml_tensor *dst2 = dst->view_src ? dst->view_src : dst; + const ggml_tensor *src2 = src->view_src ? src->view_src : src; + if (dst2 == src2) { + return true; + } + return false; + }; + + // This function tries to reorder the graph to allow nodes to run in parallel. + // This helps with small batches, but for large batches its a slowdown, probably + // due to cache contention. So only reorder if the majority of nodes have few rows. + int num_small_nodes = 0; + int num_counted_nodes = 0; + for (int i = 0; i < graph->n_nodes; ++i) { + if (!is_empty(graph->nodes[i]) && + graph->nodes[i]->op != GGML_OP_SET_ROWS) { + if (ggml_nrows(graph->nodes[i]) <= 8) { + num_small_nodes++; + } + num_counted_nodes++; + } + } + if (num_small_nodes < num_counted_nodes / 2) { + return; + } + + std::vector new_order; + std::vector used(graph->n_nodes, false); + int first_unused = 0; + while (first_unused < graph->n_nodes) { + std::vector current_set; + + // First, grab the next unused node. + current_set.push_back(first_unused); + + // Loop through the next N nodes. 
Grab any that don't depend on other nodes that + // haven't already been run. Nodes that have already been run have used[i] set + // to true. Allow nodes that depend on the previous node if it's a fusion pattern + // that we support (e.g. RMS_NORM + MUL). + // This first pass only grabs "real" (non-view nodes). Second pass grabs view nodes. + // The goal is to not interleave real and view nodes in a way that breaks fusion. + const int NUM_TO_CHECK = 20; + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (is_empty(graph->nodes[j])) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !(j == c+1 && c == current_set.back() && graph->nodes[c]->op == GGML_OP_RMS_NORM && graph->nodes[j]->op == GGML_OP_MUL)) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + } + } + // Second pass grabs view nodes. + // Skip this if it would break a fusion optimization (don't split up add->rms_norm or add->add). + if (graph->nodes[current_set.back()]->op != GGML_OP_ADD) { + for (int j = first_unused+1; j < std::min(first_unused + NUM_TO_CHECK, graph->n_nodes); ++j) { + if (used[j]) { + continue; + } + if (!is_empty(graph->nodes[j])) { + continue; + } + bool ok = true; + for (int c = first_unused; c < j; ++c) { + bool c_in_current_set = std::find(current_set.begin(), current_set.end(), c) != current_set.end(); + // skip views whose srcs haven't been processed. + if (!used[c] && + is_src_of(graph->nodes[j], graph->nodes[c]) && + !c_in_current_set) { + ok = false; + break; + } + } + if (ok) { + current_set.push_back(j); + } + } + } + + // Push the current set into new_order + for (auto c : current_set) { + new_order.push_back(graph->nodes[c]); + used[c] = true; + } + while (first_unused < graph->n_nodes && used[first_unused]) { + first_unused++; + } + } + // Replace the graph with the new order. 
+ for (int i = 0; i < graph->n_nodes; ++i) { + graph->nodes[i] = new_order[i]; + } +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -11868,6 +11997,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ ggml_vk_optimize_graph, }; static ggml_guid_t ggml_backend_vk_guid() { diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index aad27bf605..df6a3ed95a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -665,6 +665,7 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* .graph_compute = */ ggml_backend_webgpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; /* End GGML Backend Interface */ diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index 7507a52aea..a4c51ab4e7 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -586,6 +586,7 @@ static ggml_backend_i ggml_backend_zdnn_i = { /* .graph_compute = */ ggml_backend_zdnn_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, + /* .optimize_graph = */ NULL, }; static ggml_guid_t ggml_backend_zdnn_guid(void) { From fe1c92cd7bb491e9c5767c9de413f2b7dc1e832b Mon Sep 17 00:00:00 2001 From: j-k Date: Mon, 8 Sep 2025 19:57:01 +0100 Subject: [PATCH 03/26] media : add llama1 icon (#15878) Add svg and png based off llama1-icon.svg --- media/llama1-icon.png | Bin 0 -> 16045 bytes media/llama1-icon.svg | 87 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 media/llama1-icon.png create mode 100644 media/llama1-icon.svg diff --git a/media/llama1-icon.png b/media/llama1-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..0e44672e54bf3cb91a09d1a2469918e7f6333b3f GIT binary patch literal 16045 zcmc(`_dnJD|3CgfR(2AZNysLfjFOd*m6es1z0R?Zk(pWYM0WNj>u}5>viEjS!m*We z2;p=0djA98-@cbiF6GgE-tM>i{eHV$uh;wA=US>1*BP%v5JaJ__Cyzg@UT}uBt+oL z!1Qtq_>0tC&DaZqSfZ|e@DkU4+k%g`Vai4@JvTd;kCmq_Q5f&`)2NLdn_L!j|Tgpa7{$Z|3L zRVXJk4q4uWsz4aRbPR3bgb@aYI{Yvt?)sY~>@MR0cI#Tr=SzoY98Bcz9^kUQ+Ba~4 z4+HOz+Q<*K9k*^Q$Xg6}FjMH8nwqvg|2$mxD^ozD_h~w(G(5>bbVcDB1hqy^a2q?Q zmvajIXRW~SneW&K2Kz;H%56kIiY?VE*5Wfg;^u7A&^>>*O$b2*Qv${ce5HSIEKCwG z=1n$v=Uho7a^{;xp^!;}SC~zNYW*_^5{9(YYumFDd*wR-eh4?2i(plkVq(a|e5^Qc)4 zM3W#qowjWgDlF8l8|2G?9`4)rj?2(|N$Ub>|tJ{R~lloNn4rc_G(r@SEH z8t8o~)cRm;X>rNCIL0fA1gux9pq1%S)@sjCz}oCzzR>LZ$?Yp@E_;(;YD5nH{By@A z=fT#}3HhXzl`d8e;;Iu6bnxG+Z;kWFN{f!Ml!E9TZR&Lh`m5DlrZ3N~{=&)0sTK3O zrGUuC2_J%#M*oWr=bn#@Gfp40qO8rA^@~cjXDEetYfyTA$zmD>GcJsY{f6I3LyaK`=V@Rca!PpZ>Sg4W@zv3dq_mK$-~iev z`D3ffC#!+-v3}HHlrfQT@dtB@9162(c;*6M6vSsx|6Nn~p+~vi{gpl$*Ap=$PsKdu zyP5IxDyh@WvWg4b#~>X2$zSMz2%n|h-gsQZ`m2nO=d)tXxUIjRX_S@nFX58dWL$!R zx0422J{;QP82zDmeba8M4FPNce7MfuKVy&n?Dg+`)Tyo>0n8 z0uq?-{7zAgJmT@w4ej`5vwrp=ho9EhIuQ@wuBZ#}n6k+U7Wxdv@!148J-IN!>70t< zxwC~*YO99}Aby4?FDXctMr(UMFtoGafsSb~2XTb7cTcxA;puy!L#|<;OH-|#pheZ( zE_j2E#wE#tdRnvk+5h%zIBs4*o~q^0kI zJcHWS89lwvepC|D89fF)*gZ7(L8j-Zbb1m&k#4e4ZMGA2xPbG!>P7vUOqtC^TJ*)) z8~^yXXMtb-K_|8O#1ga^s5Mlvx5RYv#u(MG!*k8O!KR~#Civ6N{effB5=ML$4lWzh(_v${qc|q>#0kE} zi*hkIP}#6DO&_D4&MbPjE~YHjCSUQH)O0nev8Q`iY45}L6z1I_gL+w09s8<0_-5@i zEEVf%XzYY~nt20Kp$6mfTI~~;*)Azru#GEYYfUq-fF6IZE%4K%$@0_JcYDZNw%V%_ zS>G%|(+JboR8yVWIZGY+_-|`@dE9V1iG@?{b6ak&&&~SJQH=qe&y5B2OoxJHtR~H* 
zIjVwX>xOq9s2I3?8G9ppTvBYgr)qaGTT|tzjrlp9*27iS;auC+k+)zmj9JbZ(LZ-0 z2GIy0t!YPn-w{qNFg3J`CqJBzE-bg>!X|w581ioJz35J^?8wd;N2b_GDMREp zLKCdimH81HgZ|4Fq5k*U2vrpV$0~HJtrw>oRJpMi&7|5{Y9tG{tSHIxK-rU@UridO zGFnJoJKTi!4GpV)df(>AlW^S3?=8Z%=hHs*uhH(@tGK;vpF2+iBcJCuJ`ksV63 zQ4Kb}#N}Q3G@7Ph8*h7@cSue))ym61PjnG#GQ(fdxF!=>z-ha^9qWh|%;#ck zL~q3P`gVEo)K<6$4N^lo?@ZDoj79h+DEg~Bc1xcgw*=c&Go3RYuby686tw=%ui-ya z5$^i(dH$2xv+9{WXXRtpd!wPB?u?4OcFUld`6ytQBe}e

eAJdvhgFw|}}LOQ(2( z>tv$=_`(b2l+jRj_EC}h@gAm^^|?K}56WKso_@o+v^9cTl#ENnD2$XIV@*z+aP%za zHiehRypZcvaY@0Fv9zY($FEBI=~ORk3r2PyME=^EBSkn^RqiL;2ntqK2_TnNhnm2K z3CS9}F|}Zq)=_Sjv}cqx(LbEDU`zh=dm}c2kjG4vd()dItWk0ULoAdAA7(3&z-J-2 z(Rds`9F~j3V75naEoMErPn8=XzUHc=@o{+zq?=O={ryCVYc&dN}mQ^&VQrE?xQttWPhoHLU$paYat+ubk55 zC$r-&^kT2ST-(nPQb_asHm2}!X?x+rwUE@m&I{y=2V<+cjxhn3i0O5I_d132ba1&y<`@*$ zai4h&y-b=e95`Lq4c;|?TH|PBD7R_33vP;E?$Q0nn{Y9w+B@lLVzjzI;XdA8dX%WJ zn4dXfx`l84ICu5k{MKU>EInAsS~%#R?u+u4e*cDS9YJa8wf zu#bAEu_>?@*0@~E`}wrzxe4HURhsG&Vz%nI3xUHJO&=%Y#B*54uKf#9?0wWQ6@k6Q zroBnoW0?KB*K9+Ssq1ua4tM?JZ&Zcuw=wQCM`9Ji(1ftl?zdkUva0uZH#f@Lf{A2q zf5x8IZR<1rK6dgvtc}ZF>eH7MW+Z{NYMGj;xj9huVE*M~@96cNg?XXtYhLV9T*xQa z+Nh36YA}Vc(O`VDwl_Eta+R{rgBRb;STD2&XUrSylA4X}?X#ZB5vsD8F6KXAu?f*p z%3U>JTy3nv`uUPSDZPu>bl7Lpb1Em>j3ql?F5DMQ#J%>)(Uv)aao58uG*VuFdlTNCncJif_{@QFYsmKvvEg^H)|-}-Z`e-^Ke+vLoFEfYLb%vDMf zE9R-N?may-np6z5H|`ksb4wgvmf)z;<32rfJv`sr`@Y?1uypZ-RG1=^QhhvtpvJqM zPj9S}uZ`tF1C2(GT;FobyyQ_bu2eN(QILRy8K32;GQXJ&`d_}ILZHa%NlJc>G|$>L zf;^%Y`yJOA}DrVQ~TKAgw>=4 z-}TPur*ivK*(XFlH-zk`R)6cPDrCRh-o7}Ya^b|v6C47S6MQ^_S>ToWC%khQ4IFPTCw$;sOqk8j-i z>%;M~`6nhnHel)_#}J+*4F=E7_-uMa(`UvXSVJi7#>797i(L|*ftWbf<-KzaS7-v1 z@&5BSx@a;III+HT0&Jga|A+?34^H7zB*)@B%=q%+;r4n6JvAt345@4k93|JcEw{wm z2&UaQ++~f7;XrJ?XI&e3)<9scm`kibo3gZc%c~_$X9G8Ovdp@_Ja!V?mNqsaE1gH= zOBl-fyjUx)jNR^H;Bv)opXl`VhrINeosluWA`E8T_S`iUZv|Rc81r~ek4#sP*%y%U zFCA#{$qx2y4;Jb=0oV9ZZ8^)qO?DCHmiBE^W<29o*zWr?tp{p9GJTbBRX@@kp^#W=lJi$9r?65s1@1)7pH@+&&*Q zOde5TSz;#<9`DA*p`In(`(fg%m8pMmA{gzQqsm5zA+YdaPv6C2w78M^E;{HUxrwXj z-^pjbx2@(R!=o|_vAWS?)923>q6C!a*4bD+Ne!z;XhawEdBNQpg9lSp*nSMvrx+|n-Myo zzvi#V@A!M)uHw^uzmljHTlG}U~BfCa-x@4OMbF+hU+)H zqxLtMajzSy%F)nuGE=ikT}BsGWzo*cbreY)#3aaIf&Gdu_Nbzb#9Zu!dMJwb-fldpWo(} zbdxlFJv(>#yYHQekt*@Scxm+K`b18lv7h{7LMy$sudn=8?bHUW{Om|Tm82eH7)<~u$!=)I093}LgY zSQ=d@+3!Oh zq0Cr&NRJvrl%7@}Z{Eui^PyK&EI$ZlFOWpTCyY#L3wG@X$xka$sB5l-YmYz6KvLDm z4-8Y)BE>w+X)PP};*l?1D#=!GBrt*oA{$^J!$xiu_^#yZ_`s^H53rVN5b>Q+iYK{j zFL_^WvWTj>CCf`%a+3*WGQtZDX`E#)q%TOwQ}TiaorNJ+|^jhNa=H@4J)pvK4cgdTI-JuAN#Te$7rsBZabb zY0Laae1#Zjk+>8I#vuqC!%TLs#$RM|I7NvUN)%`m#qUUOO+z{SzL$$*a&nNRCGbUe z8y8D?PiMy`iD{aZWP7R(ATvu~8aShLH+9c>9Qi8htg~J+9R6eSeFm>@a7ha#=6-F}f`G#Zw>poMvfe9O1qMrUw^e4(1 z?)^PjMWg>kcgj<`QcdmYK9i?Y^>;SaL3U)0Z|l`6MgUN5_zP^UDw+!rreVd#KT|?Vk8+vv*tbT+ zvGzW6xI#{Zf4aCQ#Bd8V5@f1mF<~|?nY}%OW&wx^9>j6gO8j5z@ODwX>6z>y484;V zua`X!6vuGYsR)u!s#RC{bU(HBbiNL;C2->xBs>o0Rq*iPV~f^?+!494a=3#;D_-Yo z_E8ENGCwy8gqMY}r2JTMfW&7l`IjK@h3kMnrf?CBv3EDAjH_GstOj(qLJ&6-R!fQx zQa`?fSuL~Fo-6ZYRo~C@Nfy-C|K(`h$#4zD-YTXjWWaoWtRan9o0G?hwgssHIPR^w z>FS3@u-DSxU3YkVw?JLIv4_)Hyw_z)SzxNyIexlL_G4Z?%*x{L{$O&0sOi}BQjE7< z|NMI%r+{GPg|mVAj8*ux)&xb#a6^2At-X4Vf4%B}_XLAV(&NU$KJmtRrpEvvbTUDx zi86h4EGH`!*}I5Mi8Fet!ms&Ey^c{S>fZ*~mmnQ$sA{RW0$PrfkIu!-bVL8U*+~nG z^W>-0i}1*t@RemCMhBp&-}hO!k&4l#d3YCFX7Ci?Nxr*OfyF2tO0CM-o{>0 zK2MueVD>R4Wru&OG>cHKxw6Cn%~|z@P=z7XHQN2ZE?Wb8b&&m8V>x+2Ol2~8V=7Nw zCrh3ukBFeUq%y@r-Sp}_9&$Nj>^Q+-=)Pcq0;)1MQE*S>mE&Y%!34K5ZhF%r zOM$ar9~HHW5!%*fIy^N~HP`UIW$Z&t83rn%oO%-axhlbq*{VaF?#0CIvlY?mH3|Ya zr^TK#P3!p6?m6ALzl)Yn#W2y<6GGwVz|B=SMvvE-(Fe~>5?v1WZwBz&Y_{^~n91># zf-Y@7i=asW8iFf_`=J{oishs)M;P%qLP2K(`z+;?BwEcv%X*W15EX_B7FjHx%E~wxlLTF2w+VXhi6h_U=!6OC!C*6|t*!`>{8ZH7?gyy#7I2jx z*-HDvhOl!2IsaEJ!P5gDXf$S+vqo`;NsJkm#WLpm0O>v7AiFKFWO8s`q_v{PFwF2u zs&19ZzaO=}JRRWa_=(c5itt3Slsvs-fPg zDEwHmd)3X2bzG;S*yk25^sEKpu?z?1MFX%<^6>V1>(}1)d??(vilEj9`R1CRu)ubX$F*MM<;xmwV0UfL=d(Cxf4Dv5 zq!1a6)n!&aMQ)tohBnSrk-1jsa}3iP^plvul#+$QPD>DdZm)B}rUsjWY9ExFzOa${ zL$rK%ltPEOGpG@uJjHCgQ-BkQ4|=3DNnMBgy{hNGKjsl3t(w3KfKs%(v+jPzhf(bp 
zb~w;&#pf;`Fh4Ad6?^ORl#8uTtmqaIi^3S9-o^r-5iN`;cai>xEsuNOg=L4Q-AhTK z?HLy#Te*(jYqN>m#q5-l>GzWhspMvN=Y(?r{vageI;mMaePjI~9E9x_4+~#VSlb) zjFHuUW|UQgnSk@2b3_2BQT1XmU%~D7wAdNluE+Vn*Lj+@Iu`c0zW6MUM*qk{?*O*MLca1vs#a`^*>&Mp?FTl?AsNzmQ;)%AjaJn`Qh~lp}XXMP6|gIiIg&60u08An67t3}M*H z!rRCJi1IDBkV^L(QX0hmj(@zSa4)4O!imSQ;_Cu{Xcqvohgid!MD%$wIO@3>EBL=3 zlWvVy?0pANGN0Z;>~C6K*HrwtvW)CKJuArYCV?yH>YkM;YI9Pi&FR+0fK}sF5OPMb zw7Cm0Uk^=qqyg~z%N_qLxc(#bM`|yre#37h$t9ftwy^PR%j2MwjOR$uWP-A zIh%TJ@3Y(w=YpAFa?BNy#oq`W^4pV%a{JxgQ!dskaK7`ENVCdHQua;ZOAo9N zvtC#tKv9)lx&eGGQPLJ#Nk_s8iZ9b(-reK;ngaN)umpz$5Enm(1AgQ3=wZIsU#3t> zU~rsrgNz@(Re~kT&$#1geU^kcm}B%)u+(RjRISx%v1_zGSzR~zQ-~I2?YUrGpVg5A zop+LX6Ha3jYB^OS`rDblj7P*I(U^`A1U;!vGy#e8^*Ls_hnc~M$d+L%B7C_{tt^JA zj_eoP-nXSXtDY%uk8`?g@i{rUDKCiHy6jiKHZ016wdcXU=q%rnO#>){nlI1>8_zC_LaRl-_gy6SK7uxaQdk+GuHpS^TE;avTrOq6YDwiB`PnaVIM!+IG%mT zG48duo*H0r)6LmZN(lmc-?ty!1XE|e<_}G&2%P_IC7SA0J9OL}ls2iY%jHJW<0cdG z%(r;U^gUY2I@~i6PJJ2v11Pr%3&!$E6#*7mt~|hd!G1PH-+C5TBJoP)>+;n5iNluW zZRYhFlcL5yhYle|DcCNrZTOdDz+x~|{#4WOp@(0tzhPa>oQcii|s5VY`af=d}!N47PGd-yeb z*=$m1<`?!Xq4nt3L$O42TQG#Lq%>`X0!6VVtJ!U{_8^f}?wFOu3`K;@01!+>#CAK; z&E?M!N&hy8UoU)I%#qc@jt1Pjk1hC|HS8O^u}4Qu!x=a9CY1~c0@pvAquY6bFLXu& z^zV+dMu)#%z8148chiq6sGV5I23y`Bz4kN^MwBPwx?Q^|auu=|$M12Sl?^8eHkHS2 z>sZ|bD4M*N?o&2ti8b%r?MJ^7^4gJk7F4c+(fkZ3y(?HFGkiGb2HbXYV@m<%PcA zwEA>kzUNcV~LjLBET_*9VVfI%yq>%#ORhy z-C?IfM+|fHkQ6llUjU@W+S>Ylx{3gcX?!DpXjCn6H2S)muczAu@uxc(FzWI zirPgE98?5Itidco(lp{+GmE>B=OJeu7Cf%mj%umGkWK^GfI1g>~a5>8bD%UyY11cWy z=VJl;>Q3Lxi_F4)#vC=t2L(Q*|A*3RTZAV-cSjL>O!kvoWk*2A;@h9Tm$c-iovvd7 zeS4hp$ClNBgv-_R@UwL~2g;MkDkyiun+`sNKyMWc+25gACwYL6F6P%b{rQ!#*}bsd z9w1QW!G{~HtXmNNTgBs2jrpH44uRdf4<4=_v-q9kawd4k_=pr0KM$azKx&$pqo~?+ z&N{CD<{Ih_3EdMEEW0Pc4wI9T!=g?EiLbWapOMTsoEn3qYcOD3c6 z<3ds|h&psnc3)QJlRA%iV=s~(hCX+Y=)~*1WWODFkM8sT-5Bm@FeQ2SV3Xz>R+P{g z4MyJcyfwV1JM27Z7bI=%0TWxFgk+)Cr!^B_RFgpDK16yn?+1JaEK5Uas>8AcPGHk0GSPSPQ1yuA#Y_o9<@$vrV(p5&G&Gf8fCz7X!I zqjlzUoRS=`p^+4ENa)>BxiUe=%|DyKpWVN0WMiiG_`66T26B9KepUL%=MQeQ{x}KC zI7B(gZ4BXA)`0$M`8VZMiT6uP+{VV16{f-23IQzvia1L`q|5=%jvOdK2`Qk9UCsg3 z)%lzn!wRzvIG7&JK^shJ8`^?k2MUZ;JWkW^9#_`0c1_hW0F&32ALo+*#Ybr{tcOL@ zMk^9sAlcJ$LTLb|3u;%%4R|(Mv~U)a;A%dwpEt+&?(Vec~ifGqEUL3pa0)<|?APBw$q26n;;Lf!-Wch{rd*+~*^ zRAtg1rC=8*WxnEds*%{Fp8pIGs>Y0pF5< zA<|ybj<*)AECyWc7M_c}#|xnWb>9a)OkdH>F++)8WTmpSxBR^d6X z#|{`IbLd_zs-%#%htS&R45as$b&+GzP#h6H+BViC?>cco#vqH3y|fP{lpn6>GiNOD zCC$}`c{5=Oa9Ec0)EGh8)e=i)LsNiUQOMbX@E5YC!3d^LG3aXWG6;QfDaXu#syqx1 zKXvR`=;Ahy04a*@kiN2;r5HY1nT){GU~9y1L#m7j%8yg@dF!Ii9Z%Kc(aUWz*%WjsU z%#c0-s3sXF864j3?DMv61A5MR4MoN^6E4OBZHxrZe1XYA`mraaKOgnRk@lcBj-72# zZH#~fjSC`TxP3+FqHo4=cI2{;nz;|Ay<6anLk%Hp;lgP7t*g6tW<&!BssalD(li{) zND3zaQ%;+4z92#TPI6%hGvzvbjqUH8GyJ>N2E*UjMN z$r1`VPQ&&KcMFBq?32EqBC8+R<2BRdM0Bg9`~zD~II#M&NAC!+fD#d2D{cm3WILUv#@WEvT$=?$pR*RW1}+OGu! ziHnN4hB_MkJ3WC}(y9)O{ZI%W7(#kl?V_NSKD=p?mJtFarn-~o^!8XBsx%0%ur`BN z#^d1ybIYzO7-)sHjX+g9{jdLCe8pMdU{{mvO`pYp-6@ z)2ETupSZ~tQivhI6dS7d9nkn&X97tnpe+oNL<0>M3vO;qL!DRNh5a=|iPi6Za1mn! z9XtTzJdLpVw7h^D3Jua1 z;DAByWMF`D(uc*1_L4WQmJl@MZ97DTZPvsF|DRLTibA3C>~!O4Qf@quJAw8x@7zZb zg;7oiiF{C>5Yz|6#-B)^*bfEGLsttGs#txam$YBTzZ=kpciw?n&d<38lXO<99ck^$eN&f>a$qBL0NLB>`kxBe{G9q`aBF7=qY9-Z2n35AfzY`D!;g| zQQi_J`n1E<_3PE-shhUP359k99JCboC=||0(m)(sFL6f`zIdPug3;wE16*|m50Ooo zL-GO2s(_Hg;Nkj-nQ~j390OAiP~Kg(7F_)JY>w0rc~OLUeUfdNnJr9O&7{Jpw)r`B`RXF5~^*;$MI=M->DyjAoGT~!;|A)mf~j%p?? zyj!0@5Hj{Sx8>gJi9KstzYYa#Dda$7K=nt-(_LA`U? 
zWLxgM&jP71hVJDlxb>C)Y%){EWywhw@!^jf5kqAWiaujYnfbPOyEk#)^c=-`zAaEe zD%Z93Q{`%;eJA#kfZ`OpHUZ>|1W??g+?Jj8_%E{UjmvW%tV1^)sG$6KaD9TzwsN0z zPt!&+TerMV9KEg^cShPznw(?s6{M3ct+t1E=j|0v7)%7(b5d|GDXYbqgA4;sW* zOh3040tM)Aq$HosVfccO9_&i8g}-pTv=3xM4uMbHT;s2DV(}geg#@YX)PM#Fmc@j( zoobfy6V{DKhV{NqkLbn-@b3uQ#Er^rrL5gXY8+J1_9(;lZ8cHhRi0G5P*5aublC*MePRZJF!hR&D<>fy#uIDx!`>n9hB_ksY*!o^GIoVIu znO6{4yG|Jp`D;j@pN-4!4*i~HjX8>}0W$UNKbybHE`|M$F1;P$^a}6r0Ob_$5@=#O z*EG|=n4IPUSt8&QuF!fwm>|y8w(?Sz9tc$N*>%Imo1&ev7A}6jl|T38WO{rh_C>4! zX{@Vs%hK+Hz*2w&7XtzVZu>&^RC?VzH(CGyK}7&lm$CPJMnhGx^RJzwno2IsyM_E} zzp&NtZ&Ia>PXtOmrQ58+NuU2JtjJz40QC6GJ&u%N$G4k5r%Q%EGV8fanrixk^o8ZP z4qcH?prn|4)@`h+3|}Cma80!eH^|~dx670-R}2vUY%)a*PB(wZN0o>d)tTgF@7Mbb zU+zr}m<<*|VamB@-?fEak1{3ZDX*)6t)q&h!!F>Y(RHP{^^cr(uf$Y!oMmLD)-o=y z^sT6#6;n+r0E+n1ME%VADg|P5s3hAA9mu%J^y9E&7Abo%XT~YlBwbA`Nnc;q@~hlo zxu>a;aFs(QpR-KVB+s$(aG$GR!TI}U7!|h3jA%vn8fql)PdMZG?r(r^3Y!~N<#D#e z_&WyZjO_6`0y1|lNm8D-krSw6VP9vX0j)LlcD2i~nu^J**w=n|>@%2sm`d*1vxm-` zpOE4Xp7Ft|mO=VeQYNW_PT!W2Uikc1-(FmS+f0)2@8^vh_pEkN|Dekb^ed&6f(3p< zRznWgoxH7$`e>Y+x0E+7P$A0F*oXGj-9qndj*2`+<<;QE=}2O&CR3`p_vP`XD~XD8d) zL{Dqb!WZ}l;fb&6FqLaWVT#Aim&d-<-u55;qx^1Z<-WZk49NHq@C4y>GG8upF2V!I z!HBLfmh+73r%TtXGQ*95#KVs5SK6JBFhJn^{?Gml3y8v{yF{V_ncelW|6@6UhgP#b zb25y?!P2aMT$AX*BT&v|vH=)n$Awp7Ki#cKE28PK&`{98VfWFS_9N!)O<^Ua!Raxu zWCVaF!6>90Zv%M(NK@jXBWOWTLi9#a^4+Rhj(>q_qAMSO%!QrB((s4?gR5V9*X&Z! zM93$89G$KBAAcsbL39LcB^pT8aJQJv0xTr6eyuoxb3|2=2+L}0Neu)wE?WZIsI&1c zq1r>1i*GVA?lu5We%R4c;%&gTIWjyHewDl}PIYa1imlc#peYxxQBqQILQ#jCtBOLSW-!>h9-38Vm52VaMI zs$uh2>+4cC0Ize4%Jg(cUHl5Yy~PwSa-BB;yR!bKy{^?_WXGnN7+@_dVcyd^I#--m z(PZrc0|y5nhGNB~fxusdMNUi_Kk^M=tGc^VT&1{5JESEF?>8r~Zwc&>{kb&C9vdcq zL`hBzUW$AKKu|}vNjV|y>%O<43zMePROa6?!{XAmF%REd0Up+~{2Hh-xlbLzN>8nS zI!E+=l520vHMJPrY9At$K;d!3DfUJK2`zXO@hNdoyHujTN_t1{zDCFP_sxI-QUB8N zU*6IopGjbGP2hl|8qwGyI83UjCY}W}3$n6*t5?W&^5d!qK^%BPk#G0ZMJ-2Oq)?;i z+cyN`m**Y@s*QB8g|dphg=E9GdtVf%s4)z1aMqr7E*30_^Huq#*+`Co2P7TfjfRVm zJpkg+TOc8T=?|qfRUWfujl#WX9iu<{4etX3&Nk+`221BFxY&qMHpMHdP8Hrw0gZfe z+PL>xYgK7Gp#OB}N*DzP!YGD#0`o_?UE=>IzCdb%%kQ=aXIjE>MHSFi%)XleZ&|R| z;XLMS#*hta%t1H+XOPcgx8>?+(`N<=h0R9+^u1ry3bgT?p9BbtCh-8n`$HN$UBjhJ zHwn7NJ2<&(j zE^3uGXG8)3c5-#@Cw{ecOMkwba?V?3Ta25-*9KT?QL3obk<%8-n%z z=->raKh7l}uHi4`u`88k)wNz6PYbX7{Ptb?&lE?Flq4&SMu91nqlT)=#Jw{om&qiT z-Lw1uv6NOHq$<>P==-ga`!w!crFF6vIx_nuvitYVW{;gV1C=-5VE3dw+U+@ z2jgdcQ7mBW1fF4l*QQZ*^C=Y@SE&$0p2aEWHt?C%vxOQIB2U5PvVdp98e85ix4`fB zvVOb&WpByP=dkZ<9@=#Tt*=O@xSH14X)dJ{B-tXAbjQ}xufE>2Sy+B|QLVRll#oD? 
zgmW>}z=xo)pGK$1%I#^FB+A$4>4`wMTR&Wx2a4a3GSS=7MmSt9x8K0D7*=k9N517x zHl(iJ^sX8W8?<=0jUq4U&Ee7mjqyx5aQ-$u&!6cEJxQO}cko5fVp+1G&W;|#j7 zu_1p|u=leNJqH}r<`U`++L(^t4nCe=u3_F-Nv&+CZ_Kt-_jx)Urz1oK>i4M(j2mRg z%4EyTy0QL|$5t{)JT&}1$t>hi0Gg+#;Et~x!H z#Hd>P6oTBJn`9Pf!|DvUp{Vb~5X@dQ?l?>?r^?F0{qA$4V~CsmIv-~2xg%4}r- literal 0 HcmV?d00001 diff --git a/media/llama1-icon.svg b/media/llama1-icon.svg new file mode 100644 index 0000000000..dcbe9cce9b --- /dev/null +++ b/media/llama1-icon.svg @@ -0,0 +1,87 @@ + + + + + + + + + + + + + + + + + + From 7057faf64b514e991e2f70147f82bb13d544b1c0 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Mon, 8 Sep 2025 16:14:32 -0500 Subject: [PATCH 04/26] json : support `enum` values within `allOf` (#15830) --- common/json-schema-to-grammar.cpp | 22 ++++++++- examples/json_schema_to_grammar.py | 15 ++++++- tests/test-json-schema-to-grammar.cpp | 45 +++++++++++++++++++ .../public_legacy/json-schema-to-grammar.mjs | 15 ++++++- 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 637891f506..182c787544 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -843,9 +843,10 @@ public: _build_object_rule( properties, required, name, schema.contains("additionalProperties") ? schema["additionalProperties"] : json())); - } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) { + } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) { std::unordered_set required; std::vector> properties; + std::map enum_values; std::string hybrid_name = name; std::function add_component = [&](const json & comp_schema, bool is_required) { if (comp_schema.contains("$ref")) { @@ -857,6 +858,14 @@ public: required.insert(prop.key()); } } + } else if (comp_schema.contains("enum")) { + for (const auto & v : comp_schema["enum"]) { + const auto rule = _generate_constant_rule(v); + if (enum_values.find(rule) == enum_values.end()) { + enum_values[rule] = 0; + } + enum_values[rule] += 1; + } } else { // todo warning } @@ -870,6 +879,17 @@ public: add_component(t, true); } } + if (!enum_values.empty()) { + std::vector enum_intersection; + for (const auto & p : enum_values) { + if (p.second == schema["allOf"].size()) { + enum_intersection.push_back(p.first); + } + } + if (!enum_intersection.empty()) { + return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space"); + } + } return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { json items = schema.contains("items") ? 
schema["items"] : schema["prefixItems"]; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index ed37958554..2d57549046 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -586,9 +586,10 @@ class SchemaConverter: properties = list(schema.get('properties', {}).items()) return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) - elif schema_type in (None, 'object') and 'allOf' in schema: + elif schema_type in (None, 'object', 'string') and 'allOf' in schema: required = set() properties = [] + enum_sets = [] hybrid_name = name def add_component(comp_schema, is_required): if (ref := comp_schema.get('$ref')) is not None: @@ -600,6 +601,9 @@ class SchemaConverter: if is_required: required.add(prop_name) + if 'enum' in comp_schema: + enum_sets.append(set(comp_schema['enum'])) + for t in schema['allOf']: if 'anyOf' in t: for tt in t['anyOf']: @@ -607,6 +611,15 @@ class SchemaConverter: else: add_component(t, is_required=True) + if enum_sets: + enum_intersection = enum_sets[0] + for s in enum_sets[1:]: + enum_intersection &= s + + if enum_intersection: + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + return self._add_rule(rule_name, rule) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 78ee55e246..67df240c6f 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1209,6 +1209,51 @@ static void test_all(const std::string & lang, std::function { const ref = compSchema.$ref; if (ref !== undefined) { @@ -648,6 +649,10 @@ export class SchemaConverter { } } } + + if ('enum' in compSchema) { + enumSets.push(new Set(compSchema.enum || [])); + } }; for (const t of schema.allOf) { @@ -660,6 +665,14 @@ export class SchemaConverter { } } + if (enumSets.length > 0) { + const enumIntersection = new Set([...enumSets[0]].filter(v => enumSets.every(s => s.has(v)))); + if (enumIntersection.size > 0) { + const sortedEnums = [...enumIntersection].sort((a, b) => a.localeCompare(b)); + const rule = '(' + sortedEnums.map(v => this._generateConstantRule(v)).join(' | ') + ') space'; + return this._addRule(ruleName, rule); + } + } return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null)); } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) { const items = schema.items ?? 
schema.prefixItems; From acc1b008cfd95e63d5f99a370dbffb98e5a99d2c Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Tue, 9 Sep 2025 06:05:55 +0200 Subject: [PATCH 05/26] model-conversion : add extra debugging support for model conversion (#15877) * feat: Extra debugging support for model conversion - added BF16 support for llama-callback-eval and support for dumping intermediate steps in run-org-model.py --- examples/eval-callback/eval-callback.cpp | 11 ++ examples/model-conversion/requirements.txt | 9 +- .../scripts/causal/run-org-model.py | 149 ++++++++++++++++-- 3 files changed, 156 insertions(+), 13 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index d4ef751fbb..cefa39a57c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -28,6 +28,15 @@ static std::string ggml_ne_string(const ggml_tensor * t) { return str; } +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) { size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; float v; @@ -43,6 +52,8 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * v = (float) *(int16_t *) &data[i]; } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; + } else if (type == GGML_TYPE_BF16) { + v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]); } else { GGML_ABORT("fatal error"); } diff --git a/examples/model-conversion/requirements.txt b/examples/model-conversion/requirements.txt index b8148b269a..ac9f69e10b 100644 --- a/examples/model-conversion/requirements.txt +++ b/examples/model-conversion/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.6.0 -torchvision~=0.21.0 -transformers~=4.55.0 -huggingface-hub~=0.34.0 +torch +torchvision +transformers +huggingface-hub +accelerate diff --git a/examples/model-conversion/scripts/causal/run-org-model.py b/examples/model-conversion/scripts/causal/run-org-model.py index f6188ea6f3..78a54abf13 100755 --- a/examples/model-conversion/scripts/causal/run-org-model.py +++ b/examples/model-conversion/scripts/causal/run-org-model.py @@ -9,15 +9,134 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig import torch import numpy as np -unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') +### If you want to dump RoPE activations, apply this monkey patch to the model +### class from Transformers that you are running (replace apertus.modeling_apertus +### with the proper package and class for your model +### === START ROPE DEBUG === +# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb -parser = argparse.ArgumentParser(description='Process model with specified path') -parser.add_argument('--model-path', '-m', help='Path to the model') +# orig_rope = apply_rotary_pos_emb +# torch.set_printoptions(threshold=float('inf')) +# torch.set_printoptions(precision=6, sci_mode=False) + +# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): +# # log inputs +# summarize(q, "RoPE.q_in") +# summarize(k, "RoPE.k_in") + +# # call original +# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim) + +# # log outputs +# summarize(q_out, "RoPE.q_out") +# summarize(k_out, "RoPE.k_out") + +# return q_out, 
k_out + +# # Patch it +# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402 +# apertus_mod.apply_rotary_pos_emb = debug_rope +### == END ROPE DEBUG === + + +def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3): + """ + Print a tensor in llama.cpp debug style. + + Supports: + - 2D tensors (seq, hidden) + - 3D tensors (batch, seq, hidden) + - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head + + Shows first and last max_vals of each vector per sequence position. + """ + t = tensor.detach().to(torch.float32).cpu() + + # Determine dimensions + if t.ndim == 3: + _, s, _ = t.shape + elif t.ndim == 2: + _, s = 1, t.shape[0] + t = t.unsqueeze(0) + elif t.ndim == 4: + _, s, _, _ = t.shape + else: + print(f"Skipping tensor due to unsupported dimensions: {t.ndim}") + return + + ten_shape = t.shape + + print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}") + print(" [") + print(" [") + + # Determine indices for first and last sequences + first_indices = list(range(min(s, max_seq))) + last_indices = list(range(max(0, s - max_seq), s)) + + # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq + has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s) + + # Combine indices + if has_overlap: + # If there's overlap, just use the combined unique indices + indices = sorted(list(set(first_indices + last_indices))) + separator_index = None + else: + # If no overlap, we'll add a separator between first and last sequences + indices = first_indices + last_indices + separator_index = len(first_indices) + + for i, si in enumerate(indices): + # Add separator if needed + if separator_index is not None and i == separator_index: + print(" ...") + + # Extract appropriate slice + vec = t[0, si] + if vec.ndim == 2: # 4D case: flatten heads × dim_per_head + flat = vec.flatten().tolist() + else: # 2D or 3D case + flat = vec.tolist() + + # First and last slices + first = flat[:max_vals] + last = flat[-max_vals:] if len(flat) >= max_vals else flat + first_str = ", ".join(f"{v:12.4f}" for v in first) + last_str = ", ".join(f"{v:12.4f}" for v in last) + + print(f" [{first_str}, ..., {last_str}]") + + print(" ],") + print(" ]") + print(f" sum = {t.sum().item():.6f}\n") + + +def debug_hook(name): + def fn(_m, input, output): + if isinstance(input, torch.Tensor): + summarize(input, name + "_in") + elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor): + summarize(input[0], name + "_in") + if isinstance(output, torch.Tensor): + summarize(output, name + "_out") + elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor): + summarize(output[0], name + "_out") + + return fn + + +unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME") + +parser = argparse.ArgumentParser(description="Process model with specified path") +parser.add_argument("--model-path", "-m", help="Path to the model") args = parser.parse_args() -model_path = os.environ.get('MODEL_PATH', args.model_path) +model_path = os.environ.get("MODEL_PATH", args.model_path) if model_path is None: - parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + parser.error( + "Model path must be specified either via --model-path argument or MODEL_PATH environment variable" + ) config = AutoConfig.from_pretrained(model_path) @@ -34,18 +153,30 @@ config = AutoConfig.from_pretrained(model_path) if 
unreleased_model_name: model_name_lower = unreleased_model_name.lower() - unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + unreleased_module_path = ( + f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + ) class_name = f"{unreleased_model_name}ForCausalLM" print(f"Importing unreleased model module: {unreleased_module_path}") try: - model_class = getattr(importlib.import_module(unreleased_module_path), class_name) - model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained + model_class = getattr( + importlib.import_module(unreleased_module_path), class_name + ) + model = model_class.from_pretrained( + model_path + ) # Note: from_pretrained, not fromPretrained except (ImportError, AttributeError) as e: print(f"Failed to import or load model: {e}") exit(1) else: - model = AutoModelForCausalLM.from_pretrained(model_path) + model = AutoModelForCausalLM.from_pretrained( + model_path, device_map="auto", offload_folder="offload" + ) + +for name, module in model.named_modules(): + if len(list(module.children())) == 0: # only leaf modules + module.register_forward_hook(debug_hook(name)) model_name = os.path.basename(model_path) # Printing the Model class to allow for easier debugging. This can be useful From 70cd37dbbebdb7a2f84f08207f18eabb0b291a55 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 9 Sep 2025 06:06:52 +0200 Subject: [PATCH 06/26] requirements : update transformers/torch for Embedding Gemma (#15828) * requirements : update transformers/torch for Embedding Gemma This commit updates the requirements to support converting Embedding Gemma 300m models. The motivation for this change is that during development I had a local copy of the transformers package which is what I used for converting the models. This was a mistake on my part and I should have also updated my transformers version to the official release. I had checked the requirements/requirements-convert_legacy_llama.txt file and noted that the version was >=4.45.1,<5.0.0 and came to the conculusion that no updated would be needed, this assumed that Embedding Gemma would be in a transformers release at the time Commit fb15d649ed14ab447eeab911e0c9d21e35fb243e ("llama : add support for EmbeddingGemma 300m (#15798)) was merged. So anyone wanting to convert themselves would be able to do so. However, Embedding Gemma is a preview release and this commit updates the requirements to use this preview release. 
* resolve additional python dependencies * fix pyright errors in tokenizer test and remove unused import --- requirements/requirements-convert_hf_to_gguf.txt | 4 +++- requirements/requirements-convert_legacy_llama.txt | 11 ++++++++++- requirements/requirements-tool_bench.txt | 2 +- tests/test-tokenizer-random.py | 4 ++-- .../minicpmv-convert-image-encoder-to-gguf.py | 4 ++-- tools/mtmd/requirements.txt | 4 ++-- tools/server/tests/requirements.txt | 2 +- 7 files changed, 21 insertions(+), 10 deletions(-) diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 766745f42f..90c98c3ffe 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -2,7 +2,9 @@ mistral-common>=1.8.3 -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.4.0; platform_machine != "s390x" + +## Embedding Gemma requires PyTorch 2.6.0 or later +torch~=2.6.0; platform_machine != "s390x" # torch s390x packages can only be found from nightly builds --extra-index-url https://download.pytorch.org/whl/nightly diff --git a/requirements/requirements-convert_legacy_llama.txt b/requirements/requirements-convert_legacy_llama.txt index 859204b27e..f6076142ce 100644 --- a/requirements/requirements-convert_legacy_llama.txt +++ b/requirements/requirements-convert_legacy_llama.txt @@ -1,5 +1,14 @@ numpy~=1.26.4 sentencepiece~=0.2.0 -transformers>=4.45.1,<5.0.0 + +# Embedding Gemma is currently a preview release: +# https://github.com/huggingface/transformers/releases/tag/v4.56.0-Embedding-Gemma-preview + +# The version is needed to be able to convert Embedding Gemma models to GGUF format: +git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview + +# Once Embedding Gemma is officially released, we can switch to: +#transformers>=4.57.1,<5.0.0 + gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt index b94521fc7f..f7912aff72 100644 --- a/requirements/requirements-tool_bench.txt +++ b/requirements/requirements-tool_bench.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 pytest~=8.3.3 -huggingface_hub~=0.23.2 +huggingface_hub>=0.34.0,<1.0 matplotlib~=3.10.0 numpy~=1.26.4 openai~=1.55.3 diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index c6cdcb5548..93e697607e 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -421,10 +421,10 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl if text1 == text2: # equal to TokenizerGroundtruth? return True # equal to source text? 
- if tokenizer1.add_bos_token: # remove BOS + if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str): # remove BOS if text2.startswith(tokenizer1.bos_token): text2 = text2[len(tokenizer1.bos_token):] - if tokenizer1.add_eos_token: # remove EOS + if tokenizer1.add_eos_token and tokenizer1.eos_token and isinstance(tokenizer1.eos_token, str): # remove EOS if text2.endswith(tokenizer1.eos_token): text2 = text2[:-len(tokenizer1.eos_token)] return text == text2 diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index f34d858d67..bb2cc4e4ea 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -23,7 +23,6 @@ import warnings import numpy as np import torch import torch.nn.functional as F -import torch.utils.checkpoint from torch import nn from torch.nn.init import _calculate_fan_in_and_fan_out @@ -413,7 +412,8 @@ import re import numpy as np from gguf import * -from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer +from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig TEXT = "clip.text" VISION = "clip.vision" diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt index a9d788f265..0a1f4e8647 100644 --- a/tools/mtmd/requirements.txt +++ b/tools/mtmd/requirements.txt @@ -1,5 +1,5 @@ -r ../../requirements/requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu pillow~=11.3.0 -torch~=2.4.0 -torchvision~=0.19.1 +torch~=2.6.0 +torchvision~=0.21.0 diff --git a/tools/server/tests/requirements.txt b/tools/server/tests/requirements.txt index 15d024914e..4ea7f19f77 100644 --- a/tools/server/tests/requirements.txt +++ b/tools/server/tests/requirements.txt @@ -1,6 +1,6 @@ aiohttp~=3.9.3 pytest~=8.3.3 -huggingface_hub~=0.23.2 +huggingface_hub>=0.34.0,<1.0 numpy~=1.26.4 openai~=1.55.3 prometheus-client~=0.20.0 From c252ce67c4b99f056eeb18d42a289e28fee03475 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 08:42:10 +0300 Subject: [PATCH 07/26] contrib : add notes about merging PRs (#15881) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * contrib : add notes about merging PRs * Update CONTRIBUTING.md Co-authored-by: Diego Devesa * Update CONTRIBUTING.md Co-authored-by: Johannes Gäßler --------- Co-authored-by: Diego Devesa Co-authored-by: Johannes Gäßler --- CONTRIBUTING.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e68ff92445..6db74a0cf2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,9 @@ - Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` - Optionally pick a `` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules - Consider adding yourself to [CODEOWNERS](CODEOWNERS) +- Let authors, who are also collaborators, merge their own PRs +- When merging a PR by a contributor, make sure you have a good understanding of the changes +- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. 
If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you) # Coding guidelines From 550cf726e133fd0a069d991287fd3a2a3e3e1cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 9 Sep 2025 08:11:01 +0200 Subject: [PATCH 08/26] CUDA: fix GET_ROWS for large tensors (#15882) --- ggml/src/ggml-cuda/getrows.cu | 80 +++++++++++++++++---------------- ggml/src/ggml-cuda/ggml-cuda.cu | 4 -- 2 files changed, 41 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 83d02474f5..2fab33243d 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -2,39 +2,39 @@ #include "dequantize.cuh" #include "convert.cuh" -#define MAX_GRIDDIM_Y 65535 - template static __global__ void k_get_rows( const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { - // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. - const int i10 = blockIdx.x; - const int i11 = blockIdx.z / ne12; - const int i12 = blockIdx.z % ne12; + for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i10 = blockIdx.x; + const int i11 = z / ne12; // TODO fastdiv + const int i12 = z % ne12; - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03; + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03; - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 
1 : qk/2; - // dequantize - float2 v; - dequantize_kernel(src0_row, ib, iqs, v); + // dequantize + float2 v; + dequantize_kernel(src0_row, ib, iqs, v); - dst_row[iybs + iqs + 0] = ggml_cuda_cast(v.x); - dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); + dst_row[iybs + iqs + 0] = ggml_cuda_cast(v.x); + dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); + } } } @@ -42,27 +42,29 @@ template static __global__ void k_get_rows_float( const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst, const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/ - /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/ + /*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/ /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { - for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { - // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. - const int i10 = blockIdx.x; - const int i11 = blockIdx.z / ne12; - const int i12 = blockIdx.z % ne12; + for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { + for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { + // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. + const int i10 = blockIdx.x; + const int i11 = z / ne12; // TODO fastdiv + const int i12 = z % ne12; - if (i00 >= ne00) { - return; + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } } @@ -98,7 +100,7 @@ static void get_rows_cuda_q( cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12); + const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX)); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); @@ -116,7 +118,7 @@ static void get_rows_cuda_q( k_get_rows<<>>( src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); @@ -131,7 +133,7 @@ static void get_rows_cuda_float( cudaStream_t stream) { const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(ne10, MIN(block_num_y, MAX_GRIDDIM_Y), ne11*ne12); + const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX)); // strides in elements // const size_t s0 = nb0 / sizeof(dst_t); @@ -147,7 +149,7 @@ static void get_rows_cuda_float( k_get_rows_float<<>>( src0_d, 
src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ + /*ne10,*/ ne11, ne12, /*ne13,*/ /* s0,*/ s1, s2, s3, /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index efca2c775a..95170ae11b 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3393,10 +3393,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; case GGML_OP_GET_ROWS: { - // FIXME: https://github.com/ggml-org/llama.cpp/pull/15868 - if (op->src[1]->ne[1]*op->src[1]->ne[2] > 65535) { - return false; - } switch (op->src[0]->type) { case GGML_TYPE_F16: case GGML_TYPE_F32: From a972faebed5fdc4a3d2a844d92d476058c02e02d Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 9 Sep 2025 14:38:02 +0800 Subject: [PATCH 09/26] CUDA: Add mul_mat_id support for the mmf kernel (#15767) * CUDA: Add mul_mat_id support the mmf Add support for mul_mat_id for bs < 16 * Review: use warp_size, fix should_use_mmf condition * Launch one block per expert, stride along n_expert_used * templatize mul_mat_id * Pad shmem to 16 bytes, add helper function mul_mat_f_switch_ids * Reduce compile times by dividing mmf into f16, bf16 and f32 variants * Divide mmf by ncols_dst * Add missing files * Fix MUSA/HIP builds --- ggml/src/ggml-cuda/CMakeLists.txt | 2 + ggml/src/ggml-cuda/ggml-cuda.cu | 5 + ggml/src/ggml-cuda/mma.cuh | 1 + ggml/src/ggml-cuda/mmf.cu | 382 ++------------ ggml/src/ggml-cuda/mmf.cuh | 470 +++++++++++++++++- .../template-instances/generate_cu_files.py | 11 + .../mmf-instance-ncols_1.cu | 5 + .../mmf-instance-ncols_10.cu | 5 + .../mmf-instance-ncols_11.cu | 5 + .../mmf-instance-ncols_12.cu | 5 + .../mmf-instance-ncols_13.cu | 5 + .../mmf-instance-ncols_14.cu | 5 + .../mmf-instance-ncols_15.cu | 5 + .../mmf-instance-ncols_16.cu | 5 + .../mmf-instance-ncols_2.cu | 5 + .../mmf-instance-ncols_3.cu | 5 + .../mmf-instance-ncols_4.cu | 5 + .../mmf-instance-ncols_5.cu | 5 + .../mmf-instance-ncols_6.cu | 5 + .../mmf-instance-ncols_7.cu | 5 + .../mmf-instance-ncols_8.cu | 5 + .../mmf-instance-ncols_9.cu | 5 + tests/test-backend-ops.cpp | 2 +- 23 files changed, 603 insertions(+), 350 deletions(-) create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu create mode 100644 
ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu create mode 100644 ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index d3dfc7807d..0d8c5af473 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) file(GLOB SRCS "template-instances/mmq*.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "template-instances/mmf*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) if (GGML_CUDA_FA_ALL_QUANTS) file(GLOB SRCS "template-instances/fattn-vec*.cu") diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 95170ae11b..0f68d68534 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2109,6 +2109,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst); return; } + + if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) { + ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst); + return; + } } cudaStream_t stream = ctx.stream(); diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index 667deb9c65..c1f24243fe 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -1,3 +1,4 @@ +#pragma once // This file contains primitives that expose the tensor core PTX instructions for CUDA code. // The primitives can be used in a similar way as the nvcuda::wmma interface but with a well-defined memory layout. // The documentation for the PTX instructions can be found under: diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index cfa5c5cce2..16331e9ecf 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -1,343 +1,12 @@ #include "ggml.h" -#include "common.cuh" -#include "mma.cuh" #include "mmf.cuh" -using namespace ggml_cuda_mma; - -#define MMF_ROWS_PER_BLOCK 32 - -template -__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1) -static __global__ void mul_mat_f( - const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, - const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst, - const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - typedef tile<16, 8, T> tile_A; - typedef tile< 8, 8, T> tile_B; - typedef tile<16, 8, float> tile_C; - - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - constexpr int tile_k_padded = warp_size + 4; - constexpr int ntA = rows_per_block / tile_A::I; - constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I; - - const int row0 = blockIdx.x * rows_per_block; - const int channel_dst = blockIdx.y; - const int channel_x = channel_dst / channel_ratio; - const int channel_y = channel_dst; - const int sample_dst = blockIdx.z; - const int sample_x = sample_dst / sample_ratio; - const int sample_y = sample_dst; - - x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ; - y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; - dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; - - const float2 * y2 = 
(const float2 *) y; - - extern __shared__ char data_mmv[]; - - tile_C C[ntA][ntB]; - - T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded); - - for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) { - tile_A A[ntA][warp_size / tile_A::J]; -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { -#pragma unroll - for (int i = 0; i < tile_A::I; ++i) { - tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col]; - } -#pragma unroll - for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { - load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); - } - } - -#pragma unroll - for (int itB = 0; itB < ntB; ++itB) { - if constexpr (std::is_same_v) { -#pragma unroll - for (int j0 = 0; j0 < tile_B::I; ++j0) { - const int j = j0 + itB*tile_B::I; - - tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f; - } - } else if constexpr (std::is_same_v || std::is_same_v) { -#pragma unroll - for (int j0 = 0; j0 < tile_B::I; ++j0) { - const int j = j0 + itB*tile_B::I; - - const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); - tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; - } - } else { - static_assert(std::is_same_v, "unsupported type"); - } -#pragma unroll - for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { - tile_B B; - load_ldmatrix(B, tile_xy + k0, tile_k_padded); -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { - mma(C[itA][itB], A[itA][k0/tile_B::J], B); - } - } - } - } - - float * buf_iw = (float *) data_mmv; - constexpr int kiw = nwarps*rows_per_block + 4; - - if (nwarps > 1) { - __syncthreads(); - } -#pragma unroll - for (int itB = 0; itB < ntB; ++itB) { -#pragma unroll - for (int itA = 0; itA < ntA; ++itA) { -#pragma unroll - for (int l = 0; l < tile_C::ne; ++l) { - const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l); - const int j = itB*tile_C::J + tile_C::get_j(l); - buf_iw[j*kiw + i] = C[itA][itB].x[l]; - } - } - } - - if (nwarps > 1) { - __syncthreads(); - } - -#pragma unroll - for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) { - const int j = j0 + threadIdx.y; - - if (j0 + nwarps > cols_per_block && j >= cols_per_block) { - return; - } - - float sum = 0.0f; - static_assert(rows_per_block == warp_size, "need loop/check"); -#pragma unroll - for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) { - const int i = i0 + threadIdx.x; - - sum += buf_iw[j*kiw + i]; - } - dst[j*stride_col_dst + row0 + threadIdx.x] = sum; - } -#else - GGML_UNUSED_VARS(x, y, ids, dst, - ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - NO_DEVICE_CODE; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) -} - -template -static void mul_mat_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols_x, const int64_t nrows_x, - const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, - const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, - const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { - typedef tile<16, 8, T> tile_A; - typedef 
tile< 8, 8, T> tile_B; - - GGML_ASSERT(!ids && "mul_mat_id not implemented"); - - GGML_ASSERT(ncols_x % 2 == 0); - GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(stride_col_y % 2 == 0); - GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); - GGML_ASSERT( nsamples_dst % nsamples_x == 0); - const int64_t channel_ratio = nchannels_dst / nchannels_x; - const int64_t sample_ratio = nsamples_dst / nsamples_x; - - const int device = ggml_cuda_get_device(); - const int warp_size = ggml_cuda_info().devices[device].warp_size; - - int64_t nwarps_best = 1; - int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2); - int64_t max_block_size = 256; - for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) { - const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2); - if (niter < niter_best) { - niter_best = niter; - nwarps_best = nwarps; - } - } - - constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; - const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4; - const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4; - const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); - const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst); - const dim3 block_dims(warp_size, nwarps_best, 1); - switch (nwarps_best) { - case 1: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 2: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 3: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 4: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 5: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 6: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 7: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - case 8: { - mul_mat_f<<>> - (x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst, - channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); - } break; - default: { - GGML_ABORT("fatal error"); - } break; - } 
-} - -template -static void mul_mat_f_switch_cols_per_block( - const T * x, const float * y, const int32_t * ids, float * dst, - const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst, - const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, - const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, - const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { - switch (ncols_dst) { - case 1: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 2: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 3: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 4: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 5: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 6: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 7: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 8: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 9: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 10: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, 
nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 11: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 12: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 13: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 14: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 15: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case 16: { - mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - default: { - GGML_ABORT("fatal error"); - } break; - } -} - void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_TENSOR_BINARY_OP_LOCALS; const size_t ts_src0 = ggml_type_size(src0->type); @@ -365,55 +34,72 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0; + const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: const int64_t ncols_dst = ids ? ne2 : ne1; - const int64_t nchannels_y = ids ? ne11 : ne12; - const int64_t nchannels_dst = ids ? ne1 : ne2; - const int64_t stride_channel_dst = ids ? s1 : s2; - const int64_t stride_channel_y = ids ? s11 : s12; + const int64_t nchannels_dst = ids ? ne1 : ne2; - GGML_ASSERT(!ids || ncols_dst == 1); + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + + int64_t stride_channel_y = ids ? s11 : s12; + int64_t nchannels_y = ids ? 
ne11 : ne12; + + //mul_mat_id: handle broadcast + if (ids && nchannels_y == 1) { + stride_channel_y = 0; + nchannels_y = ids->ne[0]; + } switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; constexpr int vals_per_T = 1; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; case GGML_TYPE_F16: { const half2 * src0_d = (const half2 *) src0->data; constexpr int vals_per_T = 2; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data; constexpr int vals_per_T = 2; mul_mat_f_switch_cols_per_block( - src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1, - ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, - ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); + src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst, + ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst, + ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); } } -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) { +bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) { + + if (ggml_is_quantized(type)) { + return false; + } + if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) { return false; } if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) { return false; } - if (ne11 > 16) { + if (src1_ncols > 16) { return false; } + switch (type) { case GGML_TYPE_F32: return ampere_mma_available(cc); diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index 785f9f211c..bf724bc57b 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -1,5 +1,473 @@ +#pragma once + +#include "mma.cuh" #include "common.cuh" +using namespace ggml_cuda_mma; + +#define MMF_ROWS_PER_BLOCK 32 + void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11); +bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols); + +template 
+__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1) +static __global__ void mul_mat_f( + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst, + const int stride_col_id, const int stride_row_id, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) + typedef tile<16, 8, T> tile_A; + typedef tile< 8, 8, T> tile_B; + typedef tile<16, 8, float> tile_C; + + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + constexpr int tile_k_padded = warp_size + 4; + constexpr int ntA = rows_per_block / tile_A::I; + constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I; + + const int row0 = blockIdx.x * rows_per_block; + + const int expert_idx = has_ids ? blockIdx.y : 0; + const int channel_dst = has_ids ? 0 : blockIdx.y; + + const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio); + const int channel_y = channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + + x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ; + y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y); + dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst); + + const float2 * y2 = (const float2 *) y; + + extern __shared__ char data_mmv[]; + + char * shmem_base = data_mmv; + int * slot_map = (int *) shmem_base; + char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base; + + tile_C C[ntA][ntB]; + + T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded); + + if constexpr (has_ids) { + __shared__ int has_any; + if (threadIdx.y == 0) { + int local_has_any = 0; + for (int j = threadIdx.x; j < cols_per_block; j += warp_size) { + int slot = -1; + for (int k = 0; k < nchannels_dst; ++k) { + const int idv = ids[j*stride_row_id + k*stride_col_id]; + if (idv == expert_idx) { + slot = k; + break; + } + } + if (j < cols_per_block) { + local_has_any |= (slot >= 0); + slot_map[j] = slot; + } + } + has_any = warp_reduce_any(local_has_any); + } + __syncthreads(); + if (has_any == 0) { + return; + } + } + + for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) { + tile_A A[ntA][warp_size / tile_A::J]; +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { +#pragma unroll + for (int i = 0; i < tile_A::I; ++i) { + tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col]; + } +#pragma unroll + for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) { + load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded); + } + } + +#pragma unroll + for (int itB = 0; itB < ntB; ++itB) { + if constexpr (std::is_same_v) { +#pragma unroll + for (int j0 = 0; j0 < tile_B::I; ++j0) { + const int j = j0 + itB*tile_B::I; + + if constexpr (!has_ids) { + tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? 
y[j*stride_col_y + col] : 0.0f; + } else { + float val = 0.0f; + if (j < cols_per_block) { + const int slot = slot_map[j]; + if (slot >= 0) { + val = y[slot*stride_channel_y + j*stride_col_y + col]; + } + } + tile_xy[j0*tile_k_padded + threadIdx.x] = val; + } + } + } else if constexpr (std::is_same_v || std::is_same_v) { +#pragma unroll + for (int j0 = 0; j0 < tile_B::I; ++j0) { + const int j = j0 + itB*tile_B::I; + + if constexpr (!has_ids) { + const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f); + tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + } else { + float2 tmp = make_float2(0.0f, 0.0f); + if (j < cols_per_block) { + const int slot = slot_map[j]; + if (slot >= 0) { + const float2 * y2_slot = (const float2 *)(y + slot*stride_channel_y); + tmp = y2_slot[j*stride_col_y + col]; + } + } + tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y}; + } + } + } else { + static_assert(std::is_same_v, "unsupported type"); + } +#pragma unroll + for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) { + tile_B B; + load_ldmatrix(B, tile_xy + k0, tile_k_padded); +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { + mma(C[itA][itB], A[itA][k0/tile_B::J], B); + } + } + } + } + + float * buf_iw = (float *) compute_base; + constexpr int kiw = nwarps*rows_per_block + 4; + + if (nwarps > 1) { + __syncthreads(); + } +#pragma unroll + for (int itB = 0; itB < ntB; ++itB) { +#pragma unroll + for (int itA = 0; itA < ntA; ++itA) { +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l); + const int j = itB*tile_C::J + tile_C::get_j(l); + buf_iw[j*kiw + i] = C[itA][itB].x[l]; + } + } + } + + if (nwarps > 1) { + __syncthreads(); + } + +#pragma unroll + for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (j0 + nwarps > cols_per_block && j >= cols_per_block) { + return; + } + + float sum = 0.0f; + static_assert(rows_per_block == warp_size, "need loop/check"); +#pragma unroll + for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) { + const int i = i0 + threadIdx.x; + + sum += buf_iw[j*kiw + i]; + } + + if constexpr (!has_ids) { + dst[j*stride_col_dst + row0 + threadIdx.x] = sum; + } else { + const int slot = (j < cols_per_block) ? 
slot_map[j] : -1; + if (slot >= 0) { + dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum; + } + } + } +#else + GGML_UNUSED_VARS(x, y, ids, dst, + ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + NO_DEVICE_CODE; +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +} + +template +static inline void mul_mat_f_switch_ids( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols_x, const int64_t nchannels_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int64_t stride_row_id, + const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, + const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) { + if (ids) { + mul_mat_f<<>> + (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + } else { + mul_mat_f<<>> + (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + } +} + +template +void mul_mat_f_cuda( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int64_t stride_row_id, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + typedef tile<16, 8, T> tile_A; + typedef tile< 8, 8, T> tile_B; + + GGML_ASSERT(ncols_x % 2 == 0); + GGML_ASSERT(stride_row % 2 == 0); + GGML_ASSERT(stride_col_y % 2 == 0); + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = nsamples_dst / nsamples_x; + + const int device = ggml_cuda_get_device(); + const int warp_size = ggml_cuda_info().devices[device].warp_size; + + int64_t nwarps_best = 1; + int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2); + int64_t max_block_size = 256; + for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) { + const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2); + if (niter < niter_best) { + niter_best = niter; + nwarps_best = nwarps; + } + } + + constexpr int rows_per_block = MMF_ROWS_PER_BLOCK; + const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4; + const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4; + const int 
nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine); + const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0; + const int nbytes_shared_total = nbytes_shared + nbytes_slotmap; + const int64_t grid_y = ids ? nchannels_x : nchannels_dst; // per expert when ids present + + const dim3 block_nums(nrows_x/rows_per_block, grid_y, nsamples_dst); + const dim3 block_dims(warp_size, nwarps_best, 1); + + switch (nwarps_best) { + case 1: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 2: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 3: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 4: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 5: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 6: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 7: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + case 8: { + mul_mat_f_switch_ids( + x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); + } break; + default: { + GGML_ABORT("fatal error"); + } break; + } + + GGML_UNUSED_VARS(nchannels_y); +} + +template +static void mul_mat_f_switch_cols_per_block( + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t 
ncols_x, const int64_t nrows_x, const int64_t ncols_dst, + const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, + const int64_t stride_col_id, const int stride_row_id, + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + cudaStream_t stream) { + switch (ncols_dst) { + case 1: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 2: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 3: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 4: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 5: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 6: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 7: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 8: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 9: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, 
stride_sample_y, stride_sample_dst, stream); + } break; + case 10: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 11: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 12: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 13: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 14: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 15: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + case 16: { + mul_mat_f_cuda(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, + stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + } break; + default: { + GGML_ABORT("fatal error"); + } break; + } +} + +#define DECL_MMF_CASE_HELPER(T, ncols_dst) \ + template void mul_mat_f_cuda( \ + const T * x, const float * y, const int32_t * ids, float * dst, \ + const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \ + const int64_t stride_col_id, const int64_t stride_row_id, \ + const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \ + const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\ + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, \ + cudaStream_t stream); + +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) +#define DECL_MMF_CASE_EXTERN(ncols_dst) \ + extern DECL_MMF_CASE_HELPER(float, ncols_dst) \ + extern DECL_MMF_CASE_HELPER(half2, ncols_dst) \ + extern DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst) + +#define DECL_MMF_CASE(ncols_dst) \ + DECL_MMF_CASE_HELPER(float, ncols_dst) \ + DECL_MMF_CASE_HELPER(half2, 
ncols_dst) \ + DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst) + +DECL_MMF_CASE_EXTERN(1); +DECL_MMF_CASE_EXTERN(2); +DECL_MMF_CASE_EXTERN(3); +DECL_MMF_CASE_EXTERN(4); +DECL_MMF_CASE_EXTERN(5); +DECL_MMF_CASE_EXTERN(6); +DECL_MMF_CASE_EXTERN(7); +DECL_MMF_CASE_EXTERN(8); +DECL_MMF_CASE_EXTERN(9); +DECL_MMF_CASE_EXTERN(10); +DECL_MMF_CASE_EXTERN(11); +DECL_MMF_CASE_EXTERN(12); +DECL_MMF_CASE_EXTERN(13); +DECL_MMF_CASE_EXTERN(14); +DECL_MMF_CASE_EXTERN(15); +DECL_MMF_CASE_EXTERN(16); +#else +#define DECL_MMF_CASE(ncols_dst) +#endif diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index a92a9e52cf..da2d7b7c3b 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -34,6 +34,13 @@ SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do DECL_MMQ_CASE({type}); """ +SOURCE_MMF = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE({type}); +""" + def get_short_name(long_quant_name): return long_quant_name.replace("GGML_TYPE_", "").lower() @@ -76,3 +83,7 @@ for ncols in [8, 16, 32, 64]: for type in TYPES_MMQ: with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f: f.write(SOURCE_MMQ.format(type=type)) + +for type in range(1, 17): + with open(f"mmf-instance-ncols_{type}.cu", "w") as f: + f.write(SOURCE_MMF.format(type=type)) diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu new file mode 100644 index 0000000000..f594d5d51d --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(1); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu new file mode 100644 index 0000000000..9cc6772542 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(10); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu new file mode 100644 index 0000000000..317f487d7a --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(11); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu new file mode 100644 index 0000000000..dc0033227c --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmf.cuh" + +DECL_MMF_CASE(12); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu new file mode 100644 index 0000000000..0782101753 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(13); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu new file mode 100644 index 0000000000..a23ad6ae26 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(14); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu new file mode 100644 index 0000000000..0fe3f7821e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(15); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu new file mode 100644 index 0000000000..544086375e --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(16); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu new file mode 100644 index 0000000000..3b901797cf --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(2); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu new file mode 100644 index 0000000000..56e940bba0 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(3); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu new file mode 100644 index 0000000000..a7665d49d0 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(4); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu new file mode 100644 index 0000000000..3a1dff2587 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmf.cuh" + +DECL_MMF_CASE(5); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu new file mode 100644 index 0000000000..400fb7c663 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(6); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu new file mode 100644 index 0000000000..954a1c7e03 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(7); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu new file mode 100644 index 0000000000..f1bd09c945 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(8); diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu new file mode 100644 index 0000000000..1255ac2af6 --- /dev/null +++ b/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmf.cuh" + +DECL_MMF_CASE(9); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index cca86a8ce8..a4776ee7f1 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6261,7 +6261,7 @@ static std::vector> make_test_cases_eval() { for (int n_mats : {4, 8}) { for (int n_used : {1, 2, 4}) { for (bool b : {false, true}) { - for (int n : {1, 32, 129}) { + for (int n : {1, 4, 5, 32, 129}) { int m = 512; int k = 256; test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); From ed54e32558ffe0f2c3b1d31f3227211fae05bd49 Mon Sep 17 00:00:00 2001 From: lksj92hs <134250687+lksj92hs@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:01:15 +0300 Subject: [PATCH 10/26] Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) (#15886) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f0fa9e668c..6f130d47f2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -3736,6 +3736,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->subgroup_arithmetic = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic); +#ifdef __APPLE__ + // Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) + if (device->vendor_id == VK_VENDOR_ID_AMD) { + device->subgroup_arithmetic = false; + } +#endif device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle); device->subgroup_clustered = (vk11_props.subgroupSupportedStages & 
vk::ShaderStageFlagBits::eCompute) && From 17bc5a815f0bebc1844c99796760cd7df5e9b8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 9 Sep 2025 14:04:43 +0200 Subject: [PATCH 11/26] HIP: use v_dot2_f32_f16 instruction for FA (#15884) --- ggml/src/ggml-cuda/common.cuh | 25 +++++++++++++++++++++++++ ggml/src/ggml-cuda/fattn-tile.cu | 7 +------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 931524a200..394595be0e 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -545,6 +545,31 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #endif // defined(GGML_USE_HIP) } +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) { + acc += v*u; +} + +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) { + acc += v.x*u.x; + acc += v.y*u.y; +} + +static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) { +#if defined(GGML_USE_HIP) && defined(GCN) + asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u)); +#else +#ifdef FAST_FP16_AVAILABLE + const float2 tmp = __half22float2(v*u); + acc += tmp.x + tmp.y; +#else + const float2 tmpv = __half22float2(v); + const float2 tmpu = __half22float2(u); + acc += tmpv.x * tmpu.x; + acc += tmpv.y * tmpu.y; +#endif // FAST_FP16_AVAILABLE +#endif // defined(GGML_USE_HIP) && defined(GCN) +} + static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { #if CUDART_VERSION >= 12080 const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x); diff --git a/ggml/src/ggml-cuda/fattn-tile.cu b/ggml/src/ggml-cuda/fattn-tile.cu index fb2163acd0..64f7d4a1a1 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cu +++ b/ggml/src/ggml-cuda/fattn-tile.cu @@ -304,12 +304,7 @@ static __global__ void flash_attn_tile( for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) { #pragma unroll for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) { -#ifdef FAST_FP16_AVAILABLE - const float2 tmp = __half22float2(K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]); - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += tmp.x + tmp.y; -#else - sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += K_k[i_KQ_0/warp_size] * Q_k[j_KQ_0/nwarps]; -#endif // FAST_FP16_AVAILABLE + ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size], Q_k[j_KQ_0/nwarps]); } } } From 4f63cd705c7b6f457f36c63fdc053e07f6f3cc6b Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 9 Sep 2025 07:41:15 -0500 Subject: [PATCH 12/26] vulkan: Fix OOB accesses in soft_max_back (#15861) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 4 ++-- ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp | 4 ++++ tests/test-backend-ops.cpp | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 6f130d47f2..c0c2239c2d 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -3387,7 +3387,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, 
sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); - ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -9145,7 +9145,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], op_params[1] }, dryrun); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun); } static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool backprop, bool dryrun = false) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp index 29bd77d7e1..144ea58e6f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp @@ -20,6 +20,10 @@ void main() { const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + if (row >= p.KY) { + return; + } + FLOAT_TYPE scale = p.param1; // partial sums for thread in warp diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index a4776ee7f1..adf91ab6f9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6391,6 +6391,7 @@ static std::vector> make_test_cases_eval() { for (int64_t ne1 : {16, 1024}) { test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 1, 1}, scale, max_bias)); test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, scale, max_bias)); + test_cases.emplace_back(new test_soft_max_back(GGML_TYPE_F32, {ne0, ne1, 2, 3}, scale, max_bias)); } } } From ae355f6f7108540297d8b7f7ae71d20fe610a0b7 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 9 Sep 2025 22:26:03 +0200 Subject: [PATCH 13/26] vulkan: throw the oom error instead of no memory type found (#15905) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 
c0c2239c2d..cb379fe94e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1937,7 +1937,9 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties(); - for (auto &req_flags : req_flags_list) { + for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) { + const auto & req_flags = *it; + uint32_t memory_type_index = find_properties(&mem_props, &mem_req, req_flags); if (memory_type_index == UINT32_MAX) { @@ -1950,6 +1952,11 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std break; } catch (const vk::SystemError& e) { // loop and retry + // during last attempt throw the exception + if (it + 1 == req_flags_list.end()) { + device->device.destroyBuffer(buf->buffer); + throw e; + } } } From ff02caf9eed261423289d1531a56536fbf57bfc2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Sep 2025 05:23:19 +0200 Subject: [PATCH 14/26] ci : cache ROCm installation in windows-latest-cmake-hip (#15887) This commit adds caching of the ROCm installation for the windows-latest-cmake-hip job. The motivation for this is that the installation can sometimes hang and/or not complete properly leaving an invalid installation which later fails the build. By caching the installation hopefully we can keep a good installation available in the cache and avoid the installation step. Refs: https://github.com/ggml-org/llama.cpp/pull/15365 --- .github/workflows/build.yml | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 43553ac13b..1f226d6ea8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1063,7 +1063,17 @@ jobs: run: | git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 - - name: Install + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-6.1-${{ runner.os }}-v1 + restore-keys: | + rocm-6.1-${{ runner.os }}- + + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' id: depends run: | $ErrorActionPreference = "Stop" @@ -1071,13 +1081,28 @@ jobs: Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru - $proc.WaitForExit(600000) + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. 
Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } write-host "Completed AMD HIP SDK installation" - name: Verify ROCm id: verify run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version - name: Install ccache uses: ggml-org/ccache-action@v1.2.16 From 86587da03bd78df8f4e7d8b111a0c1d2494d6ed0 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Sep 2025 05:33:58 +0200 Subject: [PATCH 15/26] llama : check returned fn ptrs from ggml_backend_reg_get_proc_address (#15893) This commit adds check for two function pointers returned from ggml_backend_reg_get_proc_address. The motivation for this is that the function pointer could be nullptr if the get proc address function changes in the future. This is also consistent with all the other calls to ggml_backend_reg_get_proc_address in the code base. --- src/llama-context.cpp | 4 +++- src/llama.cpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 874c6f82cb..3e163001c1 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute( if (backend_cpu != nullptr) { auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu)); auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); - set_threadpool_fn(backend_cpu, tp); + if (set_threadpool_fn) { + set_threadpool_fn(backend_cpu, tp); + } } // set the number of threads for all the backends diff --git a/src/llama.cpp b/src/llama.cpp index f0d4f5f891..92cddccc99 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -83,7 +83,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) { GGML_ASSERT(dev && "CPU backend is not loaded"); auto * reg = ggml_backend_dev_backend_reg(dev); auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); - numa_init_fn(numa); + if (numa_init_fn) { + numa_init_fn(numa); + } } } From 28b5f190ef1dbea5edf82dbc8b4407b721fadd13 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 10 Sep 2025 15:29:12 +0800 Subject: [PATCH 16/26] CANN: implement LRU cache for ACL graphs (#15814) * CANN: implement LRU cache for ACL graphs in CANN backend - Introduce ggml_cann_graph_lru_cache to store multiple ggml_cann_graph objects. - Graphs are loaded on demand and evicted using LRU policy when capacity is exceeded. - Updated push, move_to_front, and clear methods to manage cached graphs efficiently. - Ensures reuse of graphs, reducing graph reconstruction overhead in CANN backend. 
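As a rough illustration of the caching policy described above (not part of the patch — `graph` and `graph_lru_cache` below are placeholder names; the real implementation is `ggml_cann_graph_lru_cache` in ggml/src/ggml-cann/common.h further down in this diff), a minimal stand-alone sketch of the push/evict/promote behaviour looks like this:

```cpp
// Minimal LRU sketch: front of the list = most recently used graph.
#include <cstdio>
#include <list>

struct graph { int id; };          // stand-in for ggml_cann_graph

struct graph_lru_cache {
    size_t capacity = 12;          // default; the patch reads this from an env variable
    std::list<graph*> cache_list;

    // insert a new graph; evict the least recently used one when full
    void push(graph * g) {
        if (cache_list.size() >= capacity) {
            delete cache_list.back();
            cache_list.pop_back();
        }
        cache_list.push_front(g);
    }

    // a cache hit promotes the graph to the front
    void move_to_front(graph * g) {
        cache_list.remove(g);
        cache_list.push_front(g);
    }

    ~graph_lru_cache() {
        for (auto * g : cache_list) {
            delete g;
        }
    }
};

int main() {
    graph_lru_cache cache;
    cache.capacity = 2;
    auto * a = new graph{1};
    cache.push(a);
    cache.push(new graph{2});
    cache.move_to_front(a);        // reuse of a cached graph
    cache.push(new graph{3});      // evicts graph 2, the least recently used
    printf("cached graphs: %zu\n", cache.cache_list.size());
    return 0;
}
```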
* fix typo * The LRU cache capacity can be configured via an env variable Signed-off-by: noemotiovon <757486878@qq.com> * refactory acl graph * refactory && fix review comments Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com> --- docs/backend/CANN.md | 4 + ggml/src/ggml-cann/common.h | 64 +++++++++++++- ggml/src/ggml-cann/ggml-cann.cpp | 147 ++++++++++++++++++++----------- 3 files changed, 164 insertions(+), 51 deletions(-) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 357253f43a..35b189bb95 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -314,3 +314,7 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable ### GGML_CANN_ACL_GRAPH Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default. + +### GGML_CANN_GRAPH_CACHE_CAPACITY + +Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted. diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index e295f4ab47..17d7dbc75c 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -38,6 +38,7 @@ #include #include #include +#include #include "../include/ggml-cann.h" #include "../include/ggml.h" @@ -106,6 +107,7 @@ int32_t ggml_cann_get_device(); std::optional get_env(const std::string& name); bool parse_bool(const std::string& value); +int parse_integer(const std::string& value); /** * @brief Abstract base class for memory pools used by CANN. @@ -350,7 +352,7 @@ struct ggml_graph_node_properties { struct ggml_cann_graph { ~ggml_cann_graph() { if (graph != nullptr) { - aclmdlRIDestroy(graph); + ACL_CHECK(aclmdlRIDestroy(graph)); } } @@ -358,6 +360,64 @@ struct ggml_cann_graph { std::vector ggml_graph_properties; }; + +/** + * @brief LRU cache for managing ggml_cann_graph objects. + * + * This class maintains a list of shared_ptr to ggml_cann_graph objects + * and enforces a maximum capacity. It provides methods to push new graphs, + * move existing graphs to the front (most recently used), and clear the cache. + */ +struct ggml_cann_graph_lru_cache { + size_t capacity; /**< Maximum number of graphs in the cache. */ + + std::list cache_list; /**< List storing cached graphs as raw pointers. */ + + ggml_cann_graph_lru_cache() { + capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); + } + + /** + * @brief Push a new graph to the front of the cache. + * If the cache exceeds capacity, the least recently used graph is deleted. + * @param new_node Pointer to the new ggml_cann_graph to cache. + * Ownership is transferred to the cache (cache will delete it). + */ + void push(ggml_cann_graph* new_node) { + if (cache_list.size() >= capacity) { + ggml_cann_graph* old = cache_list.back(); + cache_list.pop_back(); + delete old; // free the old graph + } + cache_list.push_front(new_node); + } + + /** + * @brief Move an existing graph to the front of the cache. + * @param node Pointer to the ggml_cann_graph to move. + */ + void move_to_front(ggml_cann_graph* node) { + cache_list.remove(node); + cache_list.push_front(node); + } + + /** + * @brief Clear all graphs from the cache (also frees memory). + */ + void clear() { + for (auto ptr : cache_list) { + delete ptr; + } + cache_list.clear(); + } + + /** + * @brief Destructor that clears the cache and frees all cached graphs. 
+ */ + ~ggml_cann_graph_lru_cache() { + clear(); + } +}; #endif // USE_ACL_GRAPH struct ggml_cann_rope_cache { @@ -394,7 +454,7 @@ struct ggml_backend_cann_context { aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. - std::unique_ptr cann_graph; + ggml_cann_graph_lru_cache graph_lru_cache; bool acl_graph_mode = true; #endif cann_task_queue task_queue; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 2e47ad90af..aa5913a377 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -116,6 +116,24 @@ bool parse_bool(const std::string& value) { return valid_values.find(value) != valid_values.end(); } +/** + * @brief Parse a string as an integer, returning 0 if invalid. + * + * This function attempts to convert the input string `value` to an `int`. + * If the string is not a valid integer or is out of the `int` range, + * it returns 0. + * + * @param value The string to parse. + * @return The parsed integer, or 0 if conversion fails. + */ +int parse_integer(const std::string& value) { + try { + return std::stoi(value); + } catch (...) { + return 0; + } +} + /** * @brief Initialize the CANN device information. * @@ -2131,30 +2149,52 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) { #ifdef USE_ACL_GRAPH /** - * @brief Populate the internal CANN graph node properties from the ggml computation graph. + * @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph. * - * This function copies all node attributes (operation type, dimensions, strides, input sources, - * and operation parameters) into the cached CANN graph structure for later reuse or comparison. + * This function creates a new ggml_cann_graph object and fills its node properties + * (operation type, dimensions, strides, input sources, and operation parameters) + * based on the current ggml computation graph. * - * @param cann_ctx The CANN backend context. - * @param cgraph The ggml computational graph. + * Each node in the ggml graph is mapped to a property entry in the new CANN graph: + * - node address + * - operation type + * - shape (ne) and strides (nb) + * - source tensor addresses + * - operation parameters + * + * After initialization, the new graph is pushed into the LRU cache owned by the + * CANN backend context. The cache takes ownership of the graph and manages its + * lifetime (including deletion upon eviction). + * + * @param cann_ctx The CANN backend context containing the graph cache. + * @param cgraph The current ggml computation graph. */ -static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) { - ggml_tensor * node = cgraph->nodes[node_idx]; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op; +static void add_lru_matched_graph_node_properties( + ggml_backend_cann_context * cann_ctx, + ggml_cgraph * cgraph) { + // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache). 
+ ggml_cann_graph * new_graph = new ggml_cann_graph(); + new_graph->ggml_graph_properties.resize(cgraph->n_nodes); - for (int dim = 0; dim < GGML_MAX_DIMS; dim++) { - cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim]; - cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim]; + for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) { + ggml_tensor * node = cgraph->nodes[node_idx]; + auto & prop = new_graph->ggml_graph_properties[node_idx]; + + prop.node_address = node->data; + prop.node_op = node->op; + + std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne); + std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb); + + for (int src = 0; src < GGML_MAX_SRC; ++src) { + prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr; } - for (int src = 0; src < GGML_MAX_SRC; src++) { - cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] = - node->src[src] ? node->src[src]->data : nullptr; - } - memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS); + + memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS); } + + // Insert into the LRU cache (cache takes ownership and will delete it when evicted). + cann_ctx->graph_lru_cache.push(new_graph); } /** @@ -2199,30 +2239,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra } /** - * @brief Determine if the CANN graph needs to be rebuilt due to graph changes. + * @brief Check whether there is a cached CANN graph that matches the current ggml graph. * - * This checks whether the number or properties of ggml graph nodes have changed - * compared to the last captured CANN graph. If so, the CANN graph must be re-captured. + * This function iterates through the cached CANN graphs stored in the LRU cache and + * compares them against the given ggml computation graph. A match requires that the + * number of nodes is the same and that each node’s properties (operation type, + * dimensions, strides, inputs, and operation parameters) are identical. * - * @param cann_ctx The CANN backend context. + * If a matching graph is found, it is promoted to the front of the LRU cache and the + * function returns true. Otherwise, the function returns false, indicating that a new + * CANN graph needs to be captured. + * + * @param cann_ctx The CANN backend context containing the graph cache. * @param cgraph The current ggml computation graph. - * @return true if an update is required; false otherwise. + * @return true if a matching cached graph exists; false otherwise. */ -static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - // The number of nodes is different, so the graph needs to be reconstructed. - if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { - cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes); - return true; - } +static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { + ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache; + for (auto &graph_ptr : lru_cache.cache_list) { + // Skip graphs with a different number of nodes. + if (graph_ptr->ggml_graph_properties.size() != static_cast(cgraph->n_nodes)) { + continue; + } - // The number of nodes is the same; iterate over each node to check whether they match. 
- for (int i = 0; i < cgraph->n_nodes; i++) { - bool has_matching_properties = ggml_graph_node_has_matching_properties( - cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]); - if(!has_matching_properties) { + // Check if all nodes match. + bool all_match = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) { + all_match = false; + break; + } + } + + if (all_match) { + // update cache_list && renturn graph_ptr + lru_cache.move_to_front(graph_ptr); return true; } } + return false; } #endif // USE_ACL_GRAPH @@ -2241,17 +2296,13 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, * @param cann_graph_update_required Whether graph capture is needed due to graph changes. */ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph, - bool & use_cann_graph, bool & cann_graph_update_required) { + bool & use_cann_graph, bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH + ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); if (use_cann_graph && cann_graph_update_required) { - if (cann_ctx->cann_graph->graph != nullptr) { - ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph)); - cann_ctx->cann_graph->graph = nullptr; - } ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } #endif // USE_ACL_GRAPH - // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // With the use of CANN graphs, the execution will be performed by the graph launch. if (!use_cann_graph || cann_graph_update_required) { @@ -2272,12 +2323,12 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx #ifdef USE_ACL_GRAPH if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture - ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph)); + ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); } if (use_cann_graph) { // Execute graph - ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream())); + ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } #endif // USE_ACL_GRAPH } @@ -2311,19 +2362,17 @@ static enum ggml_status ggml_backend_cann_graph_compute( } if (use_cann_graph) { - if (cann_ctx->cann_graph == nullptr) { - cann_ctx->cann_graph.reset(new ggml_cann_graph()); - cann_graph_update_required = true; + // If no matching graph is found, the graph needs to be recaptured. + cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph); + if (cann_graph_update_required) { + // If no matching graph is found, add a new ACL graph. + add_lru_matched_graph_node_properties(cann_ctx, cgraph); } - - cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph); - set_ggml_graph_node_properties(cann_ctx, cgraph); } #else bool use_cann_graph = false; bool cann_graph_update_required = false; #endif // USE_ACL_GRAPH - evaluate_and_capture_cann_graph( cann_ctx, cgraph, From 10d8b2b6b0ac2cae252a80b4daea5da55ab63c2f Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 10 Sep 2025 18:42:00 +0800 Subject: [PATCH 17/26] CANN: Add ROPE sin/cos cache for reuse (#15912) * CANN: Add ROPE sin/cos cache for reuse Introduce sin/cos caching mechanism in ROPE to avoid redundant computation across layers. 
The cache is built on the first layer per device and reused by subsequent layers if parameters match. - Added sin_cache / cos_cache pointers and position_length tracking - Introduced cache validity flags and properties: (ext_factor, theta_scale, freq_scale, attn_factor, is_neox) - Accelerates ROPE by eliminating repeated sin/cos generation This change reduces overhead in multi-layer scenarios while preserving correctness by verifying parameter consistency. Co-authored-by: hipudding * fix typo Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com> Co-authored-by: hipudding --- ggml/src/ggml-cann/aclnn_ops.cpp | 62 +++++++++++++++++++------------- ggml/src/ggml-cann/common.h | 15 ++++++++ ggml/src/ggml-cann/ggml-cann.cpp | 3 ++ 3 files changed, 56 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index ac2e2e1adf..434023dd22 100755 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2268,8 +2268,6 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, * stream, and persistent buffers for rope init/cache. * @param dst The destination ggml_tensor whose computation * depends on the RoPE values (usually Qcur/Kcur). - * @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values. - * @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values. * @param theta_scale Scalar exponent base for computing theta scale values. * @param freq_scale Frequency scaling factor, applied to theta scale. * @param attn_factor Attention scaling factor, applied to sin/cos. @@ -2277,17 +2275,23 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, * (dim expansion vs repeat_interleave). */ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, - void* sin_tensor_buffer, void* cos_tensor_buffer, float* corr_dims, float ext_factor, float theta_scale, float freq_scale, float attn_factor, bool is_neox) { - // int sin/cos cache, cache has different repeat method depond on - // @param.is_neox - ggml_tensor* src0 = dst->src[0]; // input ggml_tensor* src1 = dst->src[1]; // position ggml_tensor* src2 = dst->src[2]; // freq_factors + if(src2 == nullptr && ctx.rope_cache.cached + && ctx.rope_cache.ext_factor == ext_factor + && ctx.rope_cache.theta_scale == theta_scale + && ctx.rope_cache.freq_scale == freq_scale + && ctx.rope_cache.attn_factor == attn_factor + && ctx.rope_cache.is_neox == is_neox) { + // use cache. 
+ return; + } + int64_t theta_scale_length = src0->ne[0] / 2; int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float), @@ -2316,8 +2320,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ctx.rope_cache.freq_scale != freq_scale) { ctx.rope_cache.theta_scale_length = theta_scale_length; - ctx.rope_cache.theta_scale = theta_scale; - ctx.rope_cache.freq_scale = freq_scale; if (ctx.rope_cache.theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache)); @@ -2342,7 +2344,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, // return MIN(1, MAX(0, y)) - 1; yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float)); void* yarn_ramp_buffer = yarn_ramp_allocator.get(); - acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float_t), + acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); float zero_value = 0, one_value = 1; float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]); @@ -2411,6 +2413,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor); } + // init sin_repeat && cos_repeat, only to accelerate first layer on each device + if (position_length > ctx.rope_cache.position_length) { + ctx.rope_cache.position_length = position_length; + if (ctx.rope_cache.sin_cache != nullptr) { + ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache)); + } + if (ctx.rope_cache.cos_cache != nullptr) { + ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache)); + } + int64_t repeat_theta_length = theta_scale_length * position_length * 2; + ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + } + // position aclTensor* acl_position_tensor = ggml_cann_create_tensor( src1->data, ggml_cann_type_mapping(src1->type), @@ -2462,10 +2478,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } aclTensor* acl_sin_repeat_tensor = - ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_cos_repeat_tensor = - ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); // repeat @@ -2483,6 +2499,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, num_repeats, output_size); } + // Other layers use cache except first layer. 
+ ctx.rope_cache.cached = true; + ctx.rope_cache.ext_factor = ext_factor; + ctx.rope_cache.theta_scale = theta_scale; + ctx.rope_cache.freq_scale = freq_scale; + ctx.rope_cache.attn_factor = attn_factor; + ctx.rope_cache.is_neox = is_neox; + ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor, acl_cos_repeat_tensor); @@ -2504,10 +2528,7 @@ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, #endif void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // TODO: use ascendc - // Only test with LLAMA model. ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src1 = dst->src[1]; // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; @@ -2538,15 +2559,8 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - // sin/cos tensor length. - int64_t repeat_theta_length = src0->ne[0] * src1->ne[0]; - ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float)); - ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float)); - void *sin_tensor_buffer = sin_tensor_allocator.get(); - void *cos_tensor_buffer = cos_tensor_allocator.get(); - // init ctx.rope_cos/rope_sin cache - aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer, corr_dims, ext_factor, + aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox); int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1}; @@ -2556,10 +2570,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } aclTensor* acl_sin_reshape_tensor = - ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_cos_reshape_tensor = - ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float), + ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclTensor* acl_src = ggml_cann_create_tensor(src0); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index 17d7dbc75c..c5fce8dc91 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -425,12 +425,27 @@ struct ggml_cann_rope_cache { if(theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(theta_scale_cache)); } + if(sin_cache != nullptr) { + ACL_CHECK(aclrtFree(sin_cache)); + } + if(cos_cache != nullptr) { + ACL_CHECK(aclrtFree(cos_cache)); + } } void* theta_scale_cache = nullptr; int64_t theta_scale_length = 0; + // sin/cos cache, used only to accelerate first layer on each device + void* sin_cache = nullptr; + void* cos_cache = nullptr; + int64_t position_length = 0; + // Properties to check before reusing the sincos cache + bool cached = false; + float ext_factor = 0.0f; float theta_scale = 0.0f; float freq_scale = 0.0f; + float attn_factor = 0.0f; + bool is_neox = false; }; struct ggml_cann_tensor_cache { diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index aa5913a377..d148174f1e 100755 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2353,6 +2353,9 @@ static enum ggml_status ggml_backend_cann_graph_compute( ggml_cann_set_device(cann_ctx->device); 
g_nz_workspaces[cann_ctx->device].clear(); + // calculate rope cache for fist layer in current device. + cann_ctx->rope_cache.cached = false; + #ifdef USE_ACL_GRAPH bool use_cann_graph = true; bool cann_graph_update_required = false; From 09e72a037c77b6823fde9e1e4cf28e815b9c41ab Mon Sep 17 00:00:00 2001 From: Jesse Date: Wed, 10 Sep 2025 07:28:47 -0400 Subject: [PATCH 18/26] gitignore : Ignore vim swap files in tests (#15901) --- tests/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/.gitignore b/tests/.gitignore index 620a48ee44..cbc381606c 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,3 +2,4 @@ !*.* *.o ggml-common.h +**/*.swp From 2cfef4d117d67ab1dec002915b48a15d11ee1973 Mon Sep 17 00:00:00 2001 From: j-k Date: Wed, 10 Sep 2025 12:51:28 +0100 Subject: [PATCH 19/26] media : add transparent icon svg and png [no ci] (#15891) --- media/llama1-icon-transparent.png | Bin 0 -> 14270 bytes media/llama1-icon-transparent.svg | 77 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 media/llama1-icon-transparent.png create mode 100644 media/llama1-icon-transparent.svg diff --git a/media/llama1-icon-transparent.png b/media/llama1-icon-transparent.png new file mode 100644 index 0000000000000000000000000000000000000000..432d6c2223bb445362eadf946ecafea5ea4fd558 GIT binary patch literal 14270 zcmd73`8(8K_&;vTk}!j^Z)1yOe<5QDV-J-!YecpZO2(2cV(i(X82gfttt4g`OC++E zB^f(o9sAgQ&h-AgzkkB_`uyN>U1y%pbMABB=U&d^aX+K3o9Z*2OzW4*sGpn(x^}BR$s*=-sO%OrvAHkD_{3WNIlCFjQv*# zvhBS?qEPG^v^LfDPrjz&=%Q4ZI2X}dFML#i_PvU<(!5Je)Oio;ai-g*qtU_`Y6JpE zV)}b15S3?PM?7=MZ{cyPFOdQYWvVYQ{`_3=#w}6FKzhzKw8GU#*#CK&|MR97qELpK z&ae1!aT8D^%BAny#H;d|k&#W;$-vhy4~M>Fxc2F`p=oVi{B-vam$o_6yc{^I<2>a= zq=7=!-A#^u{pl5AR-Po~%EsCIgdty|Q1N}DerebEzA5Zw!N=V35zZ$s11s}!&Xew99Wq5aFi4!9PA#+%6|>ELotb9e?A>@;T~*Ldq!ytK}dv0WL`XlC@OYJe^60fzx%K&$v~;gt`oGLd_#Pp`~} zY+Q-^iYkbrwc)N%-4R7hIpa+65+<7$9r7xC^-(Cbw2!&Q4!4U<1wY)UTbV)+sQJh# zTv%p!)cf3iS*7Luf`DMz{f4X`)&H65!d~TTrL}`oiMVwtw$4gS5j%hjTQ<(}CDtu{ zx{$XMd38d>j&($fW{in3fTBU$mFBVUNpFDlo2}#~WTm1uG;5z@tDr!}VzL=un3cDKW)xP)k5U;J3 z$e{NRn~0A?N@r^J@mxQh)8?Lbm#)HDH~3K#)IKV8Cqlt8FKsA>3XxFMV*#f6b0*LK z+=Cs8$^L~Gc4ifb9wi_VIzKNFyY+g6r*&|qc6i;>zC1aO(I9Sbu6Hi04{-=$Y>~pe zT+IRz&&A{g6gQOSWdDM~Ar{%=q_)H-n4}Ay#9Ub|wws#3DzL6AFYR<;gd=)lEyo&i zyoK7=@c&sk9{qu9iNq)p(ymp^+6)!R7H0;ZPa!k~a zFr^ij4CLY@q7y@`7_bxT@tkT3ZmSDdv2Y%T9^qfv`&GzrgA0rFHnkI(yGVP?5SXo5 z!$`CZ&lM*B#oc4NwnED{75M}AYX8=giON+0aoDL*nDm<)Fvj3=nejnG^D~1^(c= zn|XXF;cyR=_M$RNh=13QNS|0XAb}j!crH!pv(sa6m2WQ{*_H!bCX_Jj_^4ZGol7!? 
z_5JN0gFXPn9Yy{N){7|(kt0FN=+~7BZ|~wqfdnGGiUfX1btGkipRo!JT8fE(m2x+gn5o<8!QC2GjkRW*n?+^;lN8>4>a zmL9v4&yGcbmRU^QHed9uB({mfoG6x#oQ-+U z0!Q1y`$yF~;QAuVpeRu&{`)Go0h_%;fT{fi5pedJ^p+6xYQD1>?bK#tnDjaxy2ZC&05oO za7sTSJ1%U-~JX$4|N`jru=1 z&LB36zrn9636Ffc=Nkt2&rfzr*&jE(VkOXBvELqH!yFZ)GgMxtvYEE2uzWK zV_EkeC=7z8qvF4VZl(b&7q*AWe=>NRyInzB#H^7_6HPvtGqC&-e!4Ygj~xFo=YVM&i49f=4Wzo@|v z`j^io#rF-CmA;S;?ZPU^Ps&Z^d+)v&hl_mqkleu*m(DyzjVlnt{kcaaXBdf zn6}iE$MDmZjQeY{3Eq$k{yHtN(~o>$MA$k*9eWS{{o`2<{Nx&-R@ zTAJ|^Ts7}-3-pCC@1<;~_>%ok)e3vwLW-#72kdvXRO>Z~n}1+jfHHY`#l6_RXrYj$ zCmEN9GD?$wv@rnyhC=@lPa_BX)yZ)jPskMB5Jw`cT=@;xCNmdr?evCI9@4-E8bn%^ z9=xEdk&!KJ7=yR}{FdBa>GW&4YW>b+9Ju8b1aGLIu?>r=_1|{~bDVYWjx96zppkkS z;bXFg@}iBmQC}JYm+H_Xl^Z4*dPmQ37l{Ss(KD?jQXo3&h&)mxSloQnbfKlm&w2K{ z-sD|KV^<~g?lTa1v0>arr5Mm7O({-2NC^I5aVJyXxpx8hVKwkAQGmn=eVCw)-8Sva z)u4*0^5paKzMPshCja~o60}|_ZJwR{kF$c(b{ehdJ9X&2+xW9V``>a3#evKhRIv(Y zCy6){oJXi=+m028t22LpRk%S5hoahUz}u^Ewlx)!ZpWPtv$H{?61>7n7hJd4_D$C30fb%)y)9MQiwOKr+9t%=D9UL9#+=K9`gFD^7%T%eG``s`!0WCDZ zyGyOwV7BoQ(wPKMdD@afV&JxB3PL2Z0`NB-;>~B{7y?E&L6brifp0tbT8RNa0URlc#rr83GWgWJy?&uHhCz02j@*t4 z7@Yk0EHiuSMX+1Q4Q~wcr2)W>pGna8lxH&?{b&Fex0Vh3nSW1~0=mx8_n19)44~~1 zgjB%4fy8472cT8p zo+p7ezIUK)tSJA`U;F5e9Tpk=-KT%6U$@o>w6fKt*+yt|lNzW`^iHYo!3 zu*%coNdHU-vQ*-y`%3_$x0~ZhrVztRO!eCtep@pDY0;h32B37gis^8_u2QcyJ)S8^ z>2s|DZk{?H?8a8`!Py%BH{fg)#KnVtRMmh}fe|kpHBgldWcgFW_j!QFZ>cr}>#MD} zOwxjxO!z6C!Ue0-_6m7hSkt z0BZ=Gpn(IN|LhDBSICPYDqq?8P>+ly;}Q=irQfPXJ5QTwsj$crc{>X^>)u@KcDO@) z0N$7C&*|D-Cim+$Qmhcsu&-6|xKzY0mufw6C!UN9)Sv4SHP_$pK6c>bpb*A96RF9f zX-KM@)0R|LpXC!ydPwd#w;ap0yH67jaPMpTjO!M9<%g1>Hva>7NESuH0EN@9AI@6o zbSq0X!X>q@aSwT`i3!j1GYlMD`wG24(^AB9VNC-&G1(rt{!It<_S?T!A4rY|uY>i0 zmKE`s&7Q}z`J4h;!^1Hv_EcL8JH0n7cEiE$@L&Qz(Dn?mU{c`=l(fImhLfq3%|1vM zFb7;ZpU6#iZg1B{w=qgYf22@m9RFU@-2_No5@gUJg_B&ax#>JbIW*BawZP{rM}e5Mq-)fYMF*ssnx(PG)y{y*&?pxvo{ zujF6$_d1fUR0!$E8&>NEN3I9v5qP?nENBkmbVS_2HzUjdx{1=e7P%f|SB;FPSQlwe zh-#&3NniF3%P^=1k_3W}1xgh7X3BA)_$z0&=slS(9ETr#UoIhco{GSz!M6_HrJwnt zM>FVmo#;D!7iNEDgRd|iPzI{Ul!TAK4S?r>?^yCn17Sj$=cZ^X9s2$wHCvwf8}6mP zLl7tTF#i>XG398bJ^o<&NXQb-z$gw`p`q`f62{itT$0P-m=SwjUDM5HY&Mx?+Cr4;NWo2gFHl6Q*kK`AR^?0_#SztE1?)Pu*Ut<@b?+=AA zx2;^A_7rT#4UXhPjr*G!R`9fO4($%kahjFs7vNQ->P|q8jhizJOdnYc)IpAk) zMr4n4KeaVL@@zuK1y{7YRit8@%V<{UuU4WDPLuhn))O@8w?oyBSBB5^+8xa}+!%(m z7{#8au`;B;6qR>!8oPXnYp6QO=HL#&;g+sr0QuGpzPw8K#14wq|JT39`hV{;vCvZ% Zm8;j&#LGp%r*{+-hPtMRkJ@%G{|_<)MHB!4 literal 0 HcmV?d00001 diff --git a/media/llama1-icon-transparent.svg b/media/llama1-icon-transparent.svg new file mode 100644 index 0000000000..e28203f4e8 --- /dev/null +++ b/media/llama1-icon-transparent.svg @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + + + + From e7b6d83b524bbc24a1343d862de6dba8e8eddbd6 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Sep 2025 14:17:09 +0200 Subject: [PATCH 20/26] tests : filter out no-ops from coverage report (#15900) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * tests : filter out no-ops from coverage report This commit is a follow-up commit for #15745 to address the feedback on how no-op operations should be filtered out from the coverage report. The feedback regarding the UNARY and GLU sub-operations not being handled I not exactly sure what should be done. 
They are included in the coverage, for example ABS, ELU, EXP, GELU, GEGLU, GEGLU_ERF etc are in the list of covered operations: ```console $ ./build/bin/test-backend-ops --show-coverage Operations covered by tests (89): ✓ ABS ✓ ACC ✓ ADD ✓ ADD1 ✓ ADD_ID ✓ ARANGE ✓ ARGMAX ✓ ARGSORT ✓ CLAMP ✓ CONCAT ✓ CONV_2D ✓ CONV_2D_DW ✓ CONV_3D ✓ CONV_TRANSPOSE_1D ✓ CONV_TRANSPOSE_2D ✓ COS ✓ COUNT_EQUAL ✓ CPY ✓ CROSS_ENTROPY_LOSS ✓ CROSS_ENTROPY_LOSS_BACK ✓ DIAG_MASK_INF ✓ DIV ✓ DUP ✓ ELU ✓ EXP ✓ FLASH_ATTN_EXT ✓ GATED_LINEAR_ATTN ✓ GEGLU ✓ GEGLU_ERF ✓ GEGLU_QUICK ✓ GELU ✓ GELU_ERF ✓ GELU_QUICK ✓ GET_ROWS ✓ GET_ROWS_BACK ✓ GROUP_NORM ✓ HARDSIGMOID ✓ HARDSWISH ✓ IM2COL ✓ IM2COL_3D ✓ L2_NORM ✓ LEAKY_RELU ✓ LOG ✓ MEAN ✓ MUL ✓ MUL_MAT ✓ MUL_MAT_ID ✓ NEG ✓ NORM ✓ OPT_STEP_ADAMW ✓ OPT_STEP_SGD ✓ OUT_PROD ✓ PAD ✓ PAD_REFLECT_1D ✓ POOL_2D ✓ REGLU ✓ RELU ✓ REPEAT ✓ REPEAT_BACK ✓ RMS_NORM ✓ RMS_NORM_BACK ✓ ROLL ✓ ROPE ✓ ROPE_BACK ✓ RWKV_WKV6 ✓ RWKV_WKV7 ✓ SCALE ✓ SET ✓ SET_ROWS ✓ SGN ✓ SIGMOID ✓ SILU ✓ SILU_BACK ✓ SIN ✓ SOFT_MAX ✓ SOFT_MAX_BACK ✓ SQR ✓ SQRT ✓ SSM_CONV ✓ SSM_SCAN ✓ STEP ✓ SUB ✓ SUM ✓ SUM_ROWS ✓ SWIGLU ✓ SWIGLU_OAI ✓ TANH ✓ TIMESTEP_EMBEDDING ✓ UPSCALE Operations without tests (14): ✗ ADD_REL_POS ✗ CUSTOM ✗ DIAG ✗ DIAG_MASK_ZERO ✗ FLASH_ATTN_BACK ✗ GET_REL_POS ✗ IM2COL_BACK ✗ MAP_CUSTOM1 ✗ MAP_CUSTOM2 ✗ MAP_CUSTOM3 ✗ POOL_1D ✗ POOL_2D_BACK ✗ WIN_PART ✗ WIN_UNPART Coverage Summary: Total operations: 103 Tested operations: 89 Untested operations: 14 Coverage: 86.4% ``` Refs: https://github.com/ggml-org/llama.cpp/pull/15745 * use of ggml_op enum values instead of strcmp --- tests/test-backend-ops.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index adf91ab6f9..4a882ab072 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6807,7 +6807,17 @@ static void list_all_ops() { static void show_test_coverage() { std::set all_ops; for (int i = 1; i < GGML_OP_COUNT; i++) { - all_ops.insert(ggml_op_name((enum ggml_op)i)); + auto op = (enum ggml_op)i; + if (op == GGML_OP_VIEW || + op == GGML_OP_RESHAPE || + op == GGML_OP_PERMUTE || + op == GGML_OP_TRANSPOSE || + op == GGML_OP_CONT || + op == GGML_OP_GLU || + op == GGML_OP_UNARY) { + continue; + } + all_ops.insert(ggml_op_name(op)); } for (int i = 0; i < GGML_UNARY_OP_COUNT; i++) { all_ops.insert(ggml_unary_op_name((enum ggml_unary_op)i)); From 33daece86b65607451d0d4378d2d04ba6a20ad55 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Sep 2025 15:39:57 +0200 Subject: [PATCH 21/26] ci : add caching for ROCm installation in release workflow (#15924) This commit applies the same caching to the release workflow which currently exists for the main CI workflow that was introduced in Commit ff02caf9eed261423289d1531a56536fbf57bfc2 ("ci : cache ROCm installation in windows-latest-cmake-hip (#15887)"). 
--- .github/workflows/release.yml | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5367637e42..701811eeb2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -544,13 +544,23 @@ jobs: run: | git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-6.1-${{ runner.os }}-v1 + restore-keys: | + rocm-6.1-${{ runner.os }}- + - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: key: windows-latest-cmake-hip-${{ matrix.name }}-x64 evict-old-files: 1d - - name: Install + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' id: depends run: | $ErrorActionPreference = "Stop" @@ -558,13 +568,28 @@ jobs: Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru - $proc.WaitForExit(600000) + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } write-host "Completed AMD HIP SDK installation" - name: Verify ROCm id: verify run: | - & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version - name: Build id: cmake_build From 0f0a3c2851134d49955f3c85afbb0b1bb47c3e07 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Sep 2025 17:52:35 +0300 Subject: [PATCH 22/26] metal : make the backend async (#15906) * metal : make the backend async ggml-ci * cont : add comments, extend op offload, clean up ggml-ci * metal : fix batch size for MUL_MAT_ID * metal : remove deprecated ggml_backend_metal_buffer_from_ptr * metal : create only metal buffers, no wrapping of host memory ggml-ci * metal : restore .alloc_buffer for buffer_from_ptr_type ggml-ci * metal : remove broken implementation of GGML_OP_SET ggml-ci * metal : clean-up loose ends, ready for tests ggml-ci * metal : support both private and shared buffers ggml-ci * metal : enable private buffers + add global device queue * metal : disable host buffer to prevent races ggml-ci * metal : avoid extra copy during set_tensor ggml-ci * metal : use separate buffer types for shread and private Metal buffers ggml-ci * metal : simplify synchronization logic ggml-ci * metal : fix build ggml-ci * metal : do not implement cpy_tensor ggml-ci * metal : separate implementations for shared and private buffers ggml-ci --- ggml/include/ggml-metal.h | 6 - ggml/src/ggml-metal/ggml-metal.m | 975 ++++++++++++++++++--------- ggml/src/ggml-metal/ggml-metal.metal | 32 - 3 files changed, 674 insertions(+), 339 deletions(-) diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index a610694423..1163438bc2 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t 
ggml_backend_metal_init(void); GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); -GGML_DEPRECATED( - GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), - "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713"); - GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - // helper to check if the device supports a specific family // ideally, the user code should be doing these checks // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index e76fb71263..07b96dbdd5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,6 +48,11 @@ static struct ggml_backend_metal_device_context { int mtl_device_ref_count; id mtl_library; + // a single global queue shared by all Metal backends + // technically not needed for devices with unified memory, but enables discrete GPUs support + // ref: https://github.com/ggml-org/llama.cpp/pull/15906 + id mtl_queue; + NSLock * mtl_lock; bool has_simdgroup_reduction; @@ -56,6 +61,7 @@ static struct ggml_backend_metal_device_context { bool has_bfloat; bool use_bfloat; bool use_fusion; + bool use_shared_buffers; int debug_fusion; @@ -69,6 +75,7 @@ static struct ggml_backend_metal_device_context { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_queue =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, @@ -76,6 +83,7 @@ static struct ggml_backend_metal_device_context { /*.has_bfloat =*/ false, /*.use_bfloat =*/ false, /*.use_fusion =*/ true, + /*.use_shared_buffers =*/ true, /*.debug_fusion =*/ 0, /*.fuse_cnt =*/ { 0 }, /*.max_size =*/ 0, @@ -94,6 +102,11 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + if (ctx->mtl_queue == nil) { + GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); + } + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -118,6 +131,12 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->debug_fusion = val ? 
atoi(val) : 0; } + ctx->use_shared_buffers = ctx->mtl_device.hasUnifiedMemory; + + if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) { + ctx->use_shared_buffers = false; + } + memset(ctx->fuse_cnt, 0, sizeof(ctx->fuse_cnt)); ctx->max_size = ctx->mtl_device.maxBufferLength; @@ -161,6 +180,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -467,8 +491,6 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, - GGML_METAL_KERNEL_TYPE_SET_I32, - GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -773,7 +795,7 @@ struct ggml_metal_command_buffer { struct ggml_backend_metal_context { id device; - id queue; + id queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND] dispatch_queue_t d_queue; @@ -803,6 +825,12 @@ struct ggml_backend_metal_context { // n_cb command buffers + 1 used by the main thread struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + // extra command buffers for things like getting, setting and copying tensors + NSMutableArray * cmd_bufs_ext; + + // the last command buffer queued into the Metal queue with operations relevant to the current Metal backend + id cmd_buf_last; + // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; void * abort_callback_data; @@ -999,7 +1027,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + + // TODO: question - would it be better to have one queue for the backend and one queue for the device? + // the graph encoders and async ops would use the backend queue while the sync ops would use the device queue? + //ctx->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND] + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1058,6 +1090,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false"); GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: use fusion = %s\n", __func__, ctx_dev->use_fusion ? "true" : "false"); + GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, ctx_dev->use_shared_buffers ? "true" : "false"); GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? 
"true" : "false"); ctx->capture_next_compute = false; @@ -1073,6 +1107,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->cmd_bufs[i].mem_pool->device = device; } + ctx->cmd_bufs_ext = [[NSMutableArray alloc] init]; + + ctx->cmd_buf_last = nil; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, device.recommendedMaxWorkingSetSize / 1e6); @@ -1390,8 +1428,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -1663,14 +1699,19 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; + //[ctx->queue release]; // [TAG_QUEUE_PER_BACKEND] for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - // ctx->cmd_bufs[i].obj is auto released + if (ctx->cmd_bufs[i].obj) { + [ctx->cmd_bufs[i].obj release]; + } ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); } + [ctx->cmd_bufs_ext removeAllObjects]; + [ctx->cmd_bufs_ext release]; + dispatch_release(ctx->d_queue); free(ctx); @@ -1688,14 +1729,21 @@ struct ggml_backend_metal_buffer { struct ggml_backend_metal_buffer_context { void * all_data; size_t all_size; - bool owned; + + // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host + bool is_shared; // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap int n_buffers; struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; // optional MTLResidencySet + // note: cannot use explicity "id" here because it is not available on certain OSes id rset; + + // pointers to global device objects + id device; + id queue; }; // rset init @@ -1761,7 +1809,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer // -static id ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) { +static id ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) { //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -1984,16 +2032,6 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } - case GGML_OP_SET: - { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_I32: - return true; - default: - return false; - }; - } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -5569,68 +5607,6 @@ static int ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) 
threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; } break; - case GGML_OP_SET: - { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - - // src0 and dst as viewed during set - const size_t dst_nb0 = ggml_element_size(src0); - - const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; - const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; - const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; - const size_t offset = ((int32_t *) dst->op_params)[3]; - const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - - if (!inplace) { - memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst)); - } - - const int im0 = (ne10 == 0 ? 0 : ne10-1); - const int im1 = (ne11 == 0 ? 0 : ne11-1); - const int im2 = (ne12 == 0 ? 0 : ne12-1); - const int im3 = (ne13 == 0 ? 0 : ne13-1); - - GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); - - id pipeline = nil; - - switch (src0t) { - case GGML_TYPE_F32: - GGML_ASSERT(nb10 == sizeof(float)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; - case GGML_TYPE_I32: - GGML_ASSERT(nb10 == sizeof(int32_t)); - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; - default: GGML_ABORT("fatal error"); - } - - ggml_metal_kargs_set args = { - /*.ne10 =*/ ne10, - /*.ne11 =*/ ne11, - /*.ne12 =*/ ne12, - /*.nb10 =*/ nb10, - /*.nb11 =*/ nb11, - /*.nb12 =*/ nb12, - /*.nb13 =*/ nb13, - /*.nb1 =*/ dst_nb1, - /*.nb2 =*/ dst_nb2, - /*.nb3 =*/ dst_nb3, - /*.offs =*/ offset, - /*.inplace =*/ inplace, - }; - - const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); - - [encoder setComputePipelineState:pipeline]; - [encoder setBytes:&args length:sizeof(args) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; - - [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; - } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); @@ -5759,6 +5735,12 @@ static enum ggml_status ggml_metal_graph_compute( if (should_capture) { ctx->capture_next_compute = false; + // make sure all previous computations have finished before starting the capture + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + if (!ctx->capture_started) { // create capture scope ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx_dev->mtl_device]; @@ -5781,78 +5763,103 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed + // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009 + //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; [cmd_buf enqueue]; + ctx->encode_async(n_cb); } - // prepare the rest of the command buffers asynchronously + // remember the command buffer for the next iteration + ctx->cmd_buf_last = ctx->cmd_bufs[n_cb].obj; + + // prepare the rest of 
the command buffers asynchronously (optional) // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id cmd_buf = [ctx->queue commandBuffer]; + [cmd_buf retain]; + + if (ctx->cmd_bufs[cb_idx].obj) { + [ctx->cmd_bufs[cb_idx].obj release]; + } ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { [cmd_buf enqueue]; + + // update the pointer to the last queued command buffer + // this is needed to implement synchronize() + ctx->cmd_buf_last = cmd_buf; } } dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); - // wait for completion and check status of each command buffer - // needed to detect if the device ran out-of-memory for example (#1881) - { - id cmd_buf = ctx->cmd_bufs[n_cb].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } - - return GGML_STATUS_FAILED; - } - } - - for (int i = 0; i < n_cb; ++i) { - id cmd_buf = ctx->cmd_bufs[i].obj; - [cmd_buf waitUntilCompleted]; - - MTLCommandBufferStatus status = [cmd_buf status]; - if (status != MTLCommandBufferStatusCompleted) { - GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); - if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); - } - - return GGML_STATUS_FAILED; - } - - id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); - if (!next_buffer) { - continue; - } - - const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); - if (next_queued) { - continue; - } - - if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { - GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); - return GGML_STATUS_ABORTED; - } - - [next_buffer commit]; - } + // for debugging: block until graph is computed + //[ctx->cmd_buf_last waitUntilCompleted]; + // enter here only when capturing in order to wait for all computation to finish + // otherwise, we leave the graph to compute asynchronously if (!should_capture && ctx->capture_started) { + // wait for completion and check status of each command buffer + // needed to detect if the device ran out-of-memory for example (#1881) + { + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + } + + for (int i = 0; i < n_cb; ++i) { + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + + return GGML_STATUS_FAILED; + } + + id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); + if (!next_buffer) { + continue; + } + + const bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued); + if (next_queued) { + continue; + } + + if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) { + GGML_LOG_INFO("%s: command buffer %d aborted", __func__, i); + return GGML_STATUS_ABORTED; + } + + [next_buffer commit]; + } + [ctx->capture_scope endScope]; [[MTLCaptureManager sharedCaptureManager] stopCapture]; } @@ -5862,10 +5869,12 @@ static enum ggml_status ggml_metal_graph_compute( } //////////////////////////////////////////////////////////////////////////////// - // backend interface +//////////////////////////////////////////////////////////////////////////////// -static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +// shared buffer + +static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -5874,7 +5883,9 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) ggml_backend_metal_buffer_rset_free(ctx); - if (ctx->owned) { + GGML_ASSERT(ctx->is_shared); + + { #if TARGET_OS_OSX vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); #else @@ -5885,66 +5896,254 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +static void ggml_backend_metal_buffer_shared_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + GGML_ASSERT(ctx->is_shared); + + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_metal_buffer_shared_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct 
ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_metal_buffer_shared_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return false; +} + +static void ggml_backend_metal_buffer_shared_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(ctx->is_shared); + memset(ctx->all_data, value, ctx->all_size); } -static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { - /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, - /* .get_base = */ ggml_backend_metal_buffer_get_base, +static struct ggml_backend_buffer_i ggml_backend_metal_buffer_shared_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_shared_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_shared_get_base, /* .init_tensor = */ NULL, - /* .memset_tensor = */ ggml_backend_metal_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor, - /* .clear = */ ggml_backend_metal_buffer_clear, + /* .memset_tensor = */ ggml_backend_metal_buffer_shared_memset_tensor, + /* .set_tensor = */ ggml_backend_metal_buffer_shared_set_tensor, + /* .get_tensor = */ ggml_backend_metal_buffer_shared_get_tensor, + /* .cpy_tensor = */ ggml_backend_metal_buffer_shared_cpy_tensor, + /* .clear = */ ggml_backend_metal_buffer_shared_clear, /* .reset = */ NULL, }; -// default buffer type +// private buffer -static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "Metal"; +static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - GGML_UNUSED(buft); + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } + + ggml_backend_metal_buffer_rset_free(ctx); + + GGML_ASSERT(!ctx->is_shared); + + free(ctx); } +static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; +} + +static void ggml_backend_metal_buffer_private_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // dst + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder fillBuffer:buf_dst + 
range:NSMakeRange(buf_dst_offset, buf_dst_offset + size) + value:value]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static void ggml_backend_metal_buffer_private_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // src + void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data + id buf_src = [ctx->device newBufferWithBytesNoCopy:data_ptr + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + // dst + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + buf_dst_offset += offset; + + // note: for experimentation purposes, here we use a semaphore to wait for the copy to complete + // this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference + dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0); + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + } + + [cmd_buf addCompletedHandler:^(id cb) { + // TODO: can check for errors here + GGML_UNUSED(cb); + + dispatch_semaphore_signal(completion_semaphore); + }]; + + [cmd_buf commit]; + + dispatch_semaphore_wait(completion_semaphore, DISPATCH_TIME_FOREVER); + //[cmd_buf waitUntilCompleted]; + } +} + +static void ggml_backend_metal_buffer_private_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + // src + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + buf_src_offset += offset; + + // dst + id buf_dst = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static bool ggml_backend_metal_buffer_private_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return false; +} + +static void ggml_backend_metal_buffer_private_clear(ggml_backend_buffer_t buffer, uint8_t value) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + GGML_ASSERT(!ctx->is_shared); + + @autoreleasepool { + id queue = ctx->queue; + id cmd_buf = [queue commandBufferWithUnretainedReferences]; + + { + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder fillBuffer:ctx->buffers[0].metal + range:NSMakeRange(0, ctx->buffers[0].size) + value:value]; + + [encoder endEncoding]; + } + + [cmd_buf commit]; + [cmd_buf waitUntilCompleted]; + } +} + +static struct 
ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_private_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_private_get_base, + /* .init_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_metal_buffer_private_memset_tensor, + /* .set_tensor = */ ggml_backend_metal_buffer_private_set_tensor, + /* .get_tensor = */ ggml_backend_metal_buffer_private_get_tensor, + /* .cpy_tensor = */ ggml_backend_metal_buffer_private_cpy_tensor, + /* .clear = */ ggml_backend_metal_buffer_private_clear, + /* .reset = */ NULL, +}; + +// +// buffer types +// + static void ggml_backend_metal_log_allocated_size(id device, size_t size_aligned) { #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -5970,7 +6169,8 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s GGML_UNUSED(size_aligned); } -static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +// common method for allocating shread or private Metal buffers +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) { struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -5986,22 +6186,40 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba id device = ctx_dev->mtl_device; - ctx->all_data = ggml_metal_host_malloc(size_aligned); + // allocate shared buffer if the device supports it and it is required by the buffer type + if (ctx_dev->use_shared_buffers && shared) { + ctx->all_data = ggml_metal_host_malloc(size_aligned); + ctx->is_shared = true; + } else { + // dummy, non-NULL value - we'll populate this after creating the Metal buffer below + ctx->all_data = (void *) 0x000000400ULL; + ctx->is_shared = false; + } ctx->all_size = size_aligned; - ctx->owned = true; + + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; ctx->buffers[0].size = size; ctx->buffers[0].metal = nil; if (size_aligned > 0) { - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + if (ctx_dev->use_shared_buffers) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } else { + ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate]; + + ctx->all_data = (void *) (ctx->buffers[0].metal.gpuAddress); + } } + + ctx->buffers[0].data = ctx->all_data; } if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { @@ -6018,36 +6236,50 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba //ggml_backend_metal_log_allocated_size(device, size_aligned); - return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); + struct ggml_backend_buffer_i buf_i = ctx->is_shared ? 
ggml_backend_metal_buffer_shared_i : ggml_backend_metal_buffer_private_i; + + return ggml_backend_buffer_init(buft, buf_i, ctx, size); } -static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +// default (shared) buffer type + +static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true); +} + +static size_t ggml_backend_metal_buffer_type_shared_get_alignment(ggml_backend_buffer_type_t buft) { return 32; GGML_UNUSED(buft); } -static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_buffer_type_t buft) { const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; return max_size; } -static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return true; +static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_type_t buft) { + return false; GGML_UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, + /* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, @@ -6056,116 +6288,101 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } -static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) { - return "Metal_Mapped"; +// default (private) buffer type + +static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) { + return "Metal_Private"; GGML_UNUSED(buft); } -static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) { - static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = { +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, false); +} + +static size_t ggml_backend_metal_buffer_type_private_get_alignment(ggml_backend_buffer_type_t buft) { + return 32; + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_buffer_type_t buft) { + const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; + + 
return max_size; +} + +static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { - /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name, - /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, + /* .get_name = */ ggml_backend_metal_buffer_type_private_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_metal_buffer_type_is_host, + /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, }, /* .device = */ &g_ggml_backend_metal_device, /* .context = */ NULL, }; - return &ggml_backend_buffer_from_ptr_type_metal; + return &ggml_backend_buffer_type_metal; } -// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr -ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); +// mapped buffer type - ctx->all_data = data; - ctx->all_size = size; - ctx->owned = false; - ctx->n_buffers = 0; +static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) { + return "Metal_Mapped"; - const size_t size_page = sysconf(_SC_PAGESIZE); + GGML_UNUSED(buft); +} - // page-align the data ptr - { - const uintptr_t offs = (uintptr_t) data % size_page; - data = (void *) ((char *) data - offs); - size += offs; - } +static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // for mapped buffers, prefer shared memory + return ggml_backend_metal_buffer_type_alloc_buffer(buft, size, true); +} - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } +static size_t ggml_backend_metal_buffer_type_mapped_get_alignment(ggml_backend_buffer_type_t buft) { + return 32; - struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main; + GGML_UNUSED(buft); +} - GGML_ASSERT(ctx_dev->mtl_device != nil); +static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_buffer_type_t buft) { + const size_t max_size = ((struct ggml_backend_metal_device_context *)buft->device->context)->max_size; - id device = ctx_dev->mtl_device; + return max_size; +} - // the buffer fits into the max buffer size allowed by the device - if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; - ctx->buffers[ctx->n_buffers].metal = nil; +static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_type_t buft) { + return false; - if (size_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + GGML_UNUSED(buft); +} - if (ctx->buffers[ctx->n_buffers].metal == nil) { - 
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; - } - } +static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) { + // note: not obvious, but this buffer type still needs to implement .alloc_buffer: + // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099 + static struct ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = { + /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name, + /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, + /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, + }, + /* .device = */ &g_ggml_backend_metal_device, + /* .context = */ NULL, + }; - ggml_backend_metal_log_allocated_size(device, size_aligned); - - ++ctx->n_buffers; - } else { - // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into - // one of the views - const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case - const size_t size_step = device.maxBufferLength - size_ovlp; - const size_t size_view = device.maxBufferLength; - - for (size_t i = 0; i < size; i += size_step) { - const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; - ctx->buffers[ctx->n_buffers].metal = nil; - - if (size_step_aligned > 0) { - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; - } - } - - ggml_backend_metal_log_allocated_size(device, size_step_aligned); - - if (i + size_step < size) { - GGML_LOG_INFO("\n"); - } - - ++ctx->n_buffers; - } - } - - if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) { - GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__); - free(ctx); - return NULL; - } - - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); + return &ggml_backend_buffer_type_mapped_metal; } // backend @@ -6184,6 +6401,137 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + struct ggml_backend_metal_context * ctx = backend->context; + + // wait for any backend operations to finish + if (ctx->cmd_buf_last) { + [ctx->cmd_buf_last waitUntilCompleted]; + ctx->cmd_buf_last = nil; + } + + // release any completed command buffers + if (ctx->cmd_bufs_ext.count > 0) { + for (size_t i = 0; i < ctx->cmd_bufs_ext.count; ++i) { + id cmd_buf = ctx->cmd_bufs_ext[i]; + + MTLCommandBufferStatus status = [cmd_buf status]; + if (status != MTLCommandBufferStatusCompleted) { + GGML_LOG_ERROR("%s: error: command buffer %d failed with status %d\n", __func__, (int) i, (int) status); + if (status == MTLCommandBufferStatusError) { + GGML_LOG_ERROR("error: 
%s\n", [[cmd_buf error].localizedDescription UTF8String]); + } + GGML_ABORT("fatal error"); + } + + [cmd_buf release]; + } + + [ctx->cmd_bufs_ext removeAllObjects]; + } +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + // wrap the source data into a Metal buffer + id buf_src = [device newBufferWithBytes:data + length:size + options:MTLResourceStorageModeShared]; + + size_t buf_dst_offset = 0; + id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); + + if (buf_dst == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } + + buf_dst_offset += offset; + + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:0 + toBuffer:buf_dst + destinationOffset:buf_dst_offset + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + struct ggml_backend_metal_context * ctx = backend->context; + struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; + + @autoreleasepool { + id device = ctx_dev->mtl_device; + + id buf_dst = [device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; + + size_t buf_src_offset = 0; + id buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset); + + if (buf_src == nil) { + GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name); + } + + buf_src_offset += offset; + + // queue the copy operation into the queue of the Metal context + // this will be queued at the end, after any currently ongoing GPU operations + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + id encoder = [cmd_buf blitCommandEncoder]; + + [encoder copyFromBuffer:buf_src + sourceOffset:buf_src_offset + toBuffer:buf_dst + destinationOffset:0 + size:size]; + + [encoder endEncoding]; + [cmd_buf commit]; + + // do not wait here for completion + //[cmd_buf waitUntilCompleted]; + + // instead, remember a reference to the command buffer and wait for it later if needed + [ctx->cmd_bufs_ext addObject:cmd_buf]; + ctx->cmd_buf_last = cmd_buf; + + [cmd_buf retain]; + } +} + +static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) { + return false; + + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); +} + static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return ggml_metal_graph_compute(backend, cgraph); } @@ -6214,7 +6562,10 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t 
backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + + ggml_metal_mem_pool_reset(mem_pool); id encoder = [cmd_buf computeCommandEncoder]; @@ -6228,9 +6579,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; - ggml_metal_mem_pool_reset(mem_pool); - for (int idx = node_start; idx < node_end;) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; @@ -6264,15 +6612,19 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_metal_cpy_tensor_async, // only needed for multi-GPU setups + /* .synchronize = */ ggml_backend_metal_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, + + // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal + // in any case, these docs seem relevant if we ever decide to implement it: + // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events /* .event_record = */ NULL, /* .event_wait = */ NULL, /* .optimize_graph = */ NULL, @@ -6376,7 +6728,7 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g props->type = ggml_backend_metal_device_get_type(dev); ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = (struct ggml_backend_dev_caps) { - /* .async = */ false, + /* .async = */ true, /* .host_buffer = */ false, /* .buffer_from_host_ptr = */ true, /* .events = */ false, @@ -6407,17 +6759,19 @@ static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, con } static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_metal_buffer_type(); + struct ggml_backend_metal_device_context * ctx_dev = dev->context; - GGML_UNUSED(dev); + return ctx_dev->use_shared_buffers ? 
ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private(); } -static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { +static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = ptr; ctx->all_size = size; - ctx->owned = false; + + ctx->is_shared = true; + ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); @@ -6440,6 +6794,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; + ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; + // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { ctx->buffers[ctx->n_buffers].data = ptr; @@ -6497,7 +6854,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back return NULL; } - return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size); + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, ctx, size); } static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { @@ -6508,14 +6865,30 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { return - buft->iface.get_name == ggml_backend_metal_buffer_type_get_name || - buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name; + buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name || + buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name; GGML_UNUSED(dev); } +static int64_t get_op_batch_size(const struct ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - return false; + const int min_batch_size = 32; + + return (op->op == GGML_OP_MUL_MAT || + op->op == GGML_OP_MUL_MAT_ID) && + get_op_batch_size(op) >= min_batch_size; GGML_UNUSED(dev); GGML_UNUSED(op); @@ -6530,7 +6903,7 @@ static struct ggml_backend_device_i ggml_backend_metal_device_i = { /* .init_backend = */ ggml_backend_metal_device_init, /* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type, /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_from_ptr, + /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped, /* .supports_op = */ ggml_backend_metal_device_supports_op, /* .supports_buft = */ ggml_backend_metal_device_supports_buft, /* .offload_op = */ ggml_backend_metal_device_offload_op, diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 77be3c5c9d..157d0cc6d0 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5571,38 +5571,6 @@ kernel void kernel_flash_attn_ext_vec_reduce( #undef DV } -template -kernel void kernel_set( - constant ggml_metal_kargs_set & 
args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { - const int i13 = tgpig[2]; - const int i12 = tgpig[1]; - const int i11 = tgpig[0]; - - const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; - - const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); - const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); - const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; - - device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); - - for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { - device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); - dst_data[i10] = (T) src[0]; - } -} - -typedef decltype(kernel_set) kernel_set_t; - -template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; -template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; - template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, From 9de447d94e1ae9d1a36e5a2e5bf47483352c0d9c Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 10 Sep 2025 17:31:40 +0200 Subject: [PATCH 23/26] ggml-cpu : fix padding in ggml_timestep_embedding (#15917) This commit fixes the zero padding for odd dimensions in ggml_compute_forward_timestep_embedding_f32. The motivation for this is that currently if an odd dimension is used, the padding check incorrectly uses the dimension value for indexing. For example, with dim=15: Elements 0-6 are set to cosine values Elements 7-13 are set to sine values Element 14 is left uninitialized (contains garbage) Element 15 is correctly set to zero This fix changes embed_data[dim] to embed_data[2 * half] so that element 14 (the first unused element) is properly set to zero as well as the last element. 
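For the indexing above, a minimal standalone C++ sketch (dim = 15 so half = 7; the sentinel value and the frequency formula are illustrative only, not the actual ggml kernel):

#include <math.h>
#include <stdio.h>

int main() {
    const int   dim      = 15;      // odd dimension from the example above
    const int   half     = dim / 2; // 7
    const float timestep = 1.0f;

    float embed_data[dim + 1];
    for (int i = 0; i <= dim; ++i) {
        embed_data[i] = -1.0f;      // stand-in for "uninitialized garbage"
    }

    for (int j = 0; j < half; ++j) {
        // frequency term; the exact formula does not matter for the padding issue
        const float arg = timestep * expf(-logf(10000.0f) * (float) j / (float) half);
        embed_data[j]        = cosf(arg); // elements 0..6
        embed_data[j + half] = sinf(arg); // elements 7..13
    }

    // before the fix only embed_data[dim] (element 15) was zeroed;
    // embed_data[2 * half] (element 14) kept whatever was in memory
    embed_data[2 * half] = 0.0f;
    embed_data[dim]      = 0.0f;

    printf("element 14 = %f, element 15 = %f\n", embed_data[14], embed_data[15]);
    return 0;
}

Removing the embed_data[2 * half] line leaves element 14 at the sentinel, which is exactly the garbage value described above.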
Resolves: https://github.com/ggml-org/ggml/issues/1324 --- ggml/src/ggml-cpu/ops.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9adf910769..212e52ef6a 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8598,6 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32( embed_data[j + half] = sinf(arg); } if (dim % 2 != 0 && ith == 0) { + embed_data[2 * half] = 0.f; embed_data[dim] = 0.f; } } From 6ab397e12ba8e9f776341cdae68f7ffb2f8d2cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 10 Sep 2025 19:08:59 +0200 Subject: [PATCH 24/26] graph : support non-contiguous Q in build_attn_mha (#15908) * support non-contiguous Q in build_attn_mha * Update src/llama-graph.cpp ggml-ci Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- src/llama-graph.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7f254b25cd..ddc772b179 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( // split the batch into streams if needed const auto n_stream = k->ne[3]; - q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream); + q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0); q = ggml_permute(ctx0, q, 0, 2, 1, 3); k = ggml_permute(ctx0, k, 0, 2, 1, 3); From 4f658855fa8f2e42b7ed9a5b298fa39a2e39b096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Thu, 11 Sep 2025 02:51:51 +0800 Subject: [PATCH 25/26] llama : support T5 models with unequal number of encoder-decoder layers (#15909) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Extend the support of T5 models with different encoder-decoder layers Signed-off-by: Jie Fu * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update gguf-py/gguf/constants.py Co-authored-by: Sigbjørn Skjæret * Update gguf-py/gguf/gguf_writer.py Co-authored-by: Sigbjørn Skjæret * Update src/llama-arch.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-arch.h Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-hparams.h Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret * Rename n_dec_layer --> dec_n_layer Signed-off-by: Jie Fu * Adapt to 
cases when dec_n_layer > n_layer Signed-off-by: Jie Fu --------- Signed-off-by: Jie Fu Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 2 ++ gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 3 +++ src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-hparams.h | 1 + src/llama-model.cpp | 26 ++++++++++++++++++++++---- 7 files changed, 31 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 62a546ee22..bbc21813f8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6701,6 +6701,8 @@ class T5Model(TextModel): self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) self.gguf_writer.add_block_count(self.hparams["num_layers"]) + if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: + self.gguf_writer.add_decoder_block_count(dec_n_layer) self.gguf_writer.add_head_count(self.hparams["num_heads"]) self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b8ac394580..1e88b6505b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -109,6 +109,7 @@ class Keys: POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + DECODER_BLOCK_COUNT = "{arch}.decoder_block_count" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" SWIN_NORM = "{arch}.swin_norm" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a6cc8a931e..7ff12f7f57 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -676,6 +676,9 @@ class GGUFWriter: def add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) + def add_decoder_block_count(self, value: int) -> None: + self.add_uint32(Keys.LLM.DECODER_BLOCK_COUNT.format(arch=self.arch), value) + def add_embedding_length_per_layer_input(self, value: int) -> None: self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 77b2fecf18..81f9746818 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -137,6 +137,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, + { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_SWIN_NORM, "%s.swin_norm" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 21ab47bd7a..6ee3707dcf 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -141,6 +141,7 @@ enum llm_kv { LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, + LLM_KV_DECODER_BLOCK_COUNT, LLM_KV_ATTN_LOGIT_SOFTCAPPING, LLM_KV_FINAL_LOGIT_SOFTCAPPING, LLM_KV_SWIN_NORM, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 89f5c7ab65..4dca2ca41d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -159,6 +159,7 @@ struct llama_hparams { // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; + uint32_t dec_n_layer = 0; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b9e4634a70..818b209641 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1542,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.dec_start_token_id = dec_start_token_id; } + hparams.dec_n_layer = hparams.n_layer; + ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false); + switch (hparams.n_layer) { case 6: type = LLM_TYPE_60M; break; // t5-small case 8: type = LLM_TYPE_80M; break; // flan-t5-small @@ -4414,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } + // n_layer: number of encoder_layers + // dec_n_layer: number of decoder_layers + const int dec_n_layer = hparams.dec_n_layer; + if (dec_n_layer > n_layer) { + layers.resize(dec_n_layer); + } + + // load encoder layers for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -4429,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + + // load decoder layers + for (int i = 0; i < dec_n_layer; ++i) { + auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0); layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED); @@ -13509,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context { ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + const int64_t dec_n_layer = hparams.dec_n_layer; + + for (int il = 0; il < dec_n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -13600,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context { //cb(cur, "kqv_out", il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == dec_n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); } @@ -13621,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context { model.layers[il].ffn_gate, NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, - model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU, - model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ, + model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU, + model.layers[il].ffn_gate ? 
                                   LLM_FFN_PAR : LLM_FFN_SEQ, il);
         cb(cur, "ffn_out", il);
     }

From 00681dfc16ba4cebb9c7fbd2cf2656e06a0692a4 Mon Sep 17 00:00:00 2001
From: Oliver Simons
Date: Wed, 10 Sep 2025 22:04:03 +0200
Subject: [PATCH 26/26] CUDA: Add `fastdiv` to `k_bin_bcast*`, giving 1-3% E2E
 performance (#15872)

* Add fastdiv and fastmodulo to k_bin_bcast kernel

* Address review comments

* `prod_` instead of `prod` suffix

* Add test case for `k_bin_bcast_unravel` in CUDA backend
---
 ggml/src/ggml-cuda/binbcast.cu | 182 +++++++++++++++++++--------------
 tests/test-backend-ops.cpp    |   3 +
 2 files changed, 111 insertions(+), 74 deletions(-)

diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 1c76566344..725e1a81a1 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -23,28 +23,44 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
     return a / b;
 }
 
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t,
+          typename... src1_ptrs>
+static __global__ void k_bin_bcast(const src0_t * src0,
+                                   const src1_t * src1,
+                                   dst_t * dst,
+                                   const int ne0,
+                                   const int ne1,
+                                   const int ne2,
+                                   const uint3 ne3,
+                                   const uint3 ne10,
+                                   const uint3 ne11,
+                                   const uint3 ne12,
+                                   const uint3 ne13,
+                                   /*int s0, */ const int s1,
+                                   const int s2,
+                                   const int s3,
+                                   /*int s00,*/ const int s01,
+                                   const int s02,
+                                   const int s03,
+                                   /*int s10,*/ const int s11,
+                                   const int s12,
+                                   const int s13,
+                                   src1_ptrs... src1s) {
+    const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
+    const uint32_t i1  = (blockDim.y * blockIdx.y + threadIdx.y);
+    const uint32_t i2  = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
+    const uint32_t i3  = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
 
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, typename... src1_ptrs>
-static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-                                   const int ne0, const int ne1, const int ne2, const int ne3,
-                                   const int ne10, const int ne11, const int ne12, const int ne13,
-                                   /*int s0, */ const int s1, const int s2, const int s3,
-                                   /*int s00,*/ const int s01, const int s02, const int s03,
-                                   /*int s10,*/ const int s11, const int s12, const int s13,
-                                   src1_ptrs... src1s) {
-    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
-    const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
-    const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
         return;
     }
 
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
+    const uint32_t i11 = fastmodulo(i1, ne11);
+    const uint32_t i12 = fastmodulo(i2, ne12);
+    const uint32_t i13 = fastmodulo(i3, ne13);
 
     const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
     const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
@@ -53,8 +69,8 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
     const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
     dst_t * dst_row = dst + i_dst;
 
-    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
-        const int i10 = i0 % ne10;
+    for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
+        const uint32_t i10 = fastmodulo(i0, ne10);
         float result = src0_row ? (float) src0_row[i0] : 0.0f;
 
         if constexpr (sizeof...(src1_ptrs) > 0) {
@@ -67,28 +83,48 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
     }
 }
 
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, typename... src1_ptrs>
-static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-                                           const int ne0, const int ne1, const int ne2,const int ne3,
-                                           const int ne10, const int ne11, const int ne12, const int ne13,
-                                           /*int s0, */ const int s1, const int s2, const int s3,
-                                           /*int s00,*/ const int s01, const int s02, const int s03,
-                                           /*int s10,*/ const int s11, const int s12, const int s13,
-                                           src1_ptrs ... src1s) {
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t,
+          typename... src1_ptrs>
+static __global__ void k_bin_bcast_unravel(const src0_t * src0,
+                                           const src1_t * src1,
+                                           dst_t * dst,
+                                           const uint3 ne0,
+                                           const uint3 ne1,
+                                           const uint3 ne2,
+                                           const uint32_t ne3,
+                                           const uint3 prod_012,
+                                           const uint3 prod_01,
+                                           const uint3 ne10,
+                                           const uint3 ne11,
+                                           const uint3 ne12,
+                                           const uint3 ne13,
+                                           /*int s0, */ const int s1,
+                                           const int s2,
+                                           const int s3,
+                                           /*int s00,*/ const int s01,
+                                           const int s02,
+                                           const int s03,
+                                           /*int s10,*/ const int s11,
+                                           const int s12,
+                                           const int s13,
+                                           src1_ptrs... src1s) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
+    const uint32_t i3 = fastdiv(i, prod_012);
+    const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
+    const uint32_t i1 = fastdiv(i - i3 * prod_012.z - i2 * prod_01.z, ne0);
+    const uint32_t i0 = i - i3 * prod_012.z - i2 * prod_01.z - i1 * ne0.z;
 
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+    if (i0 >= ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) {
         return;
     }
 
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
+    const int i11 = fastmodulo(i1, ne11);
+    const int i12 = fastmodulo(i2, ne12);
+    const int i13 = fastmodulo(i3, ne13);
 
     const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
     const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
@@ -97,7 +133,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t *
     const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
     dst_t * dst_row = dst + i_dst;
 
-    const int i10 = i0 % ne10;
+    const int i10 = fastmodulo(i0, ne10);
     float result = src0_row ? (float) src0_row[i0] : 0.0f;
 
     if constexpr (sizeof...(src1_ptrs) > 0) {
@@ -170,11 +206,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
     //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
     //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
 
-    int64_t ne10 = cne1[0];
-    int64_t ne11 = cne1[1];
-    int64_t ne12 = cne1[2];
-    int64_t ne13 = cne1[3];
-
     size_t nb0 = cnb[0];
     size_t nb1 = cnb[1];
     size_t nb2 = cnb[2];
@@ -233,48 +264,51 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
         block_dims.y = std::min(ne1, block_size / block_dims.x);
         block_dims.z = std::min(std::min(ne2 * ne3, block_size / block_dims.x / block_dims.y), 64U);
 
-        dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x,
-                        (ne1 + block_dims.y - 1) / block_dims.y,
+        dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, (ne1 + block_dims.y - 1) / block_dims.y,
                         (ne2 * ne3 + block_dims.z - 1) / block_dims.z);
 
+        const uint3 ne10 = init_fastdiv_values((uint32_t) cne1[0]);
+        const uint3 ne11 = init_fastdiv_values((uint32_t) cne1[1]);
+        const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
+        const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
+
         if (block_nums.z > 65535) {
-            int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+            int         block_num   = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+            const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
+            const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
+            const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
+            const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
+            const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
+
             if constexpr (sizeof...(I) > 0) {
-                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
-                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd,
-                                                           ne0, ne1, ne2, ne3,
-                                                           ne10, ne11, ne12, ne13,
-                                                           /* s0, */ s1, s2, s3,
-                                                           /* s00,*/ s01, s02, s03,
-                                                           /* s10,*/ s11, s12,s13,
-                                                           (const src1_t *) dst->src[I + 1]->data...);
+                k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
+                    ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
             } else {
                 k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
-                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd,
-                                                           ne0, ne1, ne2, ne3,
-                                                           ne10, ne11, ne12, ne13,
-                                                           /* s0, */ s1, s2, s3,
-                                                           /* s00,*/ s01, s02, s03,
-                                                           /* s10,*/ s11, s12,s13);
+                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
                                                           ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
+                                                           /* s0, */ s1, s2, s3,
+                                                           /* s00,*/ s01, s02, s03,
+                                                           /* s10,*/ s11, s12, s13);
             }
         } else {
+            const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
             if constexpr (sizeof...(I) > 0) {
-                k_bin_bcast<bin_op, src0_t, src1_t, dst_t>
-                    <<<block_nums, block_dims, 0, stream>>>(src0_dd, src1_dd, dst_dd,
-                                                            ne0, ne1, ne2, ne3,
-                                                            ne10, ne11, ne12, ne13,
-                                                            /* s0, */ s1, s2, s3,
-                                                            /* s00,*/ s01, s02, s03,
-                                                            /* s10,*/ s11, s12,s13,
-                                                            (const src1_t *) dst->src[I + 1]->data...);
+                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
            } else {
-                k_bin_bcast<bin_op, src0_t, src1_t, dst_t>
-                    <<<block_nums, block_dims, 0, stream>>>(src0_dd, src1_dd, dst_dd,
-                                                            ne0, ne1, ne2, ne3,
-                                                            ne10, ne11, ne12, ne13,
-                                                            /* s0, */ s1, s2, s3,
-                                                            /* s00,*/ s01, s02, s03,
-                                                            /* s10,*/ s11, s12,s13);
+                k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
+                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
+                    /* s0, */ s1, s2, s3,
+                    /* s00,*/ s01, s02, s03,
+                    /* s10,*/ s11, s12, s13);
            }
         }
     }
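Note: `init_fastdiv_values`, `fastdiv` and `fastmodulo` are not introduced by this diff; they are defined in ggml/src/ggml-cuda/common.cuh. For readers of the hunks above, here is a minimal sketch of the magic-number scheme those helpers are assumed to implement (the (mp, L, d) packing of the uint3 is an illustrative assumption, not a quote of the header):

    #include <cstdint>  // uint32_t, uint64_t

    // Host-side precomputation: choose shift L and magic multiplier mp such that
    // n / d == (__umulhi(n, mp) + n) >> L for all 32-bit n (round-up variant of
    // Granlund-Montgomery division by an invariant integer).
    static uint3 init_fastdiv_values(uint32_t d) {
        uint32_t L = 0;
        while (L < 32 && (uint32_t{1} << L) < d) {
            ++L;
        }
        const uint32_t mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
        return make_uint3(mp, L, d);  // divisor d rides along in .z
    }

    // Device side: one high multiply plus an add and a shift replaces the
    // hardware integer divide.
    static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 div) {
        return (__umulhi(n, div.x) + n) >> div.y;
    }

    // The remainder reuses the fast quotient instead of a second '%'.
    static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 div) {
        return n - fastdiv(n, div) * div.z;
    }

With this scheme, each `/` and `%` against a tensor extent in the kernels above costs a multiply-high, an add and a shift per thread, with the divisor-dependent work hoisted to the host at launch time, consistent with the 1-3% end-to-end gain quoted in the subject.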
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 4a882ab072..b54a1a4e82 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6050,6 +6050,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2});
             add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2});
 
+            // test case for k_bin_bcast_unravel in CUDA backend
+            add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});
+
             // stable diffusion
             add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1});
             add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1});
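The added shape is chosen to push the launch grid past the 65535 limit on gridDim.z checked in launch_bin_bcast_pack above, so the `k_bin_bcast_unravel` path is actually exercised. A rough sanity check, assuming the launcher's usual 128-thread block size and that block_dims.x saturates it (values not visible in these hunks):

    // dst comes out as 256 x 1 x 65536 x 1 (test_bin_bcast multiplies ne by nr per dim)
    // block_dims.x == 128 consumes the whole thread budget -> block_dims.z == 1
    // block_nums.z == (ne2 * ne3 + 1 - 1) / 1 == 65536 > 65535 -> unravel kernel

Without a case like this, the fastdiv changes to `k_bin_bcast_unravel` would only be reached by tensors large enough to overflow gridDim.z, which none of the pre-existing broadcast tests do.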