From 2d6c00a9b8bb4d72f4f43c6521ff1061088e2c2c Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Tue, 30 Dec 2025 13:04:53 +0100 Subject: [PATCH 01/13] kleidiai: add and integrate SVE 256-bit vector-length kernel (#18458) * kleidiai: add and integrate SVE 256-bit vector-length kernel * updated for review comments --- ggml/src/CMakeLists.txt | 4 +- ggml/src/ggml-cpu/CMakeLists.txt | 14 ++- ggml/src/ggml-cpu/kleidiai/kernels.cpp | 111 ++++++++++++++++++------ ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 23 +++-- 4 files changed, 115 insertions(+), 37 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 25f25c4236..6192a87046 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -401,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS) ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC) ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8) ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2) - ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME) - ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME) + ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME) + ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME) elseif (APPLE) ggml_add_cpu_backend_variant(apple_m1 DOTPROD) ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 28fb7612e5..7622d0bf49 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # Fetch KleidiAI sources: include(FetchContent) - set(KLEIDIAI_COMMIT_TAG "v1.14.0") + set(KLEIDIAI_COMMIT_TAG "v1.16.0") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") - set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc") + set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321") if (POLICY CMP0135) cmake_policy(SET CMP0135 NEW) @@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED) string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED) + string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED) set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP}) @@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name) set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2") endif() + if (NOT SVE_ENABLED MATCHES -1) + list(APPEND GGML_KLEIDIAI_SOURCES + ${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S + ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c) + endif() + set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}") list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES}) endif() diff --git a/ggml/src/ggml-cpu/kleidiai/kernels.cpp b/ggml/src/ggml-cpu/kleidiai/kernels.cpp index 55a00f008a..d114f2d49b 100644 --- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp @@ -18,6 +18,8 @@ #include "kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x4_qsi8cxp4x4_16x4_neon_dotprod.h" #include "kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h" +#include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.h" +#include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.h" #include "kai_lhs_pack_bf16p2vlx2_f32_sme.h" #include "kai_lhs_quant_pack_qsi8d32p_f32.h" @@ -69,9 +71,9 @@ static inline void kernel_run_fn10(size_t m, size_t n, size_t k, size_t /*bl*/, template static inline void kernel_run_float_fn10(size_t m, size_t n, size_t k, size_t /*bl*/, - const void* lhs, const void* rhs, void* dst, - size_t dst_stride_row, size_t dst_stride_col, - float clamp_min, float clamp_max) { + const void* lhs, const void* rhs, void* dst, + size_t dst_stride_row, size_t dst_stride_col, + float clamp_min, float clamp_max) { Fn(m, n, k, lhs, rhs, static_cast(dst), dst_stride_row, dst_stride_col, clamp_min, clamp_max); } @@ -152,8 +154,8 @@ static inline void rhs_pack_fn12(size_t num_groups, size_t n, size_t k, size_t n template static inline void rhs_pack_scale_fn12(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t /*bl*/, - size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale, - void* rhs_packed, size_t extra_bytes, const void* params) { + size_t /*rhs_stride*/, const void* rhs, const void* bias, const void* scale, + void* rhs_packed, size_t extra_bytes, const void* params) { Fn(num_groups, n, k, nr, kr, sr, static_cast(rhs), static_cast(bias), @@ -524,6 +526,61 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { }, #endif #else +#if defined(__ARM_FEATURE_SVE) + { + /* SVE i8mm GEMM */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm, + /* .get_lhs_offset_ex = */ &kernel_offs_fn3, + /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3, + /* .run_kernel_ex = */ &kernel_run_fn11, + }, + /* .gemm_lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p4x8sb_f32_neon, + /* .get_packed_offset_ex = */ &lhs_offs_fn6, + /* .packed_size_ex = */ &lhs_ps_fn6, + /* .pack_func_ex = */ &lhs_pack_float_fn10, + }, + /* SVE dotprod GEMV */ + /* .kern_info = */ { + /* .get_m_step = */ kai_get_m_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_n_step = */ kai_get_n_step_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_mr = */ kai_get_mr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_nr = */ kai_get_nr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_kr = */ kai_get_kr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_sr = */ kai_get_sr_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_dst_offset = */ kai_get_dst_offset_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_dst_size = */ kai_get_dst_size_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod, + /* .get_lhs_offset_ex = */ &kernel_offs_fn3, + /* .get_rhs_packed_offset_ex = */ &kernel_offs_fn3, + /* .run_kernel_ex = */ &kernel_run_fn11, + }, + /* .gemv_lhs_info = */ { + /* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32, + /* .get_packed_offset_ex = */ &lhs_offs_fn6, + /* .packed_size_ex = */ &lhs_ps_fn6, + /* .pack_func_ex = */ &lhs_pack_float_fn10, + }, + /* .rhs_info = */ { + /* .packed_stride = */ kai_get_rhs_packed_stride_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0, + /* .to_float = */ dequantize_row_qsi4c32pscalef16, + /* .packed_size_ex = */ &rhs_ps_fn5, + /* .packed_stride_ex = */ &rhs_stride_fn4, + /* .pack_func_ex = */ &rhs_pack_fn12, + }, + /* .required_cpu = */ CPU_FEATURE_SVE | CPU_FEATURE_I8MM | CPU_FEATURE_DOTPROD, + /* .lhs_type = */ GGML_TYPE_F32, + /* .rhs_type = */ GGML_TYPE_Q4_0, + /* .op_type = */ GGML_TYPE_F32, + }, +#endif #if defined(__ARM_FEATURE_MATMUL_INT8) { /* i8mm GEMM */ @@ -578,7 +635,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = { /* .rhs_type = */ GGML_TYPE_Q4_0, /* .op_type = */ GGML_TYPE_F32, }, -#endif +#endif // __ARM_FEATURE_MATMUL_INT8 #if defined(__ARM_FEATURE_DOTPROD) { /* DOTPROD GEMM */ @@ -811,26 +868,27 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c ggml_kleidiai_kernels * kernel = nullptr; if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) { -#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) { - if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu && - gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type && - gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type && - gemm_gemv_kernels[i].op_type == tensor->type) { - kernel = &gemm_gemv_kernels[i]; - break; - } - } - if (!kernel) { - for (size_t i = 0; i < NELEMS(gemm_gemv_kernels_q8) - 1; ++i) { - if ((cpu_features & gemm_gemv_kernels_q8[i].required_cpu) == gemm_gemv_kernels_q8[i].required_cpu && - gemm_gemv_kernels_q8[i].lhs_type == tensor->src[1]->type && - gemm_gemv_kernels_q8[i].rhs_type == tensor->src[0]->type && - gemm_gemv_kernels_q8[i].op_type == tensor->type) { - kernel = &gemm_gemv_kernels_q8[i]; - break; +#if defined(__ARM_FEATURE_SME) || \ + defined(__ARM_FEATURE_DOTPROD) || \ + defined(__ARM_FEATURE_MATMUL_INT8) || \ + defined(__ARM_FEATURE_SVE) + auto try_table = [&](auto & table) { + for (size_t i = 0; i < NELEMS(table) - 1; ++i) { + if ((cpu_features & table[i].required_cpu) == table[i].required_cpu && + table[i].lhs_type == tensor->src[1]->type && + table[i].rhs_type == tensor->src[0]->type && + table[i].op_type == tensor->type) { + kernel = &table[i]; + return true; } } + return false; + }; + + if (tensor->src[0]->type == GGML_TYPE_Q8_0) { + try_table(gemm_gemv_kernels_q8); + } else { + try_table(gemm_gemv_kernels); } #else GGML_UNUSED(gemm_gemv_kernels); @@ -845,7 +903,10 @@ ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, c ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) { ggml_kleidiai_kernels * kernels = nullptr; -#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SME) || \ + defined(__ARM_FEATURE_DOTPROD) || \ + defined(__ARM_FEATURE_MATMUL_INT8) || \ + defined(__ARM_FEATURE_SVE) for (size_t i = 0; i < NELEMS(gemm_gemv_kernels) - 1; ++i) { if ((features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu) { kernels = &gemm_gemv_kernels[i]; diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 6f2a90fbda..ad23e73184 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -46,13 +46,20 @@ struct ggml_kleidiai_context { } static ctx = { CPU_FEATURE_NONE, NULL, NULL }; static const char* cpu_feature_to_string(cpu_feature f) { - switch (f) { - case CPU_FEATURE_NONE: return "NONE"; - case CPU_FEATURE_DOTPROD: return "DOTPROD"; - case CPU_FEATURE_I8MM: return "I8MM"; - case CPU_FEATURE_SVE: return "SVE"; - case CPU_FEATURE_SME: return "SME"; - default: return "UNKNOWN"; + if (f == CPU_FEATURE_NONE) { + return "NONE"; + } else if ((f & CPU_FEATURE_SME) == CPU_FEATURE_SME) { + return "SME"; + } else if ((f & CPU_FEATURE_SVE) == CPU_FEATURE_SVE) { + return "SVE"; + } + else if ((f & CPU_FEATURE_I8MM) == CPU_FEATURE_I8MM) { + return "I8MM"; + } else if ((f & CPU_FEATURE_DOTPROD) == CPU_FEATURE_DOTPROD) { + return "DOTPROD"; + } + else { + return "UNKNOWN"; } } @@ -68,7 +75,7 @@ static void init_kleidiai_context(void) { ctx.features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) | (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) | - (ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE); + ((ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) ? CPU_FEATURE_SVE : CPU_FEATURE_NONE); if (env_var) { sme_enabled = atoi(env_var); From f14f4e421b2177fadcf9d15ebccb0492e5464d86 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 30 Dec 2025 06:11:13 -0600 Subject: [PATCH 02/13] server: fix files built redundantly (#18474) --- tools/server/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index ae1a497be6..a39b4c5b35 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -38,14 +38,6 @@ set(TARGET_SRCS server-http.h server-models.cpp server-models.h - server-task.cpp - server-task.h - server-queue.cpp - server-queue.h - server-common.cpp - server-common.h - server-context.cpp - server-context.h ) set(PUBLIC_ASSETS index.html.gz From c32fa21db8a631e9127e55f69a3d2bdaa9f71824 Mon Sep 17 00:00:00 2001 From: Jay Zenith <162098309+JayZenith@users.noreply.github.com> Date: Tue, 30 Dec 2025 06:27:49 -0800 Subject: [PATCH 03/13] sampling: reuse token data buffer in llama_sampler_sample (#18365) * sampling: reuse token data buffer in llama_sampler_sample * move cur buffer before timing section, after samplers * minor : fix build --------- Co-authored-by: Georgi Gerganov --- src/llama-sampling.cpp | 77 ++++++++++++++++++++++++------------------ src/llama-sampling.h | 3 ++ 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d96f619ae1..f3891453e4 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -421,39 +421,6 @@ void llama_sampler_free(struct llama_sampler * smpl) { delete smpl; } -llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { - const auto * logits = llama_get_logits_ith(ctx, idx); - - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_vocab = llama_vocab_n_tokens(vocab); - - // TODO: do not allocate each time - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { - /* .data = */ cur.data(), - /* .size = */ cur.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(smpl, &cur_p); - - GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); - - auto token = cur_p.data[cur_p.selected].id; - - llama_sampler_accept(smpl, token); - - return token; -} - // sampler chain static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { @@ -527,12 +494,56 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param /* .ctx = */ new llama_sampler_chain { /* .params = */ params, /* .samplers = */ {}, + /* .cur = */ {}, /* .t_sample_us = */ 0, /* .n_sample = */ 0, } ); } +llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { + const auto * logits = llama_get_logits_ith(ctx, idx); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_vocab = llama_vocab_n_tokens(vocab); + + // use pre-allocated buffer from chain if available, otherwise allocate locally + std::vector * cur_ptr; + std::vector cur_local; + + if (smpl->iface == &llama_sampler_chain_i) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + cur_ptr = &chain->cur; + } else { + cur_ptr = &cur_local; + } + + auto & cur = *cur_ptr; + cur.resize(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; +} + void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { auto * p = (llama_sampler_chain *) chain->ctx; p->samplers.push_back(smpl); diff --git a/src/llama-sampling.h b/src/llama-sampling.h index 759dd7dcb7..1e3de4e2ec 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -16,6 +16,9 @@ struct llama_sampler_chain { std::vector samplers; + // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations + std::vector cur; + // timing mutable int64_t t_sample_us; From cd78e57c3aeae7b56c5843f94e0e0b83a3d8ca81 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 30 Dec 2025 15:53:12 +0100 Subject: [PATCH 04/13] lora: count lora nodes in graph_max_nodes (#18469) * lora: count lora nodes in graph_max_nodes * 3 nodes per weight * 4 nodes * keep track n_lora_nodes from llama_model * fix assert * rm redundant header * common: load adapters before context creation * use 6 nodes --- common/common.cpp | 37 +++++++++++++++++++------------------ include/llama.h | 2 ++ src/llama-adapter.cpp | 15 ++++++++++++--- src/llama-adapter.h | 8 +++++++- src/llama-context.cpp | 4 +++- src/llama-model.h | 3 +++ 6 files changed, 46 insertions(+), 23 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 58fef59546..79c4756125 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1109,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) : const llama_vocab * vocab = llama_model_get_vocab(model); + // load and optionally apply lora adapters (must be loaded before context creation) + for (auto & la : params.lora_adapters) { + llama_adapter_lora_ptr lora; + lora.reset(llama_adapter_lora_init(model, la.path.c_str())); + if (lora == nullptr) { + LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str()); + pimpl->model.reset(model); + return; + } + + char buf[1024]; + la.ptr = lora.get(); + llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); + la.task_name = buf; + llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); + la.prompt_prefix = buf; + pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters + } + // updates params.sampling // TODO: fix naming common_init_sampler_from_model(model, params.sampling); @@ -1245,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) { } } - // load and optionally apply lora adapters - for (auto & la : params.lora_adapters) { - llama_adapter_lora_ptr lora; - lora.reset(llama_adapter_lora_init(model, la.path.c_str())); - if (lora == nullptr) { - LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - return res; - } - - char buf[1024]; - la.ptr = lora.get(); - llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf)); - la.task_name = buf; - llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); - la.prompt_prefix = buf; - res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters - } - if (!params.lora_init_without_apply) { common_set_adapter_lora(lctx, params.lora_adapters); } diff --git a/include/llama.h b/include/llama.h index 4f0124fdc8..8b3c8a7b10 100644 --- a/include/llama.h +++ b/include/llama.h @@ -607,6 +607,8 @@ extern "C" { // // Load a LoRA adapter from file + // The adapter is valid as long as the associated model is not freed + // All adapters must be loaded before context creation LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( struct llama_model * model, const char * path_lora); diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index d8eef75a7a..bdc24c2d6b 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) { return nullptr; } -static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) { +static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + llama_model & model = adapter.model; + ggml_context * ctx_init; gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, @@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } } + // update number of nodes used + model.n_lora_nodes += adapter.get_n_nodes(); + LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) { - llama_adapter_lora * adapter = new llama_adapter_lora(); + llama_adapter_lora * adapter = new llama_adapter_lora(*model); try { - llama_adapter_lora_init_impl(*model, path_lora, *adapter); + llama_adapter_lora_init_impl(path_lora, *adapter); return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); @@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, } void llama_adapter_lora_free(llama_adapter_lora * adapter) { + // update number of nodes used + GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes()); + adapter->model.n_lora_nodes -= adapter->get_n_nodes(); + delete adapter; } diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 4f65247c0f..42d64a6e0b 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -59,6 +59,8 @@ struct llama_adapter_lora_weight { }; struct llama_adapter_lora { + llama_model & model; + // map tensor name to lora_a_b std::unordered_map ab_map; @@ -73,10 +75,14 @@ struct llama_adapter_lora { // activated lora (aLoRA) std::vector alora_invocation_tokens; - llama_adapter_lora() = default; + llama_adapter_lora(llama_model & model) : model(model) {} ~llama_adapter_lora() = default; llama_adapter_lora_weight * get_weight(ggml_tensor * w); + + uint32_t get_n_nodes() const { + return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat + } }; using llama_adapter_loras = std::unordered_map; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1c530fdc91..34dfcd4724 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1442,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { if (model.arch == LLM_ARCH_QWEN3NEXT) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } - return std::max(1024u, 8u*model.n_tensors()); + uint32_t res = std::max(1024u, 8u*model.n_tensors()); + res += model.n_lora_nodes; + return res; } llm_graph_result * llama_context::get_gf_res_reserve() const { diff --git a/src/llama-model.h b/src/llama-model.h index dbe5edc153..f4f44a92b6 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -475,6 +475,9 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + // for keeping track of extra nodes used by lora adapters + uint32_t n_lora_nodes = 0; + int64_t t_load_us = 0; int64_t t_start_us = 0; From ac1d0eb7bf8c59b81a2cceb4a8dac1f44d201a3f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 30 Dec 2025 17:20:14 +0100 Subject: [PATCH 05/13] llama : fix typo in comment in llama-kv-cache.h [no ci] (#18489) --- src/llama-kv-cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 1868f11857..0c4ed64845 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -305,7 +305,7 @@ public: bool do_shift, stream_copy_info sc_info); - // used to create a batch procesing context from a batch + // used to create a batch processing context from a batch llama_kv_cache_context( llama_kv_cache * kv, slot_info_vec_t sinfos, From 0f89d2ecf14270f45f43c442e90ae433fd82dab1 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Tue, 30 Dec 2025 12:00:57 -0600 Subject: [PATCH 06/13] common : default content to an empty string (#18485) * common : default content to an empty string * common : fix tests that break when content != null --- common/chat.cpp | 2 +- models/templates/llama-cpp-deepseek-r1.jinja | 6 +++--- tests/test-chat.cpp | 8 +++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 0a426f4478..be44c8abb0 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -319,7 +319,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector & msg } } } else { - jmsg["content"] = json(); // null + jmsg["content"] = ""; } if (!msg.reasoning_content.empty()) { jmsg["reasoning_content"] = msg.reasoning_content; diff --git a/models/templates/llama-cpp-deepseek-r1.jinja b/models/templates/llama-cpp-deepseek-r1.jinja index fcb1732eb8..0d18870870 100644 --- a/models/templates/llama-cpp-deepseek-r1.jinja +++ b/models/templates/llama-cpp-deepseek-r1.jinja @@ -38,7 +38,7 @@ Example function tool call syntax: {%- if message['role'] == 'user' -%} {{- '<|User|>' + message['content'] + '<|end▁of▁sentence|>' -}} {%- endif -%} - {%- if message['role'] == 'assistant' and message['content'] is none -%} + {%- if message['role'] == 'assistant' and not message['content'] -%} {{- '<|Assistant|><|tool▁calls▁begin|>' -}} {%- set ns.is_first = true -%} {%- for tc in message['tool_calls'] -%} @@ -53,7 +53,7 @@ Example function tool call syntax: {%- endfor -%} {{- '<|tool▁calls▁end|><|end▁of▁sentence|>' -}} {%- endif -%} - {%- if message['role'] == 'assistant' and message['content'] is not none -%} + {%- if message['role'] == 'assistant' and message['content'] -%} {{- flush_tool_outputs() -}} {%- set content = message['content'] -%} {%- if '' in content -%} @@ -73,4 +73,4 @@ Example function tool call syntax: {{- flush_tool_outputs() -}} {%- if add_generation_prompt and not ns.is_tool_outputs -%} {{- '<|Assistant|>\n' -}} -{%- endif -%} \ No newline at end of file +{%- endif -%} diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 02af5251cc..a78627604e 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -650,7 +650,7 @@ static void test_msgs_oaicompat_json_conversion() { "[\n" " {\n" " \"role\": \"assistant\",\n" - " \"content\": null,\n" + " \"content\": \"\",\n" " \"tool_calls\": [\n" " {\n" " \"type\": \"function\",\n" @@ -906,7 +906,8 @@ static void test_template_output_parsers() { " },\n" " \"id\": \"123456789\"\n" " }\n" - " ]\n" + " ],\n" + " \"content\": \"\"\n" "}"); } { @@ -1713,7 +1714,8 @@ static void test_template_output_parsers() { " },\n" " \"id\": \"123456789\"\n" " }\n" - " ]\n" + " ],\n" + " \"content\": \"\"\n" "}", /* expect_grammar_triggered= */ false ); From 6e0c8cbc40c4abf49e5c52f0f51267c2afdfc053 Mon Sep 17 00:00:00 2001 From: Bart Louwers Date: Tue, 30 Dec 2025 22:13:49 +0100 Subject: [PATCH 07/13] docs : document that JSON Schema is not available to model when using response_format (#18492) * Document unsupported JSON Schema annotations Add note about unsupported JSON Schema annotations. * Update README.md * Update README.md * Update README.md --- grammars/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/grammars/README.md b/grammars/README.md index daac7f4d8d..dcd28648b1 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -150,6 +150,9 @@ You can use GBNF grammars: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../tools/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../tools/server)'s Web UI) +> [!NOTE] +> The JSON schema is only used to constrain the model output and is not injected into the prompt. The model has no visibility into the schema, so if you want it to understand the expected structure, describe it explicitly in your prompt. This does not apply to tool calling, where schemas are injected into the prompt. + Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555). ```bash From 4849661d9898ac3caf59ddd62044185805084370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 30 Dec 2025 22:28:53 +0100 Subject: [PATCH 08/13] docker : add CUDA 13.1 image build (#18441) * add updated cuda-new.Dockerfile for Ubuntu 24.04 compatibilty * add cuda13 build --- .devops/cuda-new.Dockerfile | 95 ++++++++++++++++++++++++++++++++++++ .github/workflows/docker.yml | 35 ++++++++----- 2 files changed, 119 insertions(+), 11 deletions(-) create mode 100644 .devops/cuda-new.Dockerfile diff --git a/.devops/cuda-new.Dockerfile b/.devops/cuda-new.Dockerfile new file mode 100644 index 0000000000..62443e17f2 --- /dev/null +++ b/.devops/cuda-new.Dockerfile @@ -0,0 +1,95 @@ +ARG UBUNTU_VERSION=24.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=13.1.0 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# CUDA architecture to build for (defaults to all supported archs) +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +WORKDIR /app + +COPY . . + +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so*" -exec cp -P {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_CUDA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-wheel \ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index bfd1270716..d9fe0686d3 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -40,7 +40,8 @@ jobs: # https://github.com/ggml-org/llama.cpp/issues/11888 #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false } - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } + - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" } + - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" } - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } @@ -80,18 +81,21 @@ jobs: run: | REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case REPO_NAME="${{ github.event.repository.name }}" + PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - CACHETAGS="${PREFIX}buildcache${TYPE}" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}" + tags="${{ matrix.config.tag }}" + for tag in $tags; do + if [[ "$tag" == "cpu" ]]; then + TYPE="" + else + TYPE="-$tag" + fi + CACHETAGS="${PREFIX}buildcache${TYPE}" + FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}" + LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}" + SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}" + done echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT @@ -132,6 +136,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: full provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -154,6 +161,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: light provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -176,6 +186,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: server provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max From c8a37980419e8b7f0193d058fb6f8f01b458cfca Mon Sep 17 00:00:00 2001 From: Rahul Sathe <150351592+rrsathe@users.noreply.github.com> Date: Wed, 31 Dec 2025 06:38:44 +0530 Subject: [PATCH 09/13] Work around broken IntelSYCLConfig.cmake in Intel oneAPI 2025.x (#18345) * cmake: work around broken IntelSYCLConfig.cmake in oneAPI 2025.x * [AI] sycl: auto-detect and skip incompatible IntelSYCL package Automatically detect compiler versions with incompatible IntelSYCL CMake configuration files and fall back to manual SYCL flags instead of requiring users to set options manually. Fixes build failures with oneAPI 2025.x where IntelSYCLConfig.cmake has SYCL_FEATURE_TEST_EXTRACT invocation errors. * refactor: improve SYCL provider handling and error messages in CMake configuration * refactor: enhance SYCL provider validation and error handling in CMake configuration * ggml-sycl: wrap find_package(IntelSYCL) to prevent build crashes --- ggml/src/ggml-sycl/CMakeLists.txt | 44 +++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 88f29221bb..51594fb88e 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -36,7 +36,47 @@ if (WIN32) endif() endif() -find_package(IntelSYCL) +macro(detect_and_find_package package_name) + set(test_source " + cmake_minimum_required(VERSION ${CMAKE_VERSION}) + project(check_package LANGUAGES CXX) + find_package(${package_name} QUIET) + ") + + set(test_dir "${CMAKE_CURRENT_BINARY_DIR}/check_package_${package_name}") + file(WRITE "${test_dir}/CMakeLists.txt" "${test_source}") + + set(cmake_args "") + if(CMAKE_GENERATOR) + list(APPEND cmake_args "-G" "${CMAKE_GENERATOR}") + endif() + if(CMAKE_GENERATOR_PLATFORM) + list(APPEND cmake_args "-A" "${CMAKE_GENERATOR_PLATFORM}") + endif() + if(CMAKE_GENERATOR_TOOLSET) + list(APPEND cmake_args "-T" "${CMAKE_GENERATOR_TOOLSET}") + endif() + if(CMAKE_CXX_COMPILER) + list(APPEND cmake_args "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}") + endif() + + execute_process( + COMMAND ${CMAKE_COMMAND} ${cmake_args} . + WORKING_DIRECTORY "${test_dir}" + RESULT_VARIABLE result + OUTPUT_QUIET + ERROR_QUIET + ) + + if(result EQUAL 0) + find_package(${package_name} ${ARGN}) + else() + message(WARNING "Detection of ${package_name} failed. The package might be broken or incompatible.") + set(${package_name}_FOUND FALSE) + endif() +endmacro() + +detect_and_find_package(IntelSYCL) if (IntelSYCL_FOUND) # Use oneAPI CMake when possible target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX) @@ -190,4 +230,4 @@ endif() if (GGML_SYCL_DEVICE_ARCH) target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) -endif() +endif() \ No newline at end of file From 7bcaf815c20f471fead106088b558e542982bf30 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 31 Dec 2025 14:23:44 +0800 Subject: [PATCH 10/13] sycl: add newline at the end of CMakeLists.txt (#18503) --- ggml/src/ggml-sycl/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 51594fb88e..5a89d8dd68 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -230,4 +230,5 @@ endif() if (GGML_SYCL_DEVICE_ARCH) target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) -endif() \ No newline at end of file +endif() + From 01ade96e71b62b482019e42dd74551758fde8851 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 Dec 2025 09:53:48 +0200 Subject: [PATCH 11/13] metal : remove BF16 x F16 kernels (#18456) --- ggml/src/ggml-metal/ggml-metal.metal | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 51bcbae309..3154beff9b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -9557,9 +9557,6 @@ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_bf16_f16")]] kernel mul_mm_t kernel_mul_mm; -#endif template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm; @@ -9615,9 +9612,6 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_m template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; -#endif template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; From ecc343de63ac1aaba9c74cd26807fd60aec31ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 31 Dec 2025 09:37:00 +0100 Subject: [PATCH 12/13] CUDA: fix KQ max calculation (#18487) --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 7bd1044c19..856291dc3c 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) { #pragma unroll for (int l = 0; l < T_C_KQ::ne; ++l) { - if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) { + if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) { KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET); } } @@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) { #pragma unroll for (int l = 0; l < T_C_KQ::ne; ++l) { - if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) { + if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) { // Turing + Volta: KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET); } From 9a6369bb603457f277b597f0ccee1c19cd25c4b2 Mon Sep 17 00:00:00 2001 From: gatbontonpc Date: Wed, 31 Dec 2025 00:39:48 -0800 Subject: [PATCH 13/13] metal : add count_equal op (#18314) * add count equal for metal * remove trailing whitespace * updated doc ops table * changed shmem to i32 * added multi tg and templating * removed BLAS support from Metal docs * Apply suggestions from code review Co-authored-by: Georgi Gerganov * add memset to set dst to 0 * metal : cleanup --------- Co-authored-by: Georgi Gerganov --- docs/ops.md | 2 +- docs/ops/Metal.csv | 682 ++++++++++++---------- ggml/src/ggml-metal/ggml-metal-device.cpp | 57 ++ ggml/src/ggml-metal/ggml-metal-device.h | 2 + ggml/src/ggml-metal/ggml-metal-device.m | 5 + ggml/src/ggml-metal/ggml-metal-impl.h | 20 + ggml/src/ggml-metal/ggml-metal-ops.cpp | 67 ++- ggml/src/ggml-metal/ggml-metal-ops.h | 1 + ggml/src/ggml-metal/ggml-metal.metal | 73 +++ 9 files changed, 585 insertions(+), 324 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index b395d2315c..2b2770cb76 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -32,7 +32,7 @@ Legend: | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | -| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/ops/Metal.csv b/docs/ops/Metal.csv index 5f7450e91f..02fd75fdbf 100644 --- a/docs/ops/Metal.csv +++ b/docs/ops/Metal.csv @@ -965,6 +965,7 @@ "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" +"Metal","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" @@ -4964,8 +4965,9 @@ "Metal","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","Metal" +"Metal","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,513,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[100,10,1,1]","support","1","yes","Metal" @@ -5715,15 +5717,15 @@ "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" "Metal","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","Metal" "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" @@ -5733,6 +5735,15 @@ "Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" @@ -8916,6 +8927,8 @@ "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","Metal" @@ -9542,311 +9555,311 @@ "Metal","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Metal" @@ -9891,8 +9904,9 @@ "Metal","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","Metal" @@ -9923,17 +9937,41 @@ "Metal","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Metal" +"Metal","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","Metal" diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 680904d132..b0734797f1 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1684,3 +1684,60 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd(ggm return res; } + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->type == GGML_TYPE_I64); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_memset_%s", ggml_type_name(op->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_COUNT_EQUAL); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + + GGML_ASSERT(op->src[0]->type == op->src[1]->type); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_I32); + GGML_ASSERT(op->type == GGML_TYPE_I64); + + // note: the kernel only supports i32 output due to metal atomic add only supporting atomic_int + GGML_ASSERT(ggml_nelements(op->src[0]) < (1LL << 31)); + + char base[256]; + char name[256]; + + int nsg = 1; + while (32*nsg < ne00 && nsg < 32) { + nsg *= 2; + } + + snprintf(base, 256, "kernel_count_equal_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s_nsg=%d", base, nsg); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, nsg, FC_COUNT_EQUAL + 0); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.smem = 32 * sizeof(int32_t); + res.nsg = nsg; + + return res; +} diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 0a8b9211a7..d983b666ca 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -147,6 +147,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad( ggml_metal_library_t lib, diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index f24270bb1c..59badd0043 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1023,6 +1023,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); + case GGML_OP_COUNT_EQUAL: + return has_simdgroup_reduction && + op->src[0]->type == GGML_TYPE_I32 && + op->src[1]->type == GGML_TYPE_I32 && + op->type == GGML_TYPE_I64; case GGML_OP_ARGMAX: return has_simdgroup_reduction; case GGML_OP_NORM: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 8944b07e90..d3b0e732ec 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -78,6 +78,7 @@ #define FC_MUL_MM 700 #define FC_ROPE 800 #define FC_SSM_CONV 900 +#define FC_COUNT_EQUAL 1000 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPTG 8 @@ -894,6 +895,25 @@ typedef struct { float step; } ggml_metal_kargs_arange; +typedef struct { + int64_t val; +} ggml_metal_kargs_memset; + +typedef struct { + int32_t ne00; + int32_t ne01; + int32_t ne02; + int32_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; +} ggml_metal_kargs_count_equal; + typedef struct { int32_t k0; int32_t k1; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e99c1763f6..acf2aa9184 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -448,7 +448,11 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_opt_step_sgd(ctx, idx); } break; - default: + case GGML_OP_COUNT_EQUAL: + { + n_fuse = ggml_metal_op_count_equal(ctx, idx); + } break; + default: { GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(node->op)); GGML_ABORT("fatal error"); @@ -4090,3 +4094,64 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) { return 1; } + +int ggml_metal_op_count_equal(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + + { + ggml_metal_kargs_memset args = { /*.val =*/ 0 }; + + auto pipeline = ggml_metal_library_get_pipeline_memset(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 1); + + ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1); + } + + ggml_metal_op_concurrency_reset(ctx); + + { + ggml_metal_kargs_count_equal args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + }; + + auto pipeline = ggml_metal_library_get_pipeline_count_equal(lib, op); + + const size_t smem = pipeline.smem; + + const int nth = 32*pipeline.nsg; + + GGML_ASSERT(nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); + ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); + } + + return 1; +} diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 902b544523..c1025d3567 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -87,6 +87,7 @@ int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx); int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_count_equal (ggml_metal_op_t ctx, int idx); #ifdef __cplusplus } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3154beff9b..67b30e0d93 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1790,6 +1790,7 @@ kernel void kernel_op_sum_f32( return; } + // TODO: become function constant const uint nsg = (ntg.x + 31) / 32; float sumf = 0; @@ -9914,3 +9915,75 @@ kernel void kernel_opt_step_sgd_f32( x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid]; } + +template +kernel void kernel_memset( + constant ggml_metal_kargs_fill & args, + device T * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = args.val; +} + +typedef decltype(kernel_memset) kernel_memset_t; + +template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset; + +constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]]; + +template +kernel void kernel_count_equal( + constant ggml_metal_kargs_count_equal & args, + device const char * src0, + device const char * src1, + device atomic_int * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const short NSG = FC_count_equal_nsg; + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + + int sum = 0; + + device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; + device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13; + + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + const T v0 = *(device const T *)(base0 + i0*args.nb00); + const T v1 = *(device const T *)(base1 + i0*args.nb10); + sum += (v0 == v1); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + shmem_i32[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + float v = 0.0f; + if (tpitg.x < NSG) { + v = shmem_i32[tpitg.x]; + } + + float total = simd_sum(v); + if (tpitg.x == 0) { + atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed); + } + } +} + +typedef decltype(kernel_count_equal) kernel_count_equal_t; + +template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal;