From 6e0c8cbc40c4abf49e5c52f0f51267c2afdfc053 Mon Sep 17 00:00:00 2001 From: Bart Louwers Date: Tue, 30 Dec 2025 22:13:49 +0100 Subject: [PATCH 01/12] docs : document that JSON Schema is not available to model when using response_format (#18492) * Document unsupported JSON Schema annotations Add note about unsupported JSON Schema annotations. * Update README.md * Update README.md * Update README.md --- grammars/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/grammars/README.md b/grammars/README.md index daac7f4d8d..dcd28648b1 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -150,6 +150,9 @@ You can use GBNF grammars: - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py) - in JavaScript with [json-schema-to-grammar.mjs](../tools/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../tools/server)'s Web UI) +> [!NOTE] +> The JSON schema is only used to constrain the model output and is not injected into the prompt. The model has no visibility into the schema, so if you want it to understand the expected structure, describe it explicitly in your prompt. This does not apply to tool calling, where schemas are injected into the prompt. + Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555). ```bash From 4849661d9898ac3caf59ddd62044185805084370 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 30 Dec 2025 22:28:53 +0100 Subject: [PATCH 02/12] docker : add CUDA 13.1 image build (#18441) * add updated cuda-new.Dockerfile for Ubuntu 24.04 compatibilty * add cuda13 build --- .devops/cuda-new.Dockerfile | 95 ++++++++++++++++++++++++++++++++++++ .github/workflows/docker.yml | 35 ++++++++----- 2 files changed, 119 insertions(+), 11 deletions(-) create mode 100644 .devops/cuda-new.Dockerfile diff --git a/.devops/cuda-new.Dockerfile b/.devops/cuda-new.Dockerfile new file mode 100644 index 0000000000..62443e17f2 --- /dev/null +++ b/.devops/cuda-new.Dockerfile @@ -0,0 +1,95 @@ +ARG UBUNTU_VERSION=24.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=13.1.0 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# CUDA architecture to build for (defaults to all supported archs) +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +WORKDIR /app + +COPY . . + +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so*" -exec cp -P {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_CUDA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + python3-wheel \ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app/full/llama-completion /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index bfd1270716..d9fe0686d3 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -40,7 +40,8 @@ jobs: # https://github.com/ggml-org/llama.cpp/issues/11888 #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false } - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } - - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } + - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" } + - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" } - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" } - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" } @@ -80,18 +81,21 @@ jobs: run: | REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case REPO_NAME="${{ github.event.repository.name }}" + PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" # list all tags possible - if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then - TYPE="" - else - TYPE="-${{ matrix.config.tag }}" - fi - PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" - CACHETAGS="${PREFIX}buildcache${TYPE}" - FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}" - LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}" - SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}" + tags="${{ matrix.config.tag }}" + for tag in $tags; do + if [[ "$tag" == "cpu" ]]; then + TYPE="" + else + TYPE="-$tag" + fi + CACHETAGS="${PREFIX}buildcache${TYPE}" + FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}" + LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}" + SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}" + done echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT @@ -132,6 +136,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: full provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -154,6 +161,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: light provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -176,6 +186,9 @@ jobs: file: ${{ matrix.config.dockerfile }} target: server provenance: false + build-args: | + ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} + ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max From c8a37980419e8b7f0193d058fb6f8f01b458cfca Mon Sep 17 00:00:00 2001 From: Rahul Sathe <150351592+rrsathe@users.noreply.github.com> Date: Wed, 31 Dec 2025 06:38:44 +0530 Subject: [PATCH 03/12] Work around broken IntelSYCLConfig.cmake in Intel oneAPI 2025.x (#18345) * cmake: work around broken IntelSYCLConfig.cmake in oneAPI 2025.x * [AI] sycl: auto-detect and skip incompatible IntelSYCL package Automatically detect compiler versions with incompatible IntelSYCL CMake configuration files and fall back to manual SYCL flags instead of requiring users to set options manually. Fixes build failures with oneAPI 2025.x where IntelSYCLConfig.cmake has SYCL_FEATURE_TEST_EXTRACT invocation errors. * refactor: improve SYCL provider handling and error messages in CMake configuration * refactor: enhance SYCL provider validation and error handling in CMake configuration * ggml-sycl: wrap find_package(IntelSYCL) to prevent build crashes --- ggml/src/ggml-sycl/CMakeLists.txt | 44 +++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 88f29221bb..51594fb88e 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -36,7 +36,47 @@ if (WIN32) endif() endif() -find_package(IntelSYCL) +macro(detect_and_find_package package_name) + set(test_source " + cmake_minimum_required(VERSION ${CMAKE_VERSION}) + project(check_package LANGUAGES CXX) + find_package(${package_name} QUIET) + ") + + set(test_dir "${CMAKE_CURRENT_BINARY_DIR}/check_package_${package_name}") + file(WRITE "${test_dir}/CMakeLists.txt" "${test_source}") + + set(cmake_args "") + if(CMAKE_GENERATOR) + list(APPEND cmake_args "-G" "${CMAKE_GENERATOR}") + endif() + if(CMAKE_GENERATOR_PLATFORM) + list(APPEND cmake_args "-A" "${CMAKE_GENERATOR_PLATFORM}") + endif() + if(CMAKE_GENERATOR_TOOLSET) + list(APPEND cmake_args "-T" "${CMAKE_GENERATOR_TOOLSET}") + endif() + if(CMAKE_CXX_COMPILER) + list(APPEND cmake_args "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}") + endif() + + execute_process( + COMMAND ${CMAKE_COMMAND} ${cmake_args} . + WORKING_DIRECTORY "${test_dir}" + RESULT_VARIABLE result + OUTPUT_QUIET + ERROR_QUIET + ) + + if(result EQUAL 0) + find_package(${package_name} ${ARGN}) + else() + message(WARNING "Detection of ${package_name} failed. The package might be broken or incompatible.") + set(${package_name}_FOUND FALSE) + endif() +endmacro() + +detect_and_find_package(IntelSYCL) if (IntelSYCL_FOUND) # Use oneAPI CMake when possible target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX) @@ -190,4 +230,4 @@ endif() if (GGML_SYCL_DEVICE_ARCH) target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) -endif() +endif() \ No newline at end of file From 7bcaf815c20f471fead106088b558e542982bf30 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 31 Dec 2025 14:23:44 +0800 Subject: [PATCH 04/12] sycl: add newline at the end of CMakeLists.txt (#18503) --- ggml/src/ggml-sycl/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 51594fb88e..5a89d8dd68 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -230,4 +230,5 @@ endif() if (GGML_SYCL_DEVICE_ARCH) target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) -endif() \ No newline at end of file +endif() + From 01ade96e71b62b482019e42dd74551758fde8851 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 Dec 2025 09:53:48 +0200 Subject: [PATCH 05/12] metal : remove BF16 x F16 kernels (#18456) --- ggml/src/ggml-metal/ggml-metal.metal | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 51bcbae309..3154beff9b 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -9557,9 +9557,6 @@ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mul_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_f32_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_f16_f16")]] kernel mul_mm_t kernel_mul_mm; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_bf16_f16")]] kernel mul_mm_t kernel_mul_mm; -#endif template [[host_name("kernel_mul_mm_q4_0_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_1_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_0_f16")]] kernel mul_mm_t kernel_mul_mm; @@ -9615,9 +9612,6 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mul_mm_id kernel_m template [[host_name("kernel_mul_mm_id_f32_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_f16_f16")]] kernel mul_mm_id kernel_mul_mm_id; -#if defined(GGML_METAL_HAS_BF16) -template [[host_name("kernel_mul_mm_id_bf16_f16")]] kernel mul_mm_id kernel_mul_mm_id; -#endif template [[host_name("kernel_mul_mm_id_q4_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_1_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_0_f16")]] kernel mul_mm_id kernel_mul_mm_id; From ecc343de63ac1aaba9c74cd26807fd60aec31ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 31 Dec 2025 09:37:00 +0100 Subject: [PATCH 06/12] CUDA: fix KQ max calculation (#18487) --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 7bd1044c19..856291dc3c 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -531,7 +531,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::I) { #pragma unroll for (int l = 0; l < T_C_KQ::ne; ++l) { - if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) { + if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::I + T_C_KQ::get_i(l) < k_VKQ_sup) { KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET); } } @@ -583,7 +583,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( for (int k0 = 0; k0 < nbatch_fa; k0 += np*T_C_KQ::J) { #pragma unroll for (int l = 0; l < T_C_KQ::ne; ++l) { - if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) { + if (!oob_check || k0 + (threadIdx.y % np)*T_C_KQ::J + T_C_KQ::get_j(l) < k_VKQ_sup) { // Turing + Volta: KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET); } From 9a6369bb603457f277b597f0ccee1c19cd25c4b2 Mon Sep 17 00:00:00 2001 From: gatbontonpc Date: Wed, 31 Dec 2025 00:39:48 -0800 Subject: [PATCH 07/12] metal : add count_equal op (#18314) * add count equal for metal * remove trailing whitespace * updated doc ops table * changed shmem to i32 * added multi tg and templating * removed BLAS support from Metal docs * Apply suggestions from code review Co-authored-by: Georgi Gerganov * add memset to set dst to 0 * metal : cleanup --------- Co-authored-by: Georgi Gerganov --- docs/ops.md | 2 +- docs/ops/Metal.csv | 682 ++++++++++++---------- ggml/src/ggml-metal/ggml-metal-device.cpp | 57 ++ ggml/src/ggml-metal/ggml-metal-device.h | 2 + ggml/src/ggml-metal/ggml-metal-device.m | 5 + ggml/src/ggml-metal/ggml-metal-impl.h | 20 + ggml/src/ggml-metal/ggml-metal-ops.cpp | 67 ++- ggml/src/ggml-metal/ggml-metal-ops.h | 1 + ggml/src/ggml-metal/ggml-metal.metal | 73 +++ 9 files changed, 585 insertions(+), 324 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index b395d2315c..2b2770cb76 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -32,7 +32,7 @@ Legend: | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | -| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/ops/Metal.csv b/docs/ops/Metal.csv index 5f7450e91f..02fd75fdbf 100644 --- a/docs/ops/Metal.csv +++ b/docs/ops/Metal.csv @@ -965,6 +965,7 @@ "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" +"Metal","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" @@ -4964,8 +4965,9 @@ "Metal","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","Metal" +"Metal","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,513,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[100,10,1,1]","support","1","yes","Metal" @@ -5715,15 +5717,15 @@ "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" "Metal","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","Metal" "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" @@ -5733,6 +5735,15 @@ "Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" @@ -8916,6 +8927,8 @@ "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","Metal" @@ -9542,311 +9555,311 @@ "Metal","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Metal" @@ -9891,8 +9904,9 @@ "Metal","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","Metal" @@ -9923,17 +9937,41 @@ "Metal","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Metal" +"Metal","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","Metal" diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 680904d132..b0734797f1 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1684,3 +1684,60 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd(ggm return res; } + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->type == GGML_TYPE_I64); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_memset_%s", ggml_type_name(op->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_COUNT_EQUAL); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + + GGML_ASSERT(op->src[0]->type == op->src[1]->type); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_I32); + GGML_ASSERT(op->type == GGML_TYPE_I64); + + // note: the kernel only supports i32 output due to metal atomic add only supporting atomic_int + GGML_ASSERT(ggml_nelements(op->src[0]) < (1LL << 31)); + + char base[256]; + char name[256]; + + int nsg = 1; + while (32*nsg < ne00 && nsg < 32) { + nsg *= 2; + } + + snprintf(base, 256, "kernel_count_equal_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s_nsg=%d", base, nsg); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, nsg, FC_COUNT_EQUAL + 0); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.smem = 32 * sizeof(int32_t); + res.nsg = nsg; + + return res; +} diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 0a8b9211a7..d983b666ca 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -147,6 +147,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad( ggml_metal_library_t lib, diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index f24270bb1c..59badd0043 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1023,6 +1023,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); + case GGML_OP_COUNT_EQUAL: + return has_simdgroup_reduction && + op->src[0]->type == GGML_TYPE_I32 && + op->src[1]->type == GGML_TYPE_I32 && + op->type == GGML_TYPE_I64; case GGML_OP_ARGMAX: return has_simdgroup_reduction; case GGML_OP_NORM: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 8944b07e90..d3b0e732ec 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -78,6 +78,7 @@ #define FC_MUL_MM 700 #define FC_ROPE 800 #define FC_SSM_CONV 900 +#define FC_COUNT_EQUAL 1000 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPTG 8 @@ -894,6 +895,25 @@ typedef struct { float step; } ggml_metal_kargs_arange; +typedef struct { + int64_t val; +} ggml_metal_kargs_memset; + +typedef struct { + int32_t ne00; + int32_t ne01; + int32_t ne02; + int32_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; +} ggml_metal_kargs_count_equal; + typedef struct { int32_t k0; int32_t k1; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e99c1763f6..acf2aa9184 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -448,7 +448,11 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_opt_step_sgd(ctx, idx); } break; - default: + case GGML_OP_COUNT_EQUAL: + { + n_fuse = ggml_metal_op_count_equal(ctx, idx); + } break; + default: { GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(node->op)); GGML_ABORT("fatal error"); @@ -4090,3 +4094,64 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) { return 1; } + +int ggml_metal_op_count_equal(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + + { + ggml_metal_kargs_memset args = { /*.val =*/ 0 }; + + auto pipeline = ggml_metal_library_get_pipeline_memset(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 1); + + ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1); + } + + ggml_metal_op_concurrency_reset(ctx); + + { + ggml_metal_kargs_count_equal args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + }; + + auto pipeline = ggml_metal_library_get_pipeline_count_equal(lib, op); + + const size_t smem = pipeline.smem; + + const int nth = 32*pipeline.nsg; + + GGML_ASSERT(nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); + ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); + } + + return 1; +} diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 902b544523..c1025d3567 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -87,6 +87,7 @@ int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx); int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_count_equal (ggml_metal_op_t ctx, int idx); #ifdef __cplusplus } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3154beff9b..67b30e0d93 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1790,6 +1790,7 @@ kernel void kernel_op_sum_f32( return; } + // TODO: become function constant const uint nsg = (ntg.x + 31) / 32; float sumf = 0; @@ -9914,3 +9915,75 @@ kernel void kernel_opt_step_sgd_f32( x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid]; } + +template +kernel void kernel_memset( + constant ggml_metal_kargs_fill & args, + device T * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = args.val; +} + +typedef decltype(kernel_memset) kernel_memset_t; + +template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset; + +constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]]; + +template +kernel void kernel_count_equal( + constant ggml_metal_kargs_count_equal & args, + device const char * src0, + device const char * src1, + device atomic_int * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const short NSG = FC_count_equal_nsg; + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + + int sum = 0; + + device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; + device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13; + + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + const T v0 = *(device const T *)(base0 + i0*args.nb00); + const T v1 = *(device const T *)(base1 + i0*args.nb10); + sum += (v0 == v1); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + shmem_i32[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + float v = 0.0f; + if (tpitg.x < NSG) { + v = shmem_i32[tpitg.x]; + } + + float total = simd_sum(v); + if (tpitg.x == 0) { + atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed); + } + } +} + +typedef decltype(kernel_count_equal) kernel_count_equal_t; + +template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal; From 9b8329de7a7200385aaac16ab4a2ab79ae14d829 Mon Sep 17 00:00:00 2001 From: Henry147147 <44851451+Henry147147@users.noreply.github.com> Date: Wed, 31 Dec 2025 06:13:23 -0500 Subject: [PATCH 08/12] mtmd : Adding support for Nvidia Music Flamingo Model (#18470) * Inital commit, debugging q5_k_s quant * Made hf_to_gguf extend whisper to reduce code duplication * addressed convert_hf_to_gguf pull request issue --------- Co-authored-by: Henry D --- convert_hf_to_gguf.py | 14 +++++++++++++- gguf-py/gguf/constants.py | 1 + tools/mtmd/clip-impl.h | 2 ++ tools/mtmd/clip-model.h | 3 ++- tools/mtmd/clip.cpp | 19 ++++++++++++++++++- tools/mtmd/models/whisper-enc.cpp | 9 +++++++++ tools/mtmd/mtmd.cpp | 4 ++++ 7 files changed, 49 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f893b24c75..173f8ed0d2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3503,7 +3503,7 @@ class QwenModel(TextModel): self._set_vocab_qwen() -@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM") +@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration") class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 @@ -9292,6 +9292,18 @@ class VoxtralWhisperEncoderModel(WhisperEncoderModel): self.gguf_writer.add_audio_stack_factor(4) # == intermediate_size // hidden_size +@ModelBase.register("AudioFlamingo3ForConditionalGeneration") +class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MUSIC_FLAMINGO) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + # Was trained in BF16, being safe, avoiding quantizing to FP16 + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + @ModelBase.register("FalconH1ForCausalLM") class FalconH1Model(Mamba2Model): model_arch = gguf.MODEL_ARCH.FALCON_H1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 616b8add36..c2a0f41c1b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3492,6 +3492,7 @@ class VisionProjectorType: COGVLM = "cogvlm" JANUS_PRO = "janus_pro" LFM2A = "lfm2a" # audio + MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index a0939865e3..1ed0741883 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -180,6 +180,7 @@ enum projector_type { PROJECTOR_TYPE_GLMA, PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_MUSIC_FLAMINGO, PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_LIGHTONOCR, @@ -209,6 +210,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GLMA, "glma"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, { PROJECTOR_TYPE_LFM2, "lfm2"}, { PROJECTOR_TYPE_KIMIVL, "kimivl"}, { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b4c31cdde6..1e5aa87b98 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -319,7 +319,8 @@ struct clip_model { bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A - || proj_type == PROJECTOR_TYPE_VOXTRAL; + || proj_type == PROJECTOR_TYPE_VOXTRAL + || proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO; } bool audio_has_stack_frames() const { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3ba0823def..fb08dd258c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -818,6 +818,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: { builder = std::make_unique(ctx, img); } break; @@ -1176,6 +1177,7 @@ struct clip_model_loader { case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: { bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || model.proj_type == PROJECTOR_TYPE_VOXTRAL || @@ -1576,6 +1578,17 @@ struct clip_model_loader { model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); } break; + case PROJECTOR_TYPE_MUSIC_FLAMINGO: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + } break; case PROJECTOR_TYPE_INTERNVL: { model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight")); @@ -3031,6 +3044,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: { n_patches = img->nx; @@ -3403,6 +3417,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: case PROJECTOR_TYPE_JANUS_PRO: case PROJECTOR_TYPE_COGVLM: { @@ -3526,6 +3541,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.projection->ne[1]; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_INTERNVL: return ctx->model.mm_3_w->ne[1]; @@ -3587,7 +3603,8 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A || ctx->proj_type() == PROJECTOR_TYPE_GLMA - || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL; + || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL + || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO; } bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp index 2870d854ab..2f2b127755 100644 --- a/tools/mtmd/models/whisper-enc.cpp +++ b/tools/mtmd/models/whisper-enc.cpp @@ -86,6 +86,15 @@ ggml_cgraph * clip_graph_whisper_enc::build() { FFN_GELU_ERF, -1); + } else if (proj_type == PROJECTOR_TYPE_MUSIC_FLAMINGO) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + } else if (proj_type == PROJECTOR_TYPE_GLMA) { cur = ggml_norm(ctx0, cur, hparams.eps); cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index b9c4fa9098..b0b5ab42ab 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -330,6 +330,7 @@ struct mtmd_context { case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_GLMA: + case PROJECTOR_TYPE_MUSIC_FLAMINGO: audio_preproc = std::make_unique(ctx_a); break; case PROJECTOR_TYPE_LFM2A: @@ -352,6 +353,9 @@ struct mtmd_context { // [BEGIN_AUDIO] ... (embeddings) ... aud_beg = "[BEGIN_AUDIO]"; + } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) { + // ... (embeddings) ... + aud_beg = ""; } } From 0db81098494023775a704a44042c317d36c91f24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 31 Dec 2025 14:28:21 +0100 Subject: [PATCH 09/12] convert : lint fix (#18507) --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 173f8ed0d2..edc0ed539d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9304,6 +9304,7 @@ class AudioFlamingo3WhisperEncoderModel(WhisperEncoderModel): return gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) + @ModelBase.register("FalconH1ForCausalLM") class FalconH1Model(Mamba2Model): model_arch = gguf.MODEL_ARCH.FALCON_H1 From 33ded988ba9a5514036d64334f803334047a15d8 Mon Sep 17 00:00:00 2001 From: Anri Lombard Date: Wed, 31 Dec 2025 17:29:03 +0200 Subject: [PATCH 10/12] quantize: prevent input/output file collision (#18451) Check if input and output files are the same before quantizing to prevent file corruption when mmap reads from a file being written to. Fixes #12753 --- tools/quantize/quantize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 470dc3d916..881f4b3dd9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -12,6 +12,7 @@ #include #include #include +#include struct quant_option { std::string name; @@ -643,6 +644,11 @@ int main(int argc, char ** argv) { return 1; } + if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { + fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); + return 1; + } + print_build_info(); fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); From 54f67b9b66341a7d2a362bdb67211090c791ef44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 Dec 2025 18:24:07 +0200 Subject: [PATCH 11/12] ggml : bump version to 0.9.5 (ggml/1410) --- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index cb46c32100..0176ca1ce9 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -4,7 +4,7 @@ project("ggml" C CXX ASM) ### GGML Version set(GGML_VERSION_MAJOR 0) set(GGML_VERSION_MINOR 9) -set(GGML_VERSION_PATCH 4) +set(GGML_VERSION_PATCH 5) set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) From 13814eb370d2f0b70e1830cc577b6155b17aee47 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 31 Dec 2025 18:27:54 +0200 Subject: [PATCH 12/12] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 5823efac2d..c838276158 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -130bc125a88bb57664b88932c48c38a1cb316fac +ebc3a0f4a56be1c9424a89fbec09962ac34fde85