From 2634ed207a17db1a54bd8df0555bd8499a6ab691 Mon Sep 17 00:00:00 2001
From: Neo Zhang <zhang.jianyu@outlook.com>
Date: Sun, 1 Feb 2026 18:24:00 +0800
Subject: [PATCH 01/18] create test.sh to enhance the parameters for testing,
 update the guide, rm useless script (#19243)

---
 docs/backend/SYCL.md                          |  34 ++---
 examples/sycl/run-llama2.sh                   |   5 +-
 examples/sycl/run-llama3.sh                   |  31 -----
 examples/sycl/test.sh                         | 130 ++++++++++++++++++
 examples/sycl/win-run-llama2.bat              |   4 +-
 .../sycl/{win-run-llama3.bat => win-test.bat} |   4 +-
 6 files changed, 149 insertions(+), 59 deletions(-)
 delete mode 100755 examples/sycl/run-llama3.sh
 create mode 100755 examples/sycl/test.sh
 rename examples/sycl/{win-run-llama3.bat => win-test.bat} (69%)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index c0a422b3dc..10cb02ff2c 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -119,7 +119,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 *Notes:*
 
 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
 
 - **Execution Unit (EU)**
@@ -423,16 +423,12 @@ Choose one of following methods to run.
 - Use device 0:
 
 ```sh
-./examples/sycl/run-llama2.sh 0
-# OR
-./examples/sycl/run-llama3.sh 0
+./examples/sycl/test.sh -mg 0
 ```
 - Use multiple devices:
 
 ```sh
-./examples/sycl/run-llama2.sh
-# OR
-./examples/sycl/run-llama3.sh
+./examples/sycl/test.sh
 ```
 
 2. Command line
@@ -455,13 +451,13 @@ Examples:
 - Use device 0:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
 ```
 
 - Use multiple devices:
 
 ```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer --mmap
 ```
 
 *Notes:*
@@ -577,13 +573,13 @@ Or, use CMake presets to build:
 
 ```sh
 cmake --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
+cmake --build build-x64-windows-sycl-release -j --target llama-completion
 
 cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
-cmake --build build-x64-windows-sycl-release -j --target llama-cli
+cmake --build build-x64-windows-sycl-release -j --target llama-completion
 
 cmake --preset x64-windows-sycl-debug
-cmake --build build-x64-windows-sycl-debug -j --target llama-cli
+cmake --build build-x64-windows-sycl-debug -j --target llama-completion
 ```
 
 #### 3. Visual Studio
@@ -608,7 +604,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
 - For a minimal experimental setup, you can build only the inference executable using:
 
     ```Powershell
-    cmake --build build --config Release -j --target llama-cli
+    cmake --build build --config Release -j --target llama-completion
     ```
 
 ##### - Generating a Visual Studio Solution
@@ -714,13 +710,7 @@ Choose one of following methods to run.
 1. Script
 
 ```
-examples\sycl\win-run-llama-2.bat
-```
-
-or
-
-```
-examples\sycl\win-run-llama-3.bat
+examples\sycl\win-test.bat
 ```
 
 2. Command line
@@ -744,13 +734,13 @@ Examples:
 - Use device 0:
 
 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
+build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
 ```
 
 - Use multiple devices:
 
 ```
-build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
+build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer --mmap
 ```
 
 
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index cf23619ee0..d33f82f339 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -18,13 +18,14 @@ CONTEXT=4096
 #support malloc device memory more than 4GB.
 export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
 
+LOAD_MODE='--mmap'
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
 
 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
 fi
diff --git a/examples/sycl/run-llama3.sh b/examples/sycl/run-llama3.sh
deleted file mode 100755
index feee5165e9..0000000000
--- a/examples/sycl/run-llama3.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env bash
-
-#  MIT license
-#  Copyright (C) 2025 Intel Corporation
-#  SPDX-License-Identifier: MIT
-
-# If you want more control, DPC++ Allows selecting a specific device through the
-# following environment variable
-export ONEAPI_DEVICE_SELECTOR="level_zero:0"
-source /opt/intel/oneapi/setvars.sh
-
-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
-NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
-CONTEXT=4096
-
-#support malloc device memory more than 4GB.
-export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
-else
-    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
-fi
diff --git a/examples/sycl/test.sh b/examples/sycl/test.sh
new file mode 100755
index 0000000000..140c191466
--- /dev/null
+++ b/examples/sycl/test.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
+# Print the usage text for this script to stdout.
+Help() {
+  cat << EOF
+Usage: $(basename "$0") [OPTIONS]
+
+Run llama-completion with the SYCL backend on one or more Intel GPUs.
+
+Options:
+  -h, --help    Display this help message and exit.
+  -c, --context <value>    Set context length (default: 4096). Larger values need more memory.
+  -p, --promote <value>    Prompt to start generation with.
+  -m, --model   <value>    Full model file path (default: models/llama-2-7b.Q4_0.gguf).
+  -mg,--main-gpu <value>   Set main GPU ID (0 - n) for single GPU mode.
+  -sm,--split-mode <value> How to split the model across multiple GPUs, one of:
+                            - none: use one GPU only
+                            - layer (default): split layers and KV across GPUs
+                            - row: split rows across GPUs
+  -ngl,--n-gpu-layers <value>  Max. number of layers to store in VRAM (default: 99)
+  -lv,--log-verbosity <value>  Set the verbosity threshold (default: 3). Messages with a higher
+                               verbosity will be ignored. Values:
+                                - 0: generic output
+                                - 1: error
+                                - 2: warning
+                                - 3: info
+                                - 4: debug
+
+EOF
+}
+
+BIN_FILE=./build/bin/llama-completion
+SEED=0
+GPUS_SETTING=""
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=99
+CONTEXT=4096
+GGML_SYCL_DEVICE=-1
+SPLIT_MODE=layer
+LOG_VERBOSE=3
+# Parse command-line options; every option except -h/--help takes one value.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -c|--context)
+            CONTEXT="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -p|--prompt|--promote)
+            # --promote is the original (misspelled) name, kept for compatibility
+            INPUT_PROMPT="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -m|--model)
+            MODEL_FILE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -mg|--main-gpu)
+            GGML_SYCL_DEVICE="$2"
+            SPLIT_MODE=none
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -sm|--split-mode)
+            SPLIT_MODE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -ngl|--n-gpu-layers)
+            NGL="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -lv|--log-verbosity)
+            LOG_VERBOSE="$2"
+            # Shift twice to consume both the option flag and its value
+            shift
+            shift
+            ;;
+        -h|--help)
+            Help
+            exit 0
+            ;;
+        *)
+            # Unknown option: report it on stderr and abort
+            echo "Invalid option: $1" >&2
+            exit 1
+            ;;
+    esac
+done
+
+
+
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+#support malloc device memory more than 4GB.
+export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
+
+if [ "$GGML_SYCL_DEVICE" -ne -1 ]; then
+    echo "Use $GGML_SYCL_DEVICE as main GPU"
+    # Single-GPU mode: pass the main GPU to the binary and select that device via ONEAPI_DEVICE_SELECTOR
+    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
+    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
+    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
+else
+    echo "Use all Intel GPUs, including iGPU & dGPU"
+fi
+# Print the command, then run it; GPUS_SETTING stays unquoted so it splits into separate arguments.
+echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p \"${INPUT_PROMPT}\" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap"
+ZES_ENABLE_SYSMAN=1 "${BIN_FILE}" -m "${MODEL_FILE}" -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
+
diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat
index 32ff673ae2..1f2dab8d0a 100644
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
diff --git a/examples/sycl/win-run-llama3.bat b/examples/sycl/win-test.bat
similarity index 69%
rename from examples/sycl/win-run-llama3.bat
rename to examples/sycl/win-test.bat
index ea4ae69d6c..1f2dab8d0a 100644
--- a/examples/sycl/win-run-llama3.bat
+++ b/examples/sycl/win-test.bat
@@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 
 :: support malloc device memory more than 4GB.
 set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
-
-.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
+set LOAD_MODE="--mmap"
+.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%

From 8a98ba4582ea961f06d350e60cf3572015489745 Mon Sep 17 00:00:00 2001
From: Alexis Williams <typedrat@users.noreply.github.com>
Date: Sun, 1 Feb 2026 12:10:48 -0800
Subject: [PATCH 02/18] nix: fix allowUnfreePredicate for packages with
 multiple licenses (#19237)

The allowUnfreePredicate in pkgsCuda was wrapping p.meta.license in a
list unconditionally. This fails when meta.license is already a list
of licenses, as it creates a nested list and then tries to access
.free and .shortName on the inner list.

Use lib.toList instead, which correctly handles both cases:
- Single license attrset -> wraps in list
- List of licenses -> returns unchanged
---
 .devops/nix/nixpkgs-instances.nix | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix
index 90d683a713..40cf58f196 100644
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -4,7 +4,7 @@
   # the module `{ pkgs ... }: { /* config */ }` implicitly uses
   # `_module.args.pkgs` (defined in this case by flake-parts).
   perSystem =
-    { system, ... }:
+    { lib, system, ... }:
     {
       _module.args = {
         # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
@@ -33,7 +33,7 @@
                 "CUDA EULA"
                 "cuDNN EULA"
               ]
-            ) (p.meta.licenses or [ p.meta.license ]);
+            ) (p.meta.licenses or (lib.toList p.meta.license));
         };
         # Ensure dependencies use ROCm consistently
         pkgsRocm = import inputs.nixpkgs {

From 3bc8d2cf23d86232bbdd9dcb60946f6a9199c15c Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qti.qualcomm.com>
Date: Sun, 1 Feb 2026 14:13:38 -0800
Subject: [PATCH 03/18] Bump cmake max version (needed for Windows on
 Snapdragon builds) (#19188)

* Bump max cmake version (needed for Windows on Snapdragon builds)

* cmake: move max version setting into ggml/CMakeLists
---
 docs/backend/snapdragon/CMakeUserPresets.json | 5 -----
 docs/backend/snapdragon/windows.md            | 2 +-
 ggml/CMakeLists.txt                           | 2 +-
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json
index 4cf473d05f..1faae2f3db 100644
--- a/docs/backend/snapdragon/CMakeUserPresets.json
+++ b/docs/backend/snapdragon/CMakeUserPresets.json
@@ -1,10 +1,5 @@
 {
   "version": 5,
-  "cmakeMinimumRequired": {
-      "major": 3,
-      "minor": 28,
-      "patch": 0
-  },
   "configurePresets": [
     {
         "name": "arm64-android-snapdragon",
diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md
index 710ad8fdf4..e9346ccadf 100644
--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@@ -128,7 +128,7 @@ However, additional settings are required for generating and signing HTP Ops lib
 > $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
 > $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
 
-> cmake --preset arm64-windows-snapdragon -B build-wos
+> cmake --preset arm64-windows-snapdragon-release -B build-wos
 ...
 > cmake --install build-wos --prefix pkg-snapdragon
 ```
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index b0b8e57898..71d1a7f0e3 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
 project("ggml" C CXX ASM)
 
 ### GGML Version

From 2dc3ce2166a6c3b149402af60c15f4d80b472a6c Mon Sep 17 00:00:00 2001
From: Nikhil Jain <nikhil.jain0987@gmail.com>
Date: Sun, 1 Feb 2026 18:47:29 -0800
Subject: [PATCH 04/18] Remove pipeline cache mutexes (#19195)

* Remove mutex for pipeline caches, since they are now per-thread.

* Add comment

* Run clang-format

* Cleanup

* Run CI again

* Run CI once more

* Run clang-format
---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp | 213 ++++++++++++---------------
 1 file changed, 93 insertions(+), 120 deletions(-)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 22e2bfeb4c..4ef50e365e 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -146,8 +146,13 @@ struct webgpu_submission_futures {
 struct webgpu_buf_pool {
     std::vector<webgpu_pool_bufs> free;
 
-    std::mutex mutex;
-
+    // The pool must be synchronized because
+    // 1. The memset pool is shared globally by every ggml buffer,
+    // since allocating a pool per ggml buffer would consume too much memory.
+    // 2. For the per-thread buffer pools in webgpu_context,
+    // buffers are allocated and freed in Dawn callbacks,
+    // which can run on a different thread than the calling thread.
+    std::mutex              mutex;
     std::condition_variable cv;
 
     void init(wgpu::Device      device,
@@ -266,7 +271,7 @@ struct webgpu_command {
 #endif
 };
 
-struct webgpu_capabilities_base {
+struct webgpu_capabilities {
     wgpu::Limits limits;
     bool         supports_subgroup_matrix = false;
 
@@ -286,11 +291,11 @@ struct webgpu_global_context_struct {
     wgpu::Device   device;
     wgpu::Queue    queue;
 
-    webgpu_capabilities_base capabilities;
+    webgpu_capabilities  capabilities;
     // Shared buffer to move data from device to host
-    wgpu::Buffer             get_tensor_staging_buf;
+    wgpu::Buffer         get_tensor_staging_buf;
     // Global mutex for pipeline and staging buffer, will be refactored to exclude pipeline caches.
-    std::recursive_mutex     mutex;
+    std::recursive_mutex mutex;
 
     webgpu_buf_pool                memset_buf_pool;
     std::map<int, webgpu_pipeline> memset_pipelines;  // variant or type index
@@ -361,7 +366,6 @@ struct webgpu_context_struct {
     std::unordered_map<ggml_webgpu_pad_pipeline_key, webgpu_pipeline, ggml_webgpu_pad_pipeline_key_hash> pad_pipelines;
 
     size_t memset_bytes_per_thread;
-
 };
 
 typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
@@ -383,9 +387,8 @@ struct ggml_backend_webgpu_device_context {
 
 // Per-thread data required to actually run WebGPU operations in a backend instance
 struct ggml_backend_webgpu_context {
-    webgpu_context        webgpu_ctx;
-    std::once_flag        init_once;
-    std::string           name;
+    webgpu_context webgpu_ctx;
+    std::string    name;
 };
 
 // Per-thread data related to buffers
@@ -861,20 +864,15 @@ static webgpu_command ggml_webgpu_pad(webgpu_context & ctx, ggml_tensor * src, g
     };
 
     webgpu_pipeline pipeline;
-    {
-        // TODO: remove guard once pipeline caches are per-thread
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->pad_pipelines.find(pipeline_key);
-        if (it != ctx->pad_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_pad_shader(ctx->p, wgsl_pad, shader_lib_ctx);
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            pipeline.context = processed.decisions;
-            ctx->pad_pipelines.emplace(pipeline_key, pipeline);
-        }
+    auto            it = ctx->pad_pipelines.find(pipeline_key);
+    if (it != ctx->pad_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed = ggml_webgpu_preprocess_pad_shader(ctx->p, wgsl_pad, shader_lib_ctx);
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        pipeline.context = processed.decisions;
+        ctx->pad_pipelines.emplace(pipeline_key, pipeline);
     }
 
     ggml_webgpu_generic_shader_decisions decisions =
@@ -944,20 +942,16 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
     };
 
     webgpu_pipeline pipeline;
-    // TODO: remove guard once pipeline caches are per-thread
-    {
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->set_rows_pipelines.find(key);
-        if (it != ctx->set_rows_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_set_rows_shader(ctx->p, wgsl_set_rows, shader_lib_ctx);
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            pipeline.context = processed.decisions;
-            ctx->set_rows_pipelines.emplace(key, pipeline);
-        }
+    auto            it = ctx->set_rows_pipelines.find(key);
+    if (it != ctx->set_rows_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_set_rows_shader(ctx->p, wgsl_set_rows, shader_lib_ctx);
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        pipeline.context = processed.decisions;
+        ctx->set_rows_pipelines.emplace(key, pipeline);
     }
 
     ggml_webgpu_generic_shader_decisions decisions =
@@ -1261,29 +1255,25 @@ static webgpu_command ggml_webgpu_flash_attn(webgpu_context & ctx,
     };
 
     webgpu_pipeline pipeline;
-    // TODO: remove guard once pipeline caches are per-thread
-    {
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->flash_attn_pipelines.find(key);
-        if (it != ctx->flash_attn_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_flash_attn_shader_lib_context shader_lib_ctx = {
-                .key                = key,
-                .sg_mat_m           = ctx->global_ctx->capabilities.sg_mat_m,
-                .sg_mat_n           = ctx->global_ctx->capabilities.sg_mat_n,
-                .sg_mat_k           = ctx->global_ctx->capabilities.sg_mat_k,
-                .wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize,
-                .max_subgroup_size  = ctx->global_ctx->capabilities.max_subgroup_size
-            };
+    auto            it = ctx->flash_attn_pipelines.find(key);
+    if (it != ctx->flash_attn_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_flash_attn_shader_lib_context shader_lib_ctx = {
+            .key                = key,
+            .sg_mat_m           = ctx->global_ctx->capabilities.sg_mat_m,
+            .sg_mat_n           = ctx->global_ctx->capabilities.sg_mat_n,
+            .sg_mat_k           = ctx->global_ctx->capabilities.sg_mat_k,
+            .wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize,
+            .max_subgroup_size  = ctx->global_ctx->capabilities.max_subgroup_size
+        };
 
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_flash_attn_shader(ctx->p, wgsl_flash_attn, shader_lib_ctx);
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            pipeline.context = processed.decisions;
-            ctx->flash_attn_pipelines.emplace(key, pipeline);
-        }
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_flash_attn_shader(ctx->p, wgsl_flash_attn, shader_lib_ctx);
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        pipeline.context = processed.decisions;
+        ctx->flash_attn_pipelines.emplace(key, pipeline);
     }
 
     ggml_webgpu_flash_attn_shader_decisions decisions =
@@ -1308,20 +1298,16 @@ static webgpu_command ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * s
     };
 
     webgpu_pipeline pipeline;
-    {
-        // TODO: remove guard once pipeline caches are per-thread
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->unary_pipelines.find(pipeline_key);
-        if (it != ctx->unary_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_unary_shader(ctx->p, wgsl_unary, shader_lib_ctx);
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            pipeline.context = processed.decisions;
-            ctx->unary_pipelines.emplace(pipeline_key, pipeline);
-        }
+    auto            it = ctx->unary_pipelines.find(pipeline_key);
+    if (it != ctx->unary_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_unary_shader(ctx->p, wgsl_unary, shader_lib_ctx);
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        pipeline.context = processed.decisions;
+        ctx->unary_pipelines.emplace(pipeline_key, pipeline);
     }
 
     ggml_webgpu_generic_shader_decisions decisions =
@@ -1743,19 +1729,15 @@ static webgpu_command ggml_webgpu_argmax(webgpu_context & ctx, ggml_tensor * src
     };
 
     webgpu_pipeline pipeline;
-    {
-        // TODO: remove guard once pipeline caches are per-thread
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->argmax_pipelines.find(shader_lib_ctx.vec4);
-        if (it != ctx->argmax_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_argmax, shader_lib_ctx, "argmax");
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            ctx->argmax_pipelines.emplace(shader_lib_ctx.vec4, pipeline);
-        }
+    auto            it = ctx->argmax_pipelines.find(shader_lib_ctx.vec4);
+    if (it != ctx->argmax_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_argmax, shader_lib_ctx, "argmax");
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        ctx->argmax_pipelines.emplace(shader_lib_ctx.vec4, pipeline);
     }
     uint32_t wg_x = ggml_nelements(dst);
     return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
@@ -1772,9 +1754,8 @@ static webgpu_command ggml_webgpu_argsort(webgpu_context & ctx, ggml_tensor * sr
         .order              = order
     };
 
-    std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-    webgpu_pipeline                       argsort_pipeline;
-    auto                                  it = ctx->argsort_pipelines.find(order);
+    webgpu_pipeline argsort_pipeline;
+    auto            it = ctx->argsort_pipelines.find(order);
     if (it != ctx->argsort_pipelines.end()) {
         argsort_pipeline = it->second;
     } else {
@@ -1963,19 +1944,15 @@ static webgpu_command ggml_webgpu_cumsum(webgpu_context & ctx, ggml_tensor * src
         .max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup,
     };
     webgpu_pipeline pipeline;
-    // TODO: remove guard once pipeline caches are per-thread
-    {
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->cumsum_pipelines.find(1);
-        if (it != ctx->cumsum_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_cumsum, shader_lib_ctx, "cumsum");
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            ctx->cumsum_pipelines.emplace(1, pipeline);
-        }
+    auto            it = ctx->cumsum_pipelines.find(1);
+    if (it != ctx->cumsum_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_cumsum, shader_lib_ctx, "cumsum");
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        ctx->cumsum_pipelines.emplace(1, pipeline);
     }
     uint32_t wg_x = ggml_nrows(dst);
     return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
@@ -2009,19 +1986,15 @@ static webgpu_command ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor * s
     };
 
     webgpu_pipeline pipeline;
-    {
-        // TODO: remove guard once pipeline caches are per-thread
-        std::lock_guard<std::recursive_mutex> lock(ctx->global_ctx->mutex);
-        auto                                  it = ctx->sum_rows_pipelines.find(1);
-        if (it != ctx->sum_rows_pipelines.end()) {
-            pipeline = it->second;
-        } else {
-            ggml_webgpu_processed_shader processed =
-                ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_sum_rows, shader_lib_ctx, "sum_rows");
-            pipeline =
-                ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
-            ctx->sum_rows_pipelines.emplace(1, pipeline);
-        }
+    auto            it = ctx->sum_rows_pipelines.find(1);
+    if (it != ctx->sum_rows_pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        ggml_webgpu_processed_shader processed =
+            ggml_webgpu_preprocess_generic_shader(ctx->p, wgsl_sum_rows, shader_lib_ctx, "sum_rows");
+        pipeline =
+            ggml_webgpu_create_pipeline(ctx->global_ctx->device, processed.wgsl.c_str(), processed.variant.c_str());
+        ctx->sum_rows_pipelines.emplace(1, pipeline);
     }
     uint32_t wg_x = total_sum ? 1 : ggml_nrows(dst);
     return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
@@ -3016,10 +2989,10 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) {
 
 #ifdef GGML_WEBGPU_GPU_PROFILE
     // Initialize buffer pool for timestamp queries, used for profiling
-    ctx->webgpu_global_ctx->timestamp_query_buf_pool.init(ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS,
-                                              WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
-                                              wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
-                                              wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
+    ctx->webgpu_global_ctx->timestamp_query_buf_pool.init(
+        ctx->webgpu_global_ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS, WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
+        wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
+        wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
 #endif
 
     GGML_LOG_INFO(

From b4d05a3d2fc7820444ca618570a3ac76cc12fe83 Mon Sep 17 00:00:00 2001
From: Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
Date: Mon, 2 Feb 2026 07:26:58 +0100
Subject: [PATCH 05/18] spec : various improvements to ngram-map + docs
 (#19253)

* spec: ngram-map and reasoning chats

* spec: add t_begin and t_accept

* ngram-map : add internal hash map

* docs : update ngram-map, add ngram-mod

* docs : fix ngram-map-k

* docs : differences between implementations
---
 common/ngram-map.cpp   | 204 ++++++++++++++++++++++++++++++++++++++---
 common/ngram-map.h     |  35 ++++++-
 common/speculative.cpp |  25 +++--
 docs/speculative.md    |  74 ++++++++++++++-
 4 files changed, 307 insertions(+), 31 deletions(-)

diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index 84fd761367..cab231bad7 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -7,6 +7,18 @@
 #include <cstdio>
 #include <sstream>
 
+// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
+#define LCG_FACTOR 2654435761UL
+
+// Compute the LCG hash of a n-gram of size len at offset start.
+static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
+    uint32_t hash = 0;
+    for (size_t i = 0; i < len; ++i) {
+        hash = hash * LCG_FACTOR + tokens[start + i];
+    }
+    return hash;
+}
+
 // Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
 static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
     std::ostringstream oss;
@@ -115,6 +127,100 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of counted values of a ngram map value.
 #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
 
+void common_ngram_map_begin(
+    common_ngram_map & map, const llama_tokens & tokens) {
+    size_t size_begin = tokens.size();
+
+    LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
+            map.idx_last_check, size_begin, map.keys.size());
+
+    size_t count_map_entries_upd = 0;
+    if (!map.key_map.empty() && size_begin < map.idx_last_check) {
+        if (map.show_key_map_stats) {
+            // Print statistics of hash map key_map.
+            size_t count_nonzero = 0;
+            uint32_t min_idx = UINT32_MAX;
+            uint32_t max_idx = 0;
+            for (size_t i = 0; i < map.key_map.size(); ++i) {
+                uint32_t key_idx = map.key_map[i];
+                if (key_idx != 0) {
+                    ++count_nonzero;
+                    if (key_idx < min_idx) min_idx = key_idx;
+                    if (key_idx > max_idx) max_idx = key_idx;
+                }
+            }
+            if (count_nonzero == 0) {
+                min_idx = 0;
+            }
+            LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
+                    __func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
+        }
+
+        // Update the map from hash to key index (clear outdated entries).
+        for (size_t i = 0; i < map.key_map.size(); ++i) {
+            uint32_t key_idx = map.key_map[i];
+            if (key_idx >= map.size_last_begin) {
+                map.key_map[i] = 0;
+                count_map_entries_upd++;
+            }
+        }
+        map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
+    }
+
+    if (size_begin < map.idx_last_check && !map.keys.empty()) {
+        // The next token generation will start at index size_begin.
+        // The tokens between map.size_last_begin and size_begin are no longer valid.
+        //
+        // Refresh map: Remove all entries with index >= map.size_last_begin.
+        size_t count_keys = map.keys.size();
+        size_t count_keys_del = 0;
+        size_t count_values_del = 0;
+        for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
+            common_ngram_map_key & key = map.keys[i];
+            if (key.key_idx >= map.size_last_begin) {
+                // Delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+                continue;
+            }
+            if (map.key_only) {
+                continue;
+            }
+
+            // Check the indices of the values.
+            for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
+                common_ngram_map_value & value = key.values[j];
+                if (value.value_idx >= map.size_last_begin) {
+                    // Delete the value.
+                    count_values_del++;
+
+                    // Move all values after this value to the left.
+                    for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
+                        key.values[k] = key.values[k + 1];
+                    }
+                    // Clear the last value.
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
+                    key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
+                }
+            }
+            if (key.values[0].value_idx == 0) {
+                // No values left, delete the key.
+                LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
+                map.keys.erase(map.keys.begin() + i);
+                count_keys_del++;
+            }
+        }
+
+        LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
+                map.idx_last_check, size_begin,
+                count_keys, count_keys_del, count_values_del, count_map_entries_upd);
+    }
+
+    map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
+    map.size_last_begin = size_begin;
+}
+
 void common_ngram_map_draft(common_ngram_map & map,
         const llama_tokens & inp, llama_token sampled,
         llama_tokens & draft) {
@@ -129,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
     if (cur_len < static_cast<size_t>(2 * n + m)) {
         return;
     }
+    if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
+        // key_map uses uint32_t instead of size_t.
+        GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
+    }
 
     // Only check every check_rate tokens to save compute
     // i.e., perform check if (cur_len - idx_last_check) >= check_rate
@@ -147,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
 
     // search for the key in the map
     size_t match_pos = 0;
-    for (size_t j = cur_len - n - m - 1; j > 0; --j) {
-        bool match = true;
-        for (size_t k = 0; k < n; ++k) {
-            if (inp[j + k] != key_tokens[k]) {
-                match = false;
-                break;
+    if (map.size_last_begin > cur_len) {
+        GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
+    }
+    if (!map.key_map.empty()) {
+        // Search for the key in the map key_map from hash of ngrams to index of ngram.
+        uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
+        uint32_t idx_key = map.key_map[idx_hash];
+        if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
+            // Check if the key matches the key at idx_key (because of possible collisions).
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[idx_key + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
+            if (match) {
+                match_pos = idx_key;
             }
         }
-        if (match) {
-           match_pos = j;
-           break;
+    }
+    if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
+        // Search for the key in [1, map.size_last_begin - n - m -1], descending.
+        for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+            // Check if the key matches the key.
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+               match_pos = j;
+               break;
+            }
+        }
+    }
+    if (match_pos == 0) {
+        // In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
+        //
+        // Search in [size_last_begin, cur_len - n - m - 1], descending.
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            bool match = true;
+            for (size_t k = 0; k < n; ++k) {
+                if (inp[j + k] != key_tokens[k]) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+               match_pos = j;
+               break;
+            }
         }
     }
     if (match_pos > 0) {
-        LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
+        LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
             cur_len, n, m, key_tokens.size(), sampled, match_pos);
     }
 
+    if (!map.key_map.empty()) {
+        // Add hashes of new ngrams in key_map.
+        //
+        // Use the same order as above.
+        if (map.size_last_begin > (size_t) (n + m + 1)) {
+            for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
+                // compute hash and store index of ngram at idx j in the map.
+                uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+                if (map.key_map[idx_hash] == 0) {
+                    map.key_map[idx_hash] = j; // collisions may occur
+                }
+            }
+        }
+
+        for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
+            // compute hash and store index of ngram at idx j in the map.
+            uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
+            if (map.key_map[idx_hash] == 0) {
+                map.key_map[idx_hash] = j;
+            }
+        }
+        map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
+    }
+
     if (match_pos == 0) {
         return;
     }
@@ -215,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
             draft.push_back(inp[match_pos + n + i]);
         }
 
-        LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
-                key_offset, curr_key.key_num, draft.size());
+        LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
+                curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
 
         map.last_draft_created   = false;
         map.last_draft_key_idx   = key_offset;
@@ -318,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
         }
     }
 
-    if (sum_occur > 0 && max_occur < 3 * sum_occur) {
+    if (sum_occur > 0 && max_occur < 2 * sum_occur) {
         // The most frequent value is not much more frequent than the other values.
         // We do not use the draft.
         return;
diff --git a/common/ngram-map.h b/common/ngram-map.h
index b365034ac5..c094d513d5 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -9,6 +9,8 @@
 // 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
 //    The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
 //
+// ref: https://github.com/ggml-org/llama.cpp/pull/18471
+//
 
 #include "llama.h"
 #include "common.h"
@@ -51,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of m-gram values stored for each key n-gram.
 #define COMMON_NGRAM_MAX_VALUES 4
 
+// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
+#define COMMON_NGRAM_HASH_MAP_SIZE 262144
+
 // statistics of a m-gram after a known n-gram
 struct common_ngram_map_value {
-    size_t   value_idx = 0;  // index of value m-gram in token-history (0 if unused)
-    uint16_t value_num = 0;  // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
+    size_t   value_idx =  0;  // index of value m-gram in token-history (0 if unused)
+    uint16_t value_num =  0;  // number of occurrences of this value m-gram after the key n-gram (0 in an unused values-slot)
     int16_t n_accepted = -1;  // number of accepted tokens at last draft (-1 if unused)
 };
 
@@ -74,23 +79,43 @@ struct common_ngram_map {
 
     bool key_only;       // true if only key n-grams are used, no values.
 
-    // first draft: vector only, no map.
     std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
     uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
     uint16_t min_hits;   // minimum number of key hits to consider a draft
 
+    bool     show_key_map_stats = false; // true, if statistics of the key_map should be printed.
+
     common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
                      uint16_t check_rate, uint16_t min_hits)
         : size_key(sz_key), size_value(sz_value), key_only(only_keys),
-          check_rate(check_rate), min_hits(min_hits) {}
+          check_rate(check_rate), min_hits(min_hits) {
+        key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
+    }
+
+    // In reasoning chats the previous reasoning block will be removed from context history.
+    // A rebuild of the ngram map is needed after that.
+
+    size_t   size_last_begin      = 0; // number of tokens at previous start of generation
 
     bool     last_draft_created   = false; // true if a draft was created at last call.
-    size_t   last_draft_key_idx   = 0; // index of last key used for draft generation.
+    size_t   last_draft_key_idx   = 0; // index of last key used for draft generation (0 = no draft)
     uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
 
     size_t   idx_last_check       = 0; // index of last check in context history
+
+    // optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
+    //
+    // uint32_t instead of size_t (size of current histories is << UINT32_MAX)
+    std::vector<uint32_t> key_map;              // key_map[hash] = index of ngram in context window
+    uint32_t              key_map_last_idx = 0; // index of the last ngram added to key_map
 };
 
+// Initialize the n-gram map with the given token history.
+// map:                the ngram map to initialize.
+// tokens:             the token history to base the map on.
+void common_ngram_map_begin(
+    common_ngram_map & map,
+    const llama_tokens & tokens);
 
 // Searches for the n-gram in the history and checks whether a draft sequence should be generated.
 // map:                the ngram map to search in.
diff --git a/common/speculative.cpp b/common/speculative.cpp
index a1a3b51c13..152aaa48d4 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -124,9 +124,9 @@ struct common_speculative_state {
     // TODO: track performance of most recent calls
     const bool gen_perf = true; // whether to generate performance stats.
 
-    // TODO: rename to t_draft_us
-    // TODO: add t_begin_us, t_accept_us
-    int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
+    int64_t t_begin_us  = 0; // total time spent in refresh of this implementation in microseconds.
+    int64_t t_draft_us  = 0; // total time spent in generating drafts in this implementation in microseconds.
+    int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds.
 
     common_speculative_state(enum common_speculative_type type) : type(type) {}
 
@@ -499,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
         : common_speculative_state(type), map(std::move(map)) {}
 
     void begin(const llama_tokens & prompt) override {
-        GGML_UNUSED(prompt);
+        common_ngram_map_begin(map, prompt);
     }
 
     void draft(
@@ -951,7 +951,12 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
     }
 
     for (auto & impl : spec->impls) {
+        const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
+
         impl->begin(prompt);
+
+        const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+        impl->t_begin_us += t_now_us - t_start_us; // accumulate duration for this refresh
     }
 }
 
@@ -973,7 +978,7 @@ llama_tokens common_speculative_draft(
             const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
 
             impl->drafts_call_count++;
-            impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
+            impl->t_draft_us += t_now_us - t_start_us; // accumulate duration for this implementation
         }
 
         if (!result.empty()) {
@@ -1001,12 +1006,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
 
     GGML_ASSERT(impl);
 
+    const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
     if (n_accepted > 0) {
         impl->drafts_accepted_count++;
         impl->drafts_accepted_tokens += n_accepted;
     }
 
     impl->accept(n_accepted);
+    const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
+    impl->t_accept_us += t_now_us - t_start_us; // accumulate duration for this accumulation
 }
 
 void common_speculative_print_stats(const common_speculative * spec) {
@@ -1018,13 +1026,14 @@ void common_speculative_print_stats(const common_speculative * spec) {
         std::string str_perf;
         if (impl->gen_perf) {
             std::ostringstream oss;
-            oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
-            str_perf = ", dur = " + oss.str() + " ms";
+            oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
+            oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
+            str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
         } else {
             str_perf = "";
         }
 
-        // TODO: report time for begin() and accept()
         LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
                 common_speculative_type_to_str(impl->type).c_str(),
                 impl->drafts_call_count,
diff --git a/docs/speculative.md b/docs/speculative.md
index 8281eaa2d3..03afab5b41 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
 
 ## Implementations
 
-The `llama-server` application supports several implementations of speculative decoding:
+The `llama-server` application supports several implementations of speculative decoding. An implementation with draft model can be mixed with an implementation without draft model.
 
 ### Draft Model (`draft`)
 
@@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
 
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
 
+```
+llama-server [...] --spec-type ngram-simple --draft-max 64
+```
+
 #### n-gram Map Key (`ngram-map-k`)
 
-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
 
 The number of accepted tokens is stored for each used n-gram.
 
+**Example:**
+```
+llama-server [...] --spec-type ngram-map-k --draft-max 64
+```
+
 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
 
 This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
@@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
 The number of accepted tokens is stored for each used n-gram.
 
 **Example:** Server options to be used if there are a lot of longer repetitions.
-```bash
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
+```
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
 ```
 
+### n-gram Mod (`ngram-mod`)
+
+This implementation uses a basic n-gram hasher for speculative decoding:
+
+- For each ngram, compute a hash using LCG
+- For each computed hash, store the next token
+- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage
+
+Some characteristics:
+
+- Lightweight (~16 MB)
+- Constant memory and complexity
+- Can generate variable draft lengths (i.e. m is not fixed)
+
+Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
+
+**Sample usage:**
+
+```
+# notes:
+# - small `n` are not recommended
+# - MoEs require long drafts
+# - dense models: can reduce `--draft-min` and `--draft-max`
+
+llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+```
+
+Applications:
+
+- Iterating over a block of text/code (e.g. in llama.vim)
+- Reasoning models (when they have to repeat their thinking in the final answer)
+- Summarization
+
+Example Video:
+
+- See #19164
+
+### Differences between ngram-simple, ngram-map and ngram-mod
+
+- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
+- ngram-map-k looks for a previous matching n-gram and inserts the following m-gram but uses an internal hash-map of n-grams in the current context window.
+- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not the next m-gram as in ngram-map).
 
 ## Command-Line Options
 
 If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
 
 ```
---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
+--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_DRAFT_MAX)
+--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
+                                        (default: 0)
+                                        (env: LLAMA_ARG_DRAFT_MIN)
+[...]
+--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                         type of speculative decoding to use when no draft model is provided
                                         (default: none)
 --spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
@@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
 | `ngram-simple` | Use simple n-gram pattern matching |
 | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
 | `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
+| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
 
 **Example:** Server-instance used to refactor source code.
 ```bash
@@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
 statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
 ```
 
+```
+draft acceptance rate = 0.70312 (   90 accepted /   128 generated)
+statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
+```
+
 - `#calls`: number of calls of this implementations
 - `#gen drafts`: number of drafts generated by this implementation
 - `#acc drafts`: number of drafts accepted (partially) by the main model
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
+- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).
 

From 7a4ca3cbd905907ac4d12bc14b878fdfbe4fd1d6 Mon Sep 17 00:00:00 2001
From: Christian Kastner <ckk@kvr.at>
Date: Mon, 2 Feb 2026 07:38:55 +0100
Subject: [PATCH 06/18] docs : Minor cleanups (#19252)

* Update old URLs to github.com/ggml-org/

* Bump copyrights
---
 LICENSE                                                   | 2 +-
 docs/multimodal/minicpmo2.6.md                            | 2 +-
 docs/multimodal/minicpmo4.0.md                            | 4 ++--
 docs/multimodal/minicpmv2.5.md                            | 2 +-
 docs/multimodal/minicpmv2.6.md                            | 2 +-
 docs/multimodal/minicpmv4.0.md                            | 4 ++--
 docs/multimodal/minicpmv4.5.md                            | 4 ++--
 examples/deprecation-warning/README.md                    | 2 +-
 examples/deprecation-warning/deprecation-warning.cpp      | 2 +-
 examples/json_schema_to_grammar.py                        | 2 +-
 ggml/include/ggml-cann.h                                  | 2 +-
 ggml/include/ggml.h                                       | 2 +-
 ggml/src/ggml-cann/acl_tensor.cpp                         | 2 +-
 ggml/src/ggml-cann/acl_tensor.h                           | 2 +-
 ggml/src/ggml-cann/aclnn_ops.cpp                          | 2 +-
 ggml/src/ggml-cann/aclnn_ops.h                            | 2 +-
 ggml/src/ggml-cann/common.h                               | 2 +-
 ggml/src/ggml-cann/ggml-cann.cpp                          | 2 +-
 ggml/src/ggml-metal/CMakeLists.txt                        | 2 +-
 ggml/src/ggml-opencl/ggml-opencl.cpp                      | 2 +-
 ggml/src/ggml-sycl/ggml-sycl.cpp                          | 2 +-
 .../src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp | 2 +-
 ggml/src/ggml.c                                           | 2 +-
 src/llama-chat.cpp                                        | 2 +-
 src/llama-hparams.h                                       | 2 +-
 src/llama-vocab.cpp                                       | 8 ++++----
 src/models/deepseek2.cpp                                  | 2 +-
 tests/test-autorelease.cpp                                | 2 +-
 tools/cvector-generator/pca.hpp                           | 2 +-
 tools/export-lora/export-lora.cpp                         | 2 +-
 tools/perplexity/README.md                                | 2 +-
 tools/server/public_legacy/index-new.html                 | 2 +-
 tools/server/public_legacy/index.html                     | 2 +-
 tools/server/public_legacy/theme-mangotango.css           | 2 +-
 tools/server/themes/buttons-top/index.html                | 2 +-
 tools/server/themes/wild/index.html                       | 2 +-
 36 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/LICENSE b/LICENSE
index acb96ce78e..e7dca554bc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2023-2026 The ggml authors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/multimodal/minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md
index 5e74058e5d..ce003b2ebc 100644
--- a/docs/multimodal/minicpmo2.6.md
+++ b/docs/multimodal/minicpmo2.6.md
@@ -9,7 +9,7 @@ Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch m
 ### Build llama.cpp
 Readme modification time: 20250206
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
diff --git a/docs/multimodal/minicpmo4.0.md b/docs/multimodal/minicpmo4.0.md
index 49125ea05e..a5281779c2 100644
--- a/docs/multimodal/minicpmo4.0.md
+++ b/docs/multimodal/minicpmo4.0.md
@@ -8,11 +8,11 @@ Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model
 ### Build llama.cpp
 Readme modification time: 20250206
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
diff --git a/docs/multimodal/minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md
index 5eb87bc969..096f070a1c 100644
--- a/docs/multimodal/minicpmv2.5.md
+++ b/docs/multimodal/minicpmv2.5.md
@@ -8,7 +8,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
 ### Build llama.cpp
 Readme modification time: 20250206
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
diff --git a/docs/multimodal/minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md
index bc874bbd8c..a7db9c58db 100644
--- a/docs/multimodal/minicpmv2.6.md
+++ b/docs/multimodal/minicpmv2.6.md
@@ -8,7 +8,7 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch m
 ### Build llama.cpp
 Readme modification time: 20250206
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
diff --git a/docs/multimodal/minicpmv4.0.md b/docs/multimodal/minicpmv4.0.md
index d04cb338ce..1d21b8cfdf 100644
--- a/docs/multimodal/minicpmv4.0.md
+++ b/docs/multimodal/minicpmv4.0.md
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
 ### Build llama.cpp
 Readme modification time: 20250731
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
diff --git a/docs/multimodal/minicpmv4.5.md b/docs/multimodal/minicpmv4.5.md
index 8fea5e611d..a102c0fa51 100644
--- a/docs/multimodal/minicpmv4.5.md
+++ b/docs/multimodal/minicpmv4.5.md
@@ -8,11 +8,11 @@ Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch m
 ### Build llama.cpp
 Readme modification time: 20250826
 
-If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md
index 59918ec2bb..9a1b263e8e 100644
--- a/examples/deprecation-warning/README.md
+++ b/examples/deprecation-warning/README.md
@@ -1,7 +1,7 @@
 # Migration notice for binary filenames
 
 > [!IMPORTANT]
-[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggml-org/llama.cpp/pull/7809)
 
 This migration was important, but it is a breaking change that may not always be immediately obvious to users.
 
diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp
index c2958ea12d..11f5147328 100644
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@@ -28,7 +28,7 @@ int main(int argc, char** argv) {
     fprintf(stdout, "\n");
     fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
     fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
-    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, " See https://github.com/ggml-org/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
     fprintf(stdout, "\n");
 
     return EXIT_FAILURE;
diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py
index 886dd3d81e..9fc90a3c98 100755
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -402,7 +402,7 @@ class SchemaConverter:
             Transforms a regular expression pattern into a GBNF rule.
 
             Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
-            Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+            Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
 
             Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
 
diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h
index b469e228d0..74af465337 100644
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1988d16dc4..f759e2d588 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -6,7 +6,7 @@
 // This documentation is still a work in progress.
 // If you wish some specific topics to be covered, feel free to drop a comment:
 //
-//   https://github.com/ggerganov/whisper.cpp/issues/40
+//   https://github.com/ggml-org/whisper.cpp/issues/40
 //
 // ## Overview
 //
diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
index 7b7042a1f5..e95d3c4d88 100644
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
index 7deac38342..4737773a4d 100644
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 02867e4fdb..87ac05748e 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index b76e4707ac..3effa1c289 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h
index fb3e7572e2..0120f0dfd1 100644
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 42c6c67a40..6b2dbdd359 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024 The ggml authors
+ * Copyright (c) 2023-2026 The ggml authors
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt
index 9c0b3db859..42054d841a 100644
--- a/ggml/src/ggml-metal/CMakeLists.txt
+++ b/ggml/src/ggml-metal/CMakeLists.txt
@@ -71,7 +71,7 @@ else()
         #       disabling fast math is needed in order to pass tests/test-backend-ops
         # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
         # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
+        #       ref: https://github.com/ggml-org/whisper.cpp/issues/1720
         # note: adding -g causes segmentation fault during compile
         #set(XC_FLAGS -fno-fast-math -fno-inline -g)
         set(XC_FLAGS -fno-fast-math -fno-inline)
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 4850c11d14..0f0eb3a9d8 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3740,7 +3740,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
         // Reuse extra of the parent tensor. The offset of this view tensor
         // becomes `extra->offset + view_offs` and needs to be calculated when
         // it is used. This changes is needed because of the change to
-        // ggml_alloc.c in https://github.com/ggerganov/llama.cpp/pull/7640.
+        // ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
         // `buffer` passed in here will always be `tensor->buffer`. It is OK
         // to allocate extras from the same buffer context for ordinary
         // intermediate tensors. But for views into kv cache tensors, doing so
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 74b4ed91cc..12f1e7717b 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3390,7 +3390,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
 
 
     // mmvq and mmq need the __dp4a instruction which is available for gen12+
-    // Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
+    // Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
     use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
 #ifdef SYCL_USE_XMX
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index bbdbf9dcaa..ca486a288a 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -330,7 +330,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p
         std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", target_env, in_path, "-o", out_path};
     #endif
 
-    // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734
+    // disable spirv-opt for coopmat shaders for https://github.com/ggml-org/llama.cpp/issues/10734
     // disable spirv-opt for bf16 shaders for https://github.com/ggml-org/llama.cpp/issues/15344
     // disable spirv-opt for rope shaders for https://github.com/ggml-org/llama.cpp/issues/16860
     if (!coopmat && name.find("bf16") == std::string::npos && name.find("rope") == std::string::npos) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1725ad1654..e1471b540e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -6562,7 +6562,7 @@ static void ggml_compute_backward(
         case GGML_OP_DIAG_MASK_INF: {
             if (src0_needs_grads) {
                 /* ggml_diag_mask_inf_impl() shouldn't be here */
-                /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
+                /* ref:  https://github.com/ggml-org/llama.cpp/pull/4203#discussion_r1412377992 */
                 const int n_past = ((const int32_t *) tensor->op_params)[0];
                 ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
             }
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 3c7e0afdae..c415a998f3 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -233,7 +233,7 @@ int32_t llm_chat_apply_template(
     llm_chat_template tmpl,
     const std::vector<const llama_chat_message *> & chat,
     std::string & dest, bool add_ass) {
-    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
+    // Taken from the research: https://github.com/ggml-org/llama.cpp/issues/5527
     std::stringstream ss;
     if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
         // chatml template
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index caed0ec1b7..dfbc7d95e9 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -195,7 +195,7 @@ struct llama_hparams {
     uint32_t n_deepstack_layers = 0;
 
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
-    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
+    // ref: https://github.com/ggml-org/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
     uint32_t    dec_n_layer        = 0;
 
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a23950d007..74a8496f9e 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -90,7 +90,7 @@ static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not
 //
 // SPM tokenizer
 // original implementation:
-// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
+// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
 //
 
 struct llm_bigram_spm {
@@ -285,7 +285,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     // original regex from tokenizer.json
                     //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
 
-                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
@@ -2390,7 +2390,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         // maintain a list of tokens that cause end-of-generation
         // this is currently determined based on the token text, which is obviously not ideal
-        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        // ref: https://github.com/ggml-org/llama.cpp/issues/9606
         special_eog_ids.clear();
 
         if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
@@ -3079,7 +3079,7 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
 }
 
 int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
-    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    // ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
     const llama_token_attr attr = token_get_attr(token);
     if (!special && (attr & attr_special)) {
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp
index 297dca5136..987f449934 100644
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -14,7 +14,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     const uint32_t kv_lora_rank = hparams.n_lora_kv;
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+    // See https://github.com/ggml-org/llama.cpp/discussions/7416 for detailed explanation.
     // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
 
     // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
index 35b09aaeac..ca87c56a8f 100644
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@@ -1,4 +1,4 @@
-// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
+// ref: https://github.com/ggml-org/llama.cpp/issues/4952#issuecomment-1892864763
 
 #include <cstdio>
 #include <string>
diff --git a/tools/cvector-generator/pca.hpp b/tools/cvector-generator/pca.hpp
index e88bbdde93..afd3bf6380 100644
--- a/tools/cvector-generator/pca.hpp
+++ b/tools/cvector-generator/pca.hpp
@@ -290,7 +290,7 @@ static void power_iteration(
     ggml_gallocr_free(allocr);
 
     // TODO @ngxson : The output vector is randomly inverted
-    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
+    // Solution: https://github.com/ggml-org/llama.cpp/pull/8069#issuecomment-2185328171
 }
 
 static void run_pca(
diff --git a/tools/export-lora/export-lora.cpp b/tools/export-lora/export-lora.cpp
index f038019b00..41f426208f 100644
--- a/tools/export-lora/export-lora.cpp
+++ b/tools/export-lora/export-lora.cpp
@@ -190,7 +190,7 @@ struct lora_merge_ctx {
         gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
 
         // check if all lora adapters have the same tensors
-        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
+        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggml-org/llama.cpp/pull/8607#discussion_r1686027777
         static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
         if (adapters.size() > 1) {
             for (size_t i = 1; i < adapters.size(); ++i) {
diff --git a/tools/perplexity/README.md b/tools/perplexity/README.md
index 33a46d1a2e..eb3846072e 100644
--- a/tools/perplexity/README.md
+++ b/tools/perplexity/README.md
@@ -29,7 +29,7 @@ In addition to the KL divergence the following statistics are calculated with `-
 * Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
 * Pearson correlation coefficient of the "correct" token probabilites between models.
 * Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
-* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
+* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggml-org/llama.cpp/discussions/2875 .
 * Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
 
 ## LLaMA 3 8b Scoreboard
diff --git a/tools/server/public_legacy/index-new.html b/tools/server/public_legacy/index-new.html
index cbfbbdf280..e2f39d6687 100644
--- a/tools/server/public_legacy/index-new.html
+++ b/tools/server/public_legacy/index-new.html
@@ -1096,7 +1096,7 @@ return html`
           </section>
           <footer>
             <p><${ModelGenerationInfo} /></p>
-            <p>Powered By <a href="https://github.com/ggerganov/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
+            <p>Powered By <a href="https://github.com/ggml-org/llama.cpp#readme" target="_blank">llama.cpp</a> and <a href="https://ggml.ai/" target="_blank">ggml.ai</a></p>
           </footer>
         </div>
       `;
diff --git a/tools/server/public_legacy/index.html b/tools/server/public_legacy/index.html
index 75f39330a7..98d56ea8b1 100644
--- a/tools/server/public_legacy/index.html
+++ b/tools/server/public_legacy/index.html
@@ -1281,7 +1281,7 @@
 
           <footer>
             <p><${ModelGenerationInfo} /></p>
-            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+            <p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
           </footer>
         </div>
       `;
diff --git a/tools/server/public_legacy/theme-mangotango.css b/tools/server/public_legacy/theme-mangotango.css
index e433802453..315daf734a 100755
--- a/tools/server/public_legacy/theme-mangotango.css
+++ b/tools/server/public_legacy/theme-mangotango.css
@@ -1,5 +1,5 @@
 /* Author: Yazan Agha-Schrader */
-/* Inspiration from llama.cpp logo/banner https://github.com/ggerganov/llama.cpp#readme */
+/* Inspiration from llama.cpp logo/banner https://github.com/ggml-org/llama.cpp#readme */
 
 .theme-mangotango {
 
diff --git a/tools/server/themes/buttons-top/index.html b/tools/server/themes/buttons-top/index.html
index 3fb88fcc88..cb5af587aa 100644
--- a/tools/server/themes/buttons-top/index.html
+++ b/tools/server/themes/buttons-top/index.html
@@ -1032,7 +1032,7 @@
 
           <footer>
             <p><${ModelGenerationInfo} /></p>
-            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+            <p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
           </footer>
         </div>
       `;
diff --git a/tools/server/themes/wild/index.html b/tools/server/themes/wild/index.html
index 73f36d4b29..601f7762cd 100644
--- a/tools/server/themes/wild/index.html
+++ b/tools/server/themes/wild/index.html
@@ -1036,7 +1036,7 @@
 
           <footer>
             <p><${ModelGenerationInfo} /></p>
-            <p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
+            <p>Powered by <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
           </footer>
         </div>
       `;

From 1239267cc4e5a1c9fc6546825eefe13c856e7458 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Feb 2026 08:51:25 +0200
Subject: [PATCH 07/18] authors : update (#19263)

[no ci]
---
 AUTHORS | 1085 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 749 insertions(+), 336 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 0af9f44ad4..c297f3c217 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,127 +1,228 @@
-# date: Sat Mar  8 18:23:52 EET 2025
+# date: Mon Feb  2 08:45:04 EET 2026
 # this file is auto-generated by scripts/gen-authors.sh
 
+Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
+杨朱 · Kiki <baofa.fan@daocloud.io>
+エシュナヴァリシア <148695646+eternaphia@users.noreply.github.com>
+吴小白 <296015668@qq.com>
+源文雨 <41315874+fumiama@users.noreply.github.com>
+蕭澧邦 <45505768+shou692199@users.noreply.github.com>
+도로로도로또 <60079918+dororodoroddo@users.noreply.github.com>
+손희준 <openingnow@naver.com>
+谢乃闻 <sienaiwun@users.noreply.github.com>
 0cc4m <picard12@live.de>
+0Marble <85058989+0Marble@users.noreply.github.com>
 0xspringtime <110655352+0xspringtime@users.noreply.github.com>
 20kdc <asdd2808@gmail.com>
+2114L3 <2114L3@users.noreply.github.com>
 2f38b454 <dxf@protonmail.com>
 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
 44670 <44670@users.noreply.github.com>
+4onen <11580688+4onen@users.noreply.github.com>
 65a <10104049+65a@users.noreply.github.com>
 708-145 <40387547+708-145@users.noreply.github.com>
-AN Long <aisk@users.noreply.github.com>
-AT <manyoso@users.noreply.github.com>
+a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+a3sh <38979186+A3shTnT@users.noreply.github.com>
+aa956 <aa956@users.noreply.github.com>
+Aadeshveer Singh <24b0926@iitb.ac.in>
+Aadeshveer Singh <aadeshveer07@gmail.com>
 Aarni Koskela <akx@iki.fi>
 Aaron Miller <apage43@ninjawhale.com>
 Aaron Teo <57927438+taronaeo@users.noreply.github.com>
+Aaron Teo <aaron.teo1@ibm.com>
 Aaryaman Vasishta <aaryaman.vasishta@amd.com>
 Abheek Gulati <abheekg@hotmail.com>
 Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
 Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
+Acly <aclysia@gmail.com>
+Adam <channeladam@users.noreply.github.com>
+adel boussaken <netdur@gmail.com>
 Adithya Balaji <adithya.b94@gmail.com>
 AdithyanI <adithyan.i4internet@gmail.com>
 Adrian <smith.adriane@gmail.com>
 Adrian Hesketh <a-h@users.noreply.github.com>
 Adrian Kretz <me@akretz.com>
+Adrian Lundberg <47256989+alundb@users.noreply.github.com>
 Adrien Gallouët <adrien@gallouet.fr>
 Adrien Gallouët <angt@huggingface.co>
+afrideva <95653597+afrideva@users.noreply.github.com>
+ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
+agray3 <agray3@users.noreply.github.com>
 Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
 Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
+ai-fonsi <length-amiss-7k@icloud.com>
+Aidan <99101158+gSUz92nc@users.noreply.github.com>
 AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
 AidanBeltonS <aidan.belton@codeplay.com>
 Aisuko <urakiny@gmail.com>
 Akarshan Biswas <akarshan.biswas@gmail.com>
 Akarshan Biswas <akarshan@menlo.ai>
 Akarshan Biswas <akarshanbiswas@fedoraproject.org>
+akawrykow <142945436+akawrykow@users.noreply.github.com>
 Al Mochkin <14274697+amochkin@users.noreply.github.com>
+Alan Gray <agray3@users.noreply.github.com>
+Alawode Oluwandabira <dabiraalawode@yahoo.com>
 Albert Jin <albert.jin@gmail.com>
 Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alberto Cabrera Pérez <1478977+Alcpz@users.noreply.github.com>
 Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
 Alberto Cabrera Pérez <alberto.cabrera@intel.com>
+Alberto Cabrera Pérez <alberto.cabrera@liquid.ai>
+Aldehir Rojas <hello@alde.dev>
+alek3y <44779186+alek3y@users.noreply.github.com>
+Aleksander Grygier <aleksander.grygier@gmail.com>
 Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
+Alessandro98-git <61804547+Alessandro98-git@users.noreply.github.com>
 Alex <awhill19@icloud.com>
 Alex Azarov <alex@azarov.by>
 Alex Azarov <alexander.azarov@mapbox.com>
 Alex Brooks <alex.brooks@ibm.com>
+Alex Fanthome <xfanth@gmail.com>
 Alex Klinkhamer <from.github.com.917@grencez.dev>
 Alex Klinkhamer <git@grencez.dev>
 Alex Nguyen <tiendung@users.noreply.github.com>
 Alex O'Connell <35843486+acon96@users.noreply.github.com>
 Alex Petenchea <alex.petenchea@gmail.com>
 Alex Renda <alexrenda@users.noreply.github.com>
+Alex Trotta <44127594+Ahajha@users.noreply.github.com>
 Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com>
 Alex von Gluck IV <kallisti5@unixzen.com>
+Alex Wu <dindinw@users.noreply.github.com>
+alex-spacemit <jinghui.huang@spacemit.com>
 Alexey Parfenov <zxed@alkatrazstudio.net>
+Alexis Williams <typedrat@users.noreply.github.com>
+alexpinel <93524949+alexpinel@users.noreply.github.com>
+Alfred <zxu3@clemson.edu>
 Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
 Ali Nehzat <ali.nehzat@thanks.dev>
 Ali Tariq <ali.tariq@10xengineers.ai>
+Ali Tariq <alitariq4589@gmail.com>
 Alon <alonfaraj@gmail.com>
+alonfaraj <alonfaraj@gmail.com>
 AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+alwqx <kenan3015@gmail.com>
+Aman <amangupta052@gmail.com>
+Aman Gupta <amangupta052@gmail.com>
+amd-dwang <dong.wang@amd.com>
+amd-lalithnc <lalithnc@amd.com>
 Amir <amir_zia@outlook.com>
+amirai21 <89905406+amirai21@users.noreply.github.com>
 AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
+amritahs-ibm <amritahs@linux.vnet.ibm.com>
+AN Long <aisk@users.noreply.github.com>
 Ananta Bastola <anantarajbastola@gmail.com>
 Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
+Anav Prasad <anavp@nvidia.com>
+anavp-nvidia <anavp@nvidia.com>
+Andika Wasisto <andika@wasisto.com>
 András Salamon <ott2@users.noreply.github.com>
 Andreas (Andi) Kunar <andreask@msn.com>
 Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
 Andrei <abetlen@gmail.com>
+Andrew Aladjev <aladjev.andrew@gmail.com>
 Andrew Canis <andrew.canis@gmail.com>
 Andrew Downing <andrew2085@gmail.com>
 Andrew Duffy <a10y@users.noreply.github.com>
 Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Andrew Marshall <andrew@johnandrewmarshall.com>
 Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
+andrijdavid <david@geek.mg>
 Andy Salerno <andysalerno@gmail.com>
 Andy Tai <andy-tai@users.noreply.github.com>
+Ankur Verma <31362771+ankurvdev@users.noreply.github.com>
+anon998 <131767832+anon998@users.noreply.github.com>
+Anri Lombard <anri.m.lombard@gmail.com>
+Anthony Umfer <aumfer@gmail.com>
 Anthony Van de Gejuchte <anthonyvdgent@gmail.com>
+antichristHater <142441588+antichristHater@users.noreply.github.com>
 Antoine Viallon <antoine@lesviallon.fr>
+Anton Mitkov <anton_b_mitkov@abv.bg>
+Anton Mitkov <anton.mitkov@codeplay.com>
 Antonis Makropoulos <benuix@gmail.com>
+Anudit Nagar <nagaranudit@gmail.com>
+anzz1 <anzz1@live.com>
+apaz <aarpazdera@gmail.com>
+apcameron <37645737+apcameron@users.noreply.github.com>
+arch-btw <57669023+arch-btw@users.noreply.github.com>
+arcrank <arcrank@gmail.com>
+ardfork <134447697+ardfork@users.noreply.github.com>
 Arik Poznanski <arikpoz@users.noreply.github.com>
+arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
 Armen Kaleshian <kriation@users.noreply.github.com>
 Artem <guinmoon@gmail.com>
 Artem Zinnatullin <ceo@abstractny.gay>
 Artyom Lebedev <vagran.ast@gmail.com>
+aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
 Asbjørn Olling <asbjornolling@gmail.com>
 Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
 Asghar Ghorbani <a-ghorbani@users.noreply.github.com>
 Ashish <1856117+ashishdatta@users.noreply.github.com>
 Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
 Ashraful Islam <ashraful.meche@gmail.com>
+AT <manyoso@users.noreply.github.com>
+at8u <129688334+at8u@users.noreply.github.com>
+Atharva Dubey <atharva.dubey@codeplay.com>
 Atsushi Tatsuma <yoshoku@outlook.com>
+aubreyli <aubreylee@gmail.com>
 Austin <77757836+teleprint-me@users.noreply.github.com>
 AustinMroz <austinmroz@utexas.edu>
-BADR <contact@pythops.com>
-BB-fat <45072480+BB-fat@users.noreply.github.com>
+automaticcat <daogiatuank54@gmail.com>
+awatuna <23447591+awatuna@users.noreply.github.com>
+b4b4o <zwbao@foxmail.com>
 Bach Le <bach@bullno1.com>
+BADR <contact@pythops.com>
+bagheera <59658056+bghira@users.noreply.github.com>
 Bailey Chittle <39804642+bachittle@users.noreply.github.com>
+bandoti <141645996+bandoti@users.noreply.github.com>
 BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Bart Louwers <bart.louwers@gmail.com>
+Bartowski <3266127+bartowski1182@users.noreply.github.com>
 Bartowski <ckealty1182@gmail.com>
+Bas Nijholt <basnijholt@gmail.com>
+bashayer hijji <bashayer.hijji@gmail.com>
+BB-fat <45072480+BB-fat@users.noreply.github.com>
 Behnam M <58621210+ibehnam@users.noreply.github.com>
+beiller <beiller@gmail.com>
+Beinsezii <39478211+Beinsezii@users.noreply.github.com>
 Ben Ashbaugh <ben.ashbaugh@intel.com>
+Ben Chen <chanben04gz@gmail.com>
 Ben Garney <bengarney@users.noreply.github.com>
 Ben Siraphob <bensiraphob@gmail.com>
 Ben Williams <ben@719ben.com>
 Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
 Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Benni <73313922+BenjaminBruenau@users.noreply.github.com>
 Benson Wong <mostlygeek@gmail.com>
 Bernat Vadell <hounter.caza@gmail.com>
 Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
 Bert Wagner <github@bertwagner.com>
+bhubbb <79117352+bhubbb@users.noreply.github.com>
 Billel Mokeddem <billel.mokeddem.ml@gmail.com>
 Bingan <70050083+binganao@users.noreply.github.com>
+Bizhao Shi <37729561+shibizhao@users.noreply.github.com>
 Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com>
+Björn Ganster <mail@bjoern-ganster.de>
+bmwl <brian.marshall@tolko.com>
+Bo Zheng <368586905@qq.com>
+bobqianic <129547291+bobqianic@users.noreply.github.com>
 Bodhi <3882561+BodhiHu@users.noreply.github.com>
 Bodo Graumann <mail@bodograumann.de>
+Boian Berberov <7432115+bberberov@users.noreply.github.com>
 Bono Lv <lvscar@users.noreply.github.com>
 Borislav Stanimirov <b.stanimirov@abv.bg>
 Borislav Stanimirov <b@ibob.bg>
+Bowen Han <fancycode@gmail.com>
 Branden Butler <bwtbutler@hotmail.com>
 Brandon Squizzato <35474886+bsquizz@users.noreply.github.com>
 Brian <mofosyne@gmail.com>
 Brian Cunnie <brian.cunnie@gmail.com>
 Bruce MacDonald <brucewmacdonald@gmail.com>
+brucepro <git@brucepro.net>
 Bryan Honof <bryanhonof@gmail.com>
-CJ Pais <cj@cjpais.com>
-CRD716 <crd716@gmail.com>
+bryanSwk <93190252+bryanSwk@users.noreply.github.com>
+bsilvereagle <bsilvereagle@users.noreply.github.com>
+bssrdf <merlintiger@hotmail.com>
+byte-6174 <88070277+byte-6174@users.noreply.github.com>
 Calvin Laurenson <calvin@laurenson.dev>
 Cameron <csteele@steelecameron.com>
 Cameron Kaiser <classilla@users.noreply.github.com>
@@ -132,20 +233,33 @@ CarterLi999 <664681047@qq.com>
 Casey Primozic <casey@cprimozic.net>
 Casey Primozic <me@ameo.link>
 CausalLM <148736309+CausalLM@users.noreply.github.com>
+ccbinn <ccbinn@163.com>
+cduk <19917266+cduk@users.noreply.github.com>
+cebtenzzre <cebtenzzre@gmail.com>
 Cebtenzzre <cebtenzzre@gmail.com>
 CentricStorm <CentricStorm@users.noreply.github.com>
 Chad Brewbaker <crb002@gmail.com>
+Chad Voegele <chadvoegele@users.noreply.github.com>
+chaihahaha <chai836275709@gmail.com>
 Changyeon Kim <cyzero.kim@samsung.com>
+chansikpark <chansik.park@gmail.com>
 Chao Jiang <jc19chaoj@zoho.com>
+characharm <123120856+characharm@users.noreply.github.com>
 Charles Duffy <charles@dyfis.net>
 Charles Xu <63788048+chaxu01@users.noreply.github.com>
 Charles Xu <charles.xu@arm.com>
+chen fan <350211548@qq.com>
 Chen Xi <xi2.chen@intel.com>
 Chen Xi <xixichen08@foxmail.com>
 Cheng Shao <terrorjack@type.dance>
+Chenguang Li <757486878@qq.com>
 Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
+chiranko <96988916+chiranko@users.noreply.github.com>
 Chris Elrod <elrodc@gmail.com>
 Chris Kuehl <ckuehl@ckuehl.me>
+Chris Peterson <cpeterson@mozilla.com>
+Chris Rohlf <chris.rohlf@gmail.com>
+Chris Thompson <christopherthompson81@gmail.com>
 Christian Demsar <christian@github.email.demsar.us>
 Christian Demsar <crasm@git.vczf.us>
 Christian Falch <875252+chrfalch@users.noreply.github.com>
@@ -155,260 +269,466 @@ Christian Kögler <ck3d@gmx.de>
 Christian Köhnenkamp <cvk5@me.com>
 Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
 Christopher Nielsen <62156882+mascguy@users.noreply.github.com>
+City <125218114+city96@users.noreply.github.com>
+CJ Pais <cj@cjpais.com>
 Clark Saben <76020733+csaben@users.noreply.github.com>
 Clauszy <zhangyub@uniontech.com>
+clibdev <52199778+clibdev@users.noreply.github.com>
 Clint Herron <hanclinto@gmail.com>
+clyang <clyang@clyang.net>
+cmdr2 <secondary.cmdr2@gmail.com>
+cmdr2 <shashank.shekhar.global@gmail.com>
+cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+codezjx <code.zjx@gmail.com>
+coezbek <c.oezbek@gmail.com>
+comex <comexk@gmail.com>
+compilade <113953597+compilade@users.noreply.github.com>
+compilade <git@compilade.net>
+Congcong Cai <congcongcai0907@163.com>
 Conrad Kramer <conrad@conradkramer.com>
+Copilot <198982749+Copilot@users.noreply.github.com>
 Corentin REGAL <corentin.regal@gmail.com>
+cpumaxx <163466046+cpumaxx@users.noreply.github.com>
+crasm <crasm@git.vczf.net>
+crasm <crasm@git.vczf.us>
+crat0z <11581854+crat0z@users.noreply.github.com>
+CRD716 <crd716@gmail.com>
 CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
 Csaba Kecskemeti <csaba.kecskemeti@gmail.com>
 Cuong Trinh Manh <nguoithichkhampha@gmail.com>
-DAN™ <dranger003@gmail.com>
+daboe01 <daboe01@googlemail.com>
+daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
 Damian Stewart <d@damianstewart.com>
+daminho <37615795+daminho@users.noreply.github.com>
+DAN™ <dranger003@gmail.com>
 Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
 Dan Johansson <dan.johansson@arm.com>
 Dane Madsen <dane_madsen@hotmail.com>
 DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
+Daniel Benjaminsson <danielbenjaminsson@users.noreply.github.com>
 Daniel Bevenius <daniel.bevenius@gmail.com>
 Daniel Drake <drake@endlessos.org>
+Daniel Han <danielhanchen@gmail.com>
 Daniel Hiltgen <dhiltgen@users.noreply.github.com>
 Daniel Illescas Romero <illescas.daniel@protonmail.com>
 Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
+Daniel Tang <danielzgtg.opensource@gmail.com>
 Daniele <57776841+daniandtheweb@users.noreply.github.com>
+Daniele <daniele.dilotorres@gmail.com>
+Daniele Pinna <72076821+pestopoppa@users.noreply.github.com>
 Danny Milosavljevic <dannym@friendly-machines.com>
 DannyDaemonic <DannyDaemonic@gmail.com>
+Darius Lukas <dariusjlukas@gmail.com>
 Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
 Dave <dave-fl@users.noreply.github.com>
 Dave Airlie <airlied@gmail.com>
 Dave Airlie <airlied@redhat.com>
 Dave Della Costa <ddellacosta+github@gmail.com>
+David Chiu <david20571015@gmail.com>
 David Friehs <david@friehs.info>
 David Huang <1969802+hjc4869@users.noreply.github.com>
 David Kennedy <dakennedyd@gmail.com>
+David Lima <contato@davidlima.com.br>
 David Pflug <david@pflug.email>
+david raistrick <keen99@users.noreply.github.com>
 David Renshaw <dwrenshaw@gmail.com>
+David Ribeiro Alves <davidralves@gmail.com>
 David Sommers <12738+databyte@users.noreply.github.com>
 David Yang <davidyang6us@gmail.com>
+David Zhao <90013954+Your-Cheese@users.noreply.github.com>
+davidef <davidef1986@gmail.com>
 DavidKorczynski <david@adalogics.com>
 Dawid Potocki <github@dawidpotocki.com>
 Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
+ddh0 <chemist-mulches-39@icloud.com>
+ddh0 <dylanhalladay02@icloud.com>
+ddpasa <112642920+ddpasa@users.noreply.github.com>
+DDXDB <38449595+DDXDB@users.noreply.github.com>
 Dean <Dean.Sinaean@gmail.com>
+deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+deepsek <166548550+deepsek@users.noreply.github.com>
 Deins <deinsegle@gmail.com>
 Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com>
 Derrick T. Woolworth <dwoolworth@gmail.com>
 Deven Mistry <31466137+deven367@users.noreply.github.com>
+devojony <61173062+devojony@users.noreply.github.com>
+diannao <55k@outlook.com>
 Dibakar Gope <dibakar.gope@arm.com>
 Didzis Gosko <didzis@users.noreply.github.com>
 Diego Devesa <slarengh@gmail.com>
+Diner Burger <burger@diner.name>
+Đinh Trọng Huy <77562200+huydt84@users.noreply.github.com>
 Diogo Teles Sant'Anna <diogoteles@google.com>
+ditsuke <ditsuke@protonmail.com>
+divinity76 <divinity76@gmail.com>
 Djip007 <3705339+Djip007@users.noreply.github.com>
 Djip007 <djip.perois@free.fr>
+dm4 <dm4@secondstate.io>
+dm4 <sunrisedm4@gmail.com>
+Dmytro Minochkin <dmytro.minochkin@gmail.com>
+Dobri Danchev <12420863+danchev@users.noreply.github.com>
+DocShotgun <126566557+DocShotgun@users.noreply.github.com>
+Doctor Shotgun <126566557+DocShotgun@users.noreply.github.com>
 Don Mahurin <dmahurin@users.noreply.github.com>
-DooWoong Lee (David) <manics99@naver.com>
+Dong Won Kim <63934649+ddwkim@users.noreply.github.com>
+Donghyeon Jeong <54725479+djeong20@users.noreply.github.com>
+Dongliang Wei <121270393+wdl339@users.noreply.github.com>
 Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+DooWoong Lee (David) <manics99@naver.com>
+Dorin-Andrei Geman <doringeman@gmail.com>
+dotpy314 <33351922+dotpy314@users.noreply.github.com>
 Dou Xinpeng <15529241576@163.com>
 Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
 Douglas Hanley <thesecretaryofwar@gmail.com>
+Dowon <ks2515@naver.com>
 Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
+drbh <david.richard.holtz@gmail.com>
+ds5t5 <145942675+ds5t5@users.noreply.github.com>
+duduta <simona.gherman@gmail.com>
+dylan <canardleteer@users.noreply.github.com>
+eastriver <lee@eastriver.dev>
 Ebey Abraham <ebey97@gmail.com>
+ebraminio <ebrahim@gnu.org>
+ebraminio <ebraminio@gmail.com>
 Echo Nolan <echo@echonolan.net>
+Ed Addario <29247825+EAddario@users.noreply.github.com>
 Ed Lee <edilee@mozilla.com>
 Ed Lepedus <ed.lepedus@googlemail.com>
 Eddie-Wang <wangjinheng1120@163.com>
 Edward Taylor <edeetee@gmail.com>
+eiery <19350831+eiery@users.noreply.github.com>
 Elaine <elaine.zosa@gmail.com>
 Elbios <141279586+Elbios@users.noreply.github.com>
 Elton Kola <eltonkola@gmail.com>
+Emmanuel Ferdman <emmanuelferdman@gmail.com>
 Emreerdog <34742675+Emreerdog@users.noreply.github.com>
 Engininja2 <139037756+Engininja2@users.noreply.github.com>
 Equim <sayaka@ekyu.moe>
 Eric Curtin <ecurtin@redhat.com>
+Eric Curtin <eric.curtin@docker.com>
 Eric Curtin <ericcurtin17@gmail.com>
 Eric Sommerlade <es0m@users.noreply.github.com>
 Eric Zhang <34133756+EZForever@users.noreply.github.com>
+eric8607242 <e0928021388@gmail.com>
 Erik Garrison <erik.garrison@gmail.com>
 Erik Scholz <Green-Sky@users.noreply.github.com>
+Ervin Áron Tasnádi <etasnadi@protonmail.com>
 Esko Toivonen <eskot98@gmail.com>
 Ettore Di Giacinto <mudler@users.noreply.github.com>
+EugeoSynthesisThirtyTwo <gabriel.dhimoila@gmail.com>
 Evan Jones <evan.q.jones@gmail.com>
 Evan Miller <emmiller@gmail.com>
 Eve <139727413+netrunnereve@users.noreply.github.com>
 Evgeny Kurnevsky <kurnevsky@gmail.com>
+Ewan Crawford <ewan.cr@gmail.com>
+Ewan Crawford <ewan@codeplay.com>
 Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
 ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
-FK <sozforex@gmail.com>
 Fabian <cmdrf@users.noreply.github.com>
 Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
 Faez Shakil <faez.shakil@gmail.com>
+fairydreaming <166155368+fairydreaming@users.noreply.github.com>
 Faisal Zaghloul <faisal.zaghloul@gmail.com>
 Faisal Zaghloul <quic_fzaghlou@quicinc.com>
 Fan Shupei <dymarkfan@outlook.com>
 FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+fanyang <fanyang89@outlook.com>
 Farbod Bijary <110523279+farbodbj@users.noreply.github.com>
 Fattire <528174+fat-tire@users.noreply.github.com>
 Felix <stenbackfelix@gmail.com>
+fengerhu1 <2748250768@qq.com>
+fidoriel <49869342+fidoriel@users.noreply.github.com>
 Finn Voorhees <finnvoorhees@gmail.com>
 Firat <firatkiral@gmail.com>
 FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
+fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
+FK <sozforex@gmail.com>
 Florent BENOIT <fbenoit@redhat.com>
+Florian Badie <florianbadie@odrling.xyz>
 Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
 Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
+Francisco Herrera <ppaanncchhoo507@gmail.com>
 Francisco Melo <43780565+francis2tm@users.noreply.github.com>
 Frank Mai <thxcode0824@gmail.com>
 FrankHB <frankhb1989@gmail.com>
 Frankie Robertson <frankier@users.noreply.github.com>
+fraxy-v <65565042+fraxy-v@users.noreply.github.com>
 Fred Douglas <43351173+fredlas@users.noreply.github.com>
 Frederik Vogel <Schaltfehler@users.noreply.github.com>
+Fredrik Hultin <noname@nurd.se>
+frob <rick+github@frob.com.au>
+fxzjshm <11426482+fxzjshm@users.noreply.github.com>
+g2mt <166577174+g2mt@users.noreply.github.com>
 Gabe Goodhart <gabe.l.hart@gmail.com>
 Gabe Goodhart <ghart@us.ibm.com>
+Gabriel Larson <55459720+gabriellarson@users.noreply.github.com>
+Gadflyii <34758915+Gadflyii@users.noreply.github.com>
 Gaetan Bisson <gaetan@fenua.org>
 GainLee <perfecter.gen@gmail.com>
 Galunid <karolek1231456@gmail.com>
 Gary Linscott <glinscott@gmail.com>
 Gary Mulder <gjmulder@gmail.com>
+gatbontonpc <gatbontonpc@gmail.com>
+Gaurav Garg <52341457+gaugarg-nv@users.noreply.github.com>
+Gaurav Garg <gaugarg@nvidia.com>
 Gavin Zhao <gavinzhaojw@protonmail.com>
 Genkagaku.GPT <hlhr202@163.com>
 Georgi Gerganov <ggerganov@gmail.com>
 Gian-Carlo Pascutto <gcp@sjeng.org>
+GideonSerf <gdserf.gs@gmail.com>
 Gilad S <giladgd@users.noreply.github.com>
 Gilad S. <7817232+giladgd@users.noreply.github.com>
+github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+GittyBurstein <g0534163997@gmail.com>
 Giuseppe Scrivano <giuseppe@scrivano.org>
+Giuseppe Scrivano <gscrivan@redhat.com>
 GiviMAD <GiviMAD@users.noreply.github.com>
+gliptic <gliptic@users.noreply.github.com>
+gn64 <yukikaze.jp@gmail.com>
+goerch <jhr.walter@t-online.de>
 Govlzkoy <gotope@users.noreply.github.com>
+grahameth <96447521+grahameth@users.noreply.github.com>
+Gregor Jasny <gjasny@googlemail.com>
+Grzegorz Grasza <xek@redhat.com>
+gtygo <gtydoit@gmail.com>
 Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
 Guillaume Wenzek <gwenzek@users.noreply.github.com>
 Guoliang Hua <32868157+nbcsm@users.noreply.github.com>
 Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
 Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com>
 Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Guus Waals <_@guusw.nl>
+Guy Goldenberg <guy110698@gmail.com>
+gwjr <502526+gwjr@users.noreply.github.com>
+h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
 Haggai Nuchi <h.nuchi@gmail.com>
+Haiyue Wang <haiyuewa@163.com>
 Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
 Hale Chan <halechan@qq.com>
 Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
+Han Qingzhe <95479277+hNSBQZ@users.noreply.github.com>
 Han Yin <han.yin@arm.com>
 HanishKVC <hanishkvc@gmail.com>
+hankcs <cnhankmc@gmail.com>
 Haohui Mai <ricetons@gmail.com>
+haopeng <657407891@qq.com>
+Haowei Wu <breadcyanide@icloud.com>
 Haoxiang Fei <tonyfettes@tonyfettes.com>
 Harald Fernengel <harald.fernengel@here.com>
 Hatsune Miku <129688334+at8u@users.noreply.github.com>
 HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
 Haus1 <haus.xda@gmail.com>
+Héctor Estrada Moreno <hectorem2@gmail.com>
+HelloKS <kqwe1859@gmail.com>
+Helton Reis <47722840+HRKings@users.noreply.github.com>
+Hendrik Erz <hendrik@zettlr.com>
 Henk Poley <HenkPoley@gmail.com>
 Henri Vasserman <henv@hot.ee>
 Henrik Forstén <henrik.forsten@gmail.com>
 Henry Linjamäki <henry.linjamaki@gmail.com>
+Henry Linjamäki <henry.mikael.linjamaki@intel.com>
+Henry147147 <44851451+Henry147147@users.noreply.github.com>
+Herman Semenoff <GermanAizek@yandex.ru>
 Herman Semenov <GermanAizek@yandex.ru>
 Hesen Peng <hesen.peng@gmail.com>
+HighDoping <highdoping@gmail.com>
 HimariO <dsfhe49854@gmail.com>
+hipudding <huafengchun@gmail.com>
+hksdpc255 <43977088+hksdpc255@users.noreply.github.com>
 Hoang Nguyen <hugo53@users.noreply.github.com>
+hoangmit <hoangmit@users.noreply.github.com>
+HonestQiao <honestqiao@gmail.com>
 Hong Bo PENG <penghb@cn.ibm.com>
+hongbo.mo <352280764@qq.com>
 Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
+hopkins385 <98618192+hopkins385@users.noreply.github.com>
 Howard Su <howard0su@gmail.com>
+howlger <eclipse@voormann.de>
+howlger <github@voormann.de>
 Hua Jiang <allenhjiang@outlook.com>
 Huang Qi <huangqi3@xiaomi.com>
 Huawei Lin <huaweilin.cs@gmail.com>
 Hugo Roussel <hugo.rous@gmail.com>
 Huifeng Ou <79071290+ho2103@users.noreply.github.com>
+hutli <6594598+hutli@users.noreply.github.com>
+hutli <hutli@hutli.hu>
+hutli <jensstaermose@hotmail.com>
+hxer7963 <hxer7963@gmail.com>
+hydai <z54981220@gmail.com>
+iacore <74560659+iacore@users.noreply.github.com>
 Ian Bull <irbull@eclipsesource.com>
 Ian Bull <irbull@gmail.com>
 Ian Scrivener <github@zilogy.asia>
+ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com>
 Icecream95 <the.real.icecream95@gmail.com>
+Icenowy Zheng <uwu@icenowy.me>
+icppWorld <124377669+icppWorld@users.noreply.github.com>
 Ido S <ido.pluto@gmail.com>
+igardev <49397134+igardev@users.noreply.github.com>
+igarnier <igarnier@protonmail.com>
 IgnacioFDM <ignaciofdm@gmail.com>
 Igor Okulist <okigan@gmail.com>
+Igor Smirnov <smirnoviv@rambler.ru>
+Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
 Ihar Hrachyshka <ihrachys@redhat.com>
 Ikko Eltociear Ashimine <eltociear@gmail.com>
+Ilia Ilmer <iliailmer@users.noreply.github.com>
 Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
+Imad Saddik <79410781+ImadSaddik@users.noreply.github.com>
+intelmatt <61025942+intelmatt@users.noreply.github.com>
+iohub <rickyang.pro@gmail.com>
 Ionoclast Laboratories <brigham@ionoclast.com>
+iron <lizhenneng@gmail.com>
 Isaac McFadyen <isaac@imcf.me>
 IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ishaan Gandhi <Ishaangandhi@gmail.com>
+iSma <ismail.senhaji@gmail.com>
+issixx <46835150+issixx@users.noreply.github.com>
 Ivan <nekotekina@gmail.com>
 Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
 Ivan Komarov <Ivan.Komarov@dfyz.info>
 Ivan Stepanov <ivanstepanovftw@gmail.com>
-JC <43374599+MrSMlT@users.noreply.github.com>
-JFLFY2255 <JFLFY2255@163.com>
-JH23X <165871467+JH23X@users.noreply.github.com>
+Ivy233 <952254420@qq.com>
+ixgbe <1113177880@qq.com>
+j-k <dev@j-k.io>
+jacekpoplawski <67507230+jacekpoplawski@users.noreply.github.com>
 Jack Mousseau <jack@software.inc>
 Jack Mousseau <jmousseau@users.noreply.github.com>
 JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
 Jaeden Amero <jaeden@patater.com>
 Jaemin Son <woalsdnd@gmail.com>
 Jafar Uruç <jafar.uruc@gmail.com>
 Jag Chadha <jagtesh@gmail.com>
+jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
+Jake Karnes <jake.karnes@gmail.com>
+Jakkala Mahesh <155058658+MaheshJakkala@users.noreply.github.com>
 Jakub N <jakubniemczyk97@gmail.com>
 James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
 James Reynolds <magnusviri@users.noreply.github.com>
+jameswu2014 <545426914@qq.com>
 Jan Boon <jan.boon@kaetemi.be>
 Jan Boon <kaetemi@gmail.com>
 Jan Ploski <jpl@plosquare.com>
 Jannis Schönleber <joennlae@gmail.com>
+Jared Tweed <jaredtwe@gmail.com>
 Jared Van Bortel <cebtenzzre@gmail.com>
 Jared Van Bortel <jared@nomic.ai>
+Jaromír Hradílek <jhradilek@gmail.com>
 Jason C.H <ctrysbita@outlook.com>
 Jason McCartney <jmac@theroot.org>
+Jason Ni <jason.ni.py@gmail.com>
 Jason Stillerman <jason.t.stillerman@gmail.com>
+jason_w <jason.wang@126.com>
+Jay <BusyJay@users.noreply.github.com>
+Jay Zenith <162098309+JayZenith@users.noreply.github.com>
+JC <43374599+MrSMlT@users.noreply.github.com>
+jdomke <28772296+jdomke@users.noreply.github.com>
 Jean-Christophe Hoelt <hoelt@fovea.cc>
 Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
 Jed Fox <git@jedfox.com>
 Jeff Bolz <jbolz@nvidia.com>
 Jeffrey Morgan <jmorganca@gmail.com>
 Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jeremy Demeule <jdemeule@users.noreply.github.com>
+Jeremy Rand <244188+JeremyRand@users.noreply.github.com>
 Jeroen Mostert <jeroen.mostert@cm.com>
+Jesse <jesse@createthis.com>
+Jesse Gross <jesse@kernel.org>
+Jesse Ikonen <jesse.ikonen@gmail.com>
 Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
 Jett Janiak <jettjaniak@gmail.com>
 Jeximo <jeximo@gmail.com>
+JFLFY2255 <JFLFY2255@163.com>
+JH23X <165871467+JH23X@users.noreply.github.com>
 Jhen-Jie Hong <iainst0409@gmail.com>
+Jiacheng (Jason) Chen <76919340+jiachengjason@users.noreply.github.com>
 Jiahao Li <liplus17@163.com>
+jiahao su <damow890@gmail.com>
 Jian Liao <jianliao@users.noreply.github.com>
 JidongZhang-THU <1119708529@qq.com>
+Jie Fu (傅杰) <fujie_email@sina.com>
+Jie Fu (傅杰) <jiefu@tencent.com>
+jiez <373447296@qq.com>
 Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
 Jinyang He <hejinyang@loongson.cn>
 Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
 Jiří Sejkora <Sejseloid@gmail.com>
+JJJYmmm <92386084+JJJYmmm@users.noreply.github.com>
+jklincn <985765408@qq.com>
+jklincn <jklincn@outlook.com>
+jneem <joeneeman@gmail.com>
 Joan Fontanals <jfontanalsmartinez@gmail.com>
 Joan Fontanals <joan.fontanals.martinez@jina.ai>
 João Dinis Ferreira <hello@joaof.eu>
 Joe Eli McIlvain <joe.eli.mac@gmail.com>
 Joe Todd <joe.todd@codeplay.com>
+joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
 Johan <JohanAR@users.noreply.github.com>
 Johannes Gäßler <johannesg@5d6.de>
 Johannes Rudolph <johannes.rudolph@gmail.com>
 John <78893154+cmp-nct@users.noreply.github.com>
 John Balis <phobossystems@gmail.com>
+John Bean <113509988+johnbean393@users.noreply.github.com>
 John Smith <67539080+kingsidelee@users.noreply.github.com>
 JohnnyB <jboero@users.noreply.github.com>
+johnson442 <56517414+johnson442@users.noreply.github.com>
+jojorne <jojorne@users.noreply.github.com>
+jon-chuang <9093549+jon-chuang@users.noreply.github.com>
 Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
+Jonathan Graehl <99024+graehl@users.noreply.github.com>
 Jorge A <161275481+jorgealias@users.noreply.github.com>
 Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
 Joseph Stahl <1269177+josephst@users.noreply.github.com>
 Josh Ramer <josh.ramer@icloud.com>
+Joshua Cogliati <jrincayc@users.noreply.github.com>
 Joyce <joycebrum@google.com>
+jp-x-g <jpxg-dev@protonmail.com>
 Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
+Judd <4046440+foldl@users.noreply.github.com>
 Judd <foldl@users.noreply.github.com>
 Juk Armstrong <69222624+jukofyork@users.noreply.github.com>
+jukofyork <69222624+jukofyork@users.noreply.github.com>
+Julien Denize <40604584+juliendenize@users.noreply.github.com>
 Julius Arkenberg <arki05@users.noreply.github.com>
+Julius Tischbein <jtischbein@nvidia.com>
+Julius Tischbein <ju.tischbein@gmail.com>
 Jun Hee Yoo <contact.jhyoo@gmail.com>
 Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
+junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
 Junil Kim <logyourself@gmail.com>
+Junwon Hwang <nuclear1221@gmail.com>
 Junyang Lin <justinlin930319@hotmail.com>
 Juraj Bednar <juraj@bednar.io>
 Justin Parker <jparkerweb@gmail.com>
+Justin Santa Barbara <justinsb@google.com>
 Justin Suess <justin.suess@westpoint.edu>
 Justina Cho <justcho5@gmail.com>
 Justine Tunney <jtunney@gmail.com>
 Justine Tunney <jtunney@mozilla.com>
 Juuso Alasuutari <juuso.alasuutari@gmail.com>
-KASR <karim.asrih@gmail.com>
+Juyoung Suk <juyoung.suk@trillionlabs.co>
+jwj7140 <32943891+jwj7140@users.noreply.github.com>
+k.h.lai <adrian.k.h.lai@outlook.com>
+Kai Pastor <dg0yt@darc.de>
+kaizau <kaizau@users.noreply.github.com>
+kallewoof <kalle.alm@gmail.com>
+kallewoof <karljohan-alm@garage.co.jp>
+kalomaze <66376113+kalomaze@users.noreply.github.com>
 Kamil Tomšík <info@tomsik.cz>
+kang <tpdns9032100@gmail.com>
 Kante Yin <kerthcet@gmail.com>
 Karol Kontny <82021046+kkontny@users.noreply.github.com>
 Karsten Weiss <knweiss@gmail.com>
 Karthick <j.karthic2004@gmail.com>
 Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
 Karthik Sethuraman <k.seth1993@gmail.com>
+KASR <karim.asrih@gmail.com>
 Kasumi <90275229+kasumi-1@users.noreply.github.com>
+katsu560 <118887472+katsu560@users.noreply.github.com>
 Kawrakow <48489457+ikawrakow@users.noreply.github.com>
+kchro3 <62481661+kchro3@users.noreply.github.com>
 Keiichi Tabata <keiichi.tabata@outlook.com>
 Keke Han <hankeke303@163.com>
 Kenvix ⭐ <kenvixzure@live.com>
@@ -417,48 +737,109 @@ Kevin Gibbons <bakkot@gmail.com>
 Kevin Ji <1146876+kevinji@users.noreply.github.com>
 Kevin Kwok <antimatter15@gmail.com>
 Kevin Lo <kevlo@kevlo.org>
+Kevin Pouget <kpouget@redhat.com>
 Kevin Wang <kevmo314@gmail.com>
+khimaros <me@khimaros.com>
+kiltyj <kiltyj@gmail.com>
+Kim S. <polydecay@users.noreply.github.com>
+kimminsu <80271594+kimminsu38oo@users.noreply.github.com>
+kiwi <122582483+kiwi142857@users.noreply.github.com>
+klosax <131523366+klosax@users.noreply.github.com>
 Kolen Cheung <ickc@users.noreply.github.com>
 Konstantin Herud <konstantin.herud@denkbares.com>
 Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
+krystiancha <krystian@krystianch.com>
+kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
+kunnis <kunnis@users.noreply.github.com>
 Kunshang Ji <kunshang.ji@intel.com>
+kuronekosaiko <EvanChanJ@163.com>
+kustaaya <58045274+kustaaya@users.noreply.github.com>
+kuvaus <22169537+kuvaus@users.noreply.github.com>
+kwin1412 <42286931+kwin1412@users.noreply.github.com>
 Kyle Bruene <KyleBruene@users.noreply.github.com>
 Kyle Liang <liangmanlai@gmail.com>
 Kyle Mistele <kyle@mistele.com>
 Kylin <56434533+KyL0N@users.noreply.github.com>
+l-austenfeld <53152202+l-austenfeld@users.noreply.github.com>
+l3utterfly <gc.pthzfoldr@gmail.com>
+LaffeyNyaa <112215776+LaffeyNyaa@users.noreply.github.com>
+laik <laik.lj@me.com>
 Lars Grammel <lars.grammel@gmail.com>
+Lars Sonchocky-Helldorf <lars.sonchocky-helldorf@hamburg.de>
 Laura <Tijntje_7@msn.com>
+Law Po Ying <30721578+yingying0906@users.noreply.github.com>
+lcy <lcy0321@users.noreply.github.com>
+ldwang <ftgreat@163.com>
+le.chang <cljs118@126.com>
 Lee <44310445+lx200916@users.noreply.github.com>
 Lee Drake <b.lee.drake@gmail.com>
+leejet <leejet714@gmail.com>
 Leng Yue <lengyue@lengyue.me>
+Lennart Austenfeld <53152202+l-austenfeld@users.noreply.github.com>
+leo-pony <nengjunma@outlook.com>
 Leon Knauer <git@leonknauer.com>
-LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
+Leonard Mosescu <tlemo@users.noreply.github.com>
 Leonardo Neumann <leonardo@neumann.dev.br>
+LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
+levkropp <levkropp@protonmail.com>
+lexasub <lexakopp2212@gmail.com>
+lgai-exaone <exaonemodels@lgresearch.ai>
+lhez <lih@qti.qualcomm.com>
+lhez <quic_lih@quicinc.com>
+Li Pengzhan <151381994+Lpzhan931@users.noreply.github.com>
 Li Tan <tanliboy@gmail.com>
+limitedAtonement <limitedAtonement@users.noreply.github.com>
 Linwei Wang <wanix1988@gmail.com>
 Liu Jia <109258120+Septa2112@users.noreply.github.com>
 Liu Jia <jia3.liu@intel.com>
+liuwei-git <14815172+liuwei-git@users.noreply.github.com>
+lixing-star <104126818+lixing-star@users.noreply.github.com>
+lksj92hs <134250687+lksj92hs@users.noreply.github.com>
 LoganDark <github@logandark.mozmail.com>
 Loïc Carrère <loic.carrere@gmail.com>
+lon <114724657+longregen@users.noreply.github.com>
+loonerin <132926317+loonerin@users.noreply.github.com>
 LostRuins <39025047+LostRuins@users.noreply.github.com>
 LostRuins Concedo <39025047+LostRuins@users.noreply.github.com>
+lovedheart <6277001+lovedheart@users.noreply.github.com>
+ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
+Luca Stefani <luca.stefani.ge1@gmail.com>
 Lucas Moura Belo <lucas.belo@live.com>
 Luciano <lucianostrika44@gmail.com>
+Lukas Straub <lukasstraub2@web.de>
+Łukasz Ślusarczyk <112692748+lslusarczyk@users.noreply.github.com>
 Luo Tian <lt@basecity.com>
+luoyu-intel <yu.luo@intel.com>
+luyhcsu <110711054+luyhcsu@users.noreply.github.com>
 Lyle Dean <dean@lyle.dev>
 M-A <maruel@gmail.com>
+M. Mediouni <mohamed@unpredictable.fr>
 M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+m3ndax <adrian.goessl@outlook.com>
 Ma Mingfei <mingfei.ma@intel.com>
 Maarten ter Huurne <maarten@treewalker.org>
 Mack Straight <eiz@users.noreply.github.com>
+maddes8cht <55592906+maddes8cht@users.noreply.github.com>
 Maël Kerbiriou <m431.kerbiriou@gmail.com>
 MaggotHATE <clay1326@gmail.com>
+magicse <magicse@users.noreply.github.com>
+Mahekk Shaikh <118063190+Mahekk357@users.noreply.github.com>
 Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
+mahorozte <41834471+mahorozte@users.noreply.github.com>
+makomk <makosoft@googlemail.com>
+manikbhandari <mbbhandarimanik2@gmail.com>
 Manuel <44313466+makuche@users.noreply.github.com>
+maor-ps <154728172+maor-ps@users.noreply.github.com>
 Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
+Marcello Seri <mseri@users.noreply.github.com>
 Marco Matthies <71844+marcom@users.noreply.github.com>
+Marcos Del Sol Vives <marcos@orca.pet>
+marcoStocchi <marcostocchi77@gmail.com>
 Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
+Marek Hradil jr. <marek.hradil@outlook.com>
 Marian Cepok <marian.cepok@gmail.com>
+Marius Gerdes <141485318+mglambda@users.noreply.github.com>
+Mariusz Woloszyn <emsi@users.noreply.github.com>
 Mark Fairbairn <thebaron88@gmail.com>
 Mark Zhuang <zhuangqiubin@gmail.com>
 Marko Tasic <mtasic85@gmail.com>
@@ -467,7 +848,11 @@ Martin Delille <martin@delille.org>
 Martin Krasser <krasserm@googlemail.com>
 Martin Schwaighofer <mschwaig@users.noreply.github.com>
 Marvin Gießing <marvin.giessing@gmail.com>
+Masashi Yoshimura <yoshimura.masashi.frbs@gmail.com>
+Masato Nakasaka <masato.nakasaka@intel.com>
+Masato Nakasaka <rillomas@gmail.com>
 Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
+mashdragon <122402293+mashdragon@users.noreply.github.com>
 MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
 Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
 Matheus C. França <matheus-catarino@hotmail.com>
@@ -475,57 +860,89 @@ Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
 Mathieu Baudier <mbaudier@argeo.org>
 Mathieu Geli <mathieu.geli@gmail.com>
 Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
-Mathijs Henquet <mathijs.henquet@gmail.com>
 Mathijs de Bruin <mathijs@mathijsfietst.nl>
+Mathijs Henquet <mathijs.henquet@gmail.com>
+matiaslin <45382001+matiaslin@users.noreply.github.com>
 Matt Clayton <156335168+mattjcly@users.noreply.github.com>
 Matt Pulver <matt.pulver@heavy.ai>
 Matt Stephenson <mstephenson6@users.noreply.github.com>
+matt23654 <193348153+matt23654@users.noreply.github.com>
+matt23654 <matthew.webber@protonmail.com>
+matteo <matteo.serva@gmail.com>
+matteo <matteogeniaccio@yahoo.it>
 Matteo Boschini <12133566+mbosc@users.noreply.github.com>
 Matteo Mortari <matteo.mortari@gmail.com>
 Mattheus Chediak <shammcity00@gmail.com>
+Matthew Michel <matthew.michel@intel.com>
 Matthew Tejo <matthew.tejo@gmail.com>
+Matthieu Coudron <886074+teto@users.noreply.github.com>
+Mattt <mattt@me.com>
 Matvey Soloviev <blackhole89@gmail.com>
 Max Krasnyansky <max.krasnyansky@gmail.com>
+Max Krasnyansky <maxk@qti.qualcomm.com>
 Max Krasnyansky <quic_maxk@quicinc.com>
 Maxim Evtush <154841002+maximevtush@users.noreply.github.com>
 Maxime <672982+maximegmd@users.noreply.github.com>
 Maximilian Winter <maximilian.winter.91@gmail.com>
+mdrokz <mohammadmunshi@gmail.com>
+MeeMin <74113151+Meet91721@users.noreply.github.com>
 Meng Zhang <meng@tabbyml.com>
 Meng, Hengyu <hengyu.meng@intel.com>
 Mengqing Cao <cmq0113@163.com>
 Merrick Christensen <merrick.christensen@gmail.com>
+mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
+Miaoqian Lin <linmq006@gmail.com>
 Michael Coppola <m18coppola@gmail.com>
+Michael de Gans <michael.john.degans@gmail.com>
+Michaël de Vries <vriesdemichael@gmail.com>
 Michael Engel <mengel@redhat.com>
 Michael Francis <edude03@gmail.com>
+Michael Giba <michaelgiba@gmail.com>
 Michael Hueschen <m@mhueschen.dev>
 Michael Kesper <mkesper@schokokeks.org>
 Michael Klimenko <mklimenko29@gmail.com>
 Michael Podvitskiy <podvitskiymichael@gmail.com>
 Michael Potter <NanoTekGuy@Gmail.com>
-Michael de Gans <michael.john.degans@gmail.com>
-Michaël de Vries <vriesdemichael@gmail.com>
+Michael Wand <michael@baybridgeaquarium.com>
 Michał Moskal <michal@moskal.me>
 Michał Tuszyński <srgtuszy@gmail.com>
 Michelle Tan <41475767+MichelleTanPY@users.noreply.github.com>
+midnight <midnightmagic@users.noreply.github.com>
 Mihai <mihai.chirculescu@yahoo.com>
 Mike <ytianhui2004@gmail.com>
+Mike Abbott <furrysalamander@gmail.com>
+Mike Abbott <michael.abbott@lvt.com>
 Mikko Juola <mikjuo@gmail.com>
+Min-Hua <136287195+Min-Hua@users.noreply.github.com>
+minarchist <minarchist@users.noreply.github.com>
 Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
 Minsoo Cheong <icycle0409@snu.ac.kr>
 Mirko185 <mirkosig@gmail.com>
 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
 MistApproach <98988043+MistApproach@users.noreply.github.com>
 Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
+mj-shifu <77107165+mj-shifu@users.noreply.github.com>
+mmyjona <jonathan.gonse@gmail.com>
+mnehete32 <33429707+mnehete32@users.noreply.github.com>
 Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
 Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
 Molly Sophia <mollysophia379@gmail.com>
+momonga <115213907+mmnga@users.noreply.github.com>
+momonga <146910567+mmngays@users.noreply.github.com>
 MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
 MorganRO8 <47795945+MorganRO8@users.noreply.github.com>
+moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+muggle-stack <promuggle@qq.com>
 Murilo Santana <mvrilo@gmail.com>
 Musab Gultekin <musabgultekin@users.noreply.github.com>
+musoles <135031143+musoles@users.noreply.github.com>
+mzcu <milos.cubrilo@gmail.com>
+Naco Siren <naco-siren@users.noreply.github.com>
 Nam D. Tran <42194884+namtranase@users.noreply.github.com>
+nanahi <130121847+na-na-hi@users.noreply.github.com>
 Nathan Epstein <nate2@umbc.edu>
 Natsu <chino@hotococoa.moe>
+Nauful Shaikh <nauful@gmail.com>
 NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
 Nebula <infinitewormhole@gmail.com>
 Neo Zhang <14088817+arthw@users.noreply.github.com>
@@ -533,73 +950,157 @@ Neo Zhang <zhang.jianyu@outlook.com>
 Neo Zhang Jianyu <jianyu.zhang@intel.com>
 Neuman Vong <neuman.vong@gmail.com>
 NeverLucky <92274250+nvrxq@users.noreply.github.com>
+Nexes the Elder <124105151+Nexesenex@users.noreply.github.com>
 Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
 Nexesenex <124105151+Nexesenex@users.noreply.github.com>
+ngc92 <7938269+ngc92@users.noreply.github.com>
+nhamanasu <45545786+nhamanasu@users.noreply.github.com>
 Niall Coates <1349685+Niall-@users.noreply.github.com>
+niansa/tuxifan <anton-sa@web.de>
+niansa/tuxifan <tuxifan@posteo.de>
 Nicholai Tukanov <nicholaitukanov@gmail.com>
+Nick <0x0b4ac@gmail.com>
+nick huang <nickhuang99@hotmail.com>
+nickp27 <nb.porter@gmail.com>
 Nico Bosshard <nico@bosshome.ch>
 Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
+Nicolas B. Pierron <nicolas.b.pierron@gmail.com>
 Nicolás Pérez <nicolas_perez@brown.edu>
 Nicolò Scipione <nicolo.scipione@codeplay.com>
 Nigel Bosch <pnigelb@gmail.com>
+Nikhil Jain <nikhil.jain0987@gmail.com>
 Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
 Niklas Korz <niklas@niklaskorz.de>
 NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com>
 Nikolaos Pothitos <pothitos@di.uoa.gr>
 Nikolas <127742645+nneubacher@users.noreply.github.com>
+Nikolay Popov <131475237+npopov-vst@users.noreply.github.com>
 Nindaleth <Nindaleth@users.noreply.github.com>
+ningshanwutuobang <ningshanwutuobang@gmail.com>
+Noah <99681487+NoahOksuz@users.noreply.github.com>
+nold <Nold360@users.noreply.github.com>
+nopperl <54780682+nopperl@users.noreply.github.com>
+nullname <chraac@gmail.com>
 Nuno <rare-magma@posteo.eu>
-OSecret <135510162+OLSecret@users.noreply.github.com>
+nusu-github <29514220+nusu-github@users.noreply.github.com>
+nwyin <tommynguyen0512@gmail.com>
+o7si <32285332+o7si@users.noreply.github.com>
 Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
 Oleksandr Nikitin <oleksandr@tvori.info>
 Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
+Olexandr88 <radole1203@gmail.com>
+olexiyb <olexiyb@gmail.com>
+Oliver Simons <oliver.simons@posteo.de>
+Oliver Simons <osimons@nvidia.com>
+Oliver Walsh <owalsh@redhat.com>
 Olivier Chafik <ochafik@users.noreply.github.com>
+Olivier Chafik <olivier.chafik@gmail.com>
+omahs <73983677+omahs@users.noreply.github.com>
 Ondřej Čertík <ondrej@certik.us>
+oobabooga <112222186+oobabooga@users.noreply.github.com>
+oobabooga <oobabooga4@gmail.com>
+opparco <parco.opaai@gmail.com>
+Oscar Barenys <rtfss1@gmail.com>
+OSecret <135510162+OLSecret@users.noreply.github.com>
+ostix360 <55257054+ostix360@users.noreply.github.com>
 Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
 PAB <pierreantoine.bannier@gmail.com>
 Pablo Duboue <pablo.duboue@gmail.com>
+Pádraic Slattery <pgoslatara@gmail.com>
+Pascal <admin@serveurperso.com>
 Pascal Patry <ppatry@mtacitlabs.com>
+pascal-lc <49066376+pascal-lc@users.noreply.github.com>
 Patrice Ferlet <metal3d@gmail.com>
 Patrick Peng <retr0@retr0.blog>
+Patryk Kaminski <kaminpatryk@gmail.com>
 Paul Tsochantaris <ptsochantaris@icloud.com>
 Pavel Zloi <github.com@drteam.rocks>
+Pavels Zaicenkovs <github@a.pzv.me>
 Pavol Rusnak <pavol@rusnak.io>
 Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com>
+pculliton <phillipculliton@gmail.com>
 Pedro Cuenca <pedro@huggingface.co>
+peidaqi <peidaqi@gmail.com>
+Penglin Cai <1402538448@qq.com>
+pengxin99 <pengxin.yuan@intel.com>
+Pepijn de Vos <me@pepijndevos.nl>
+Percy Piper <piper.percy@googlemail.com>
+Perry Naseck <4472083+DaAwesomeP@users.noreply.github.com>
+perserk <perserk@gmail.com>
 Peter <peter277@users.noreply.github.com>
 Peter Sugihara <peter@campsh.com>
+Peter0x44 <peter0x44@disroot.org>
+petterreinholdtsen <pere-github@hungry.com>
 Phil H <5756783+phiharri@users.noreply.github.com>
 Philip Taron <philip.taron@gmail.com>
+philip-essential <169196560+philip-essential@users.noreply.github.com>
 Phillip Kravtsov <phillip@kravtsov.net>
+Phylliida Dev <phylliida.dev@gmail.com>
+piDack <104877312+piDack@users.noreply.github.com>
 Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
 Pierrick Hymbert <pierrick.hymbert@gmail.com>
 Pieter Ouwerkerk <pieter.ouwerkerk@gmail.com>
+Piotr <piotr.stankiewicz@docker.com>
+Piotr Jasiukajtis <estibi@me.com>
+Piotr Kubaj <pkubaj@anongoth.pl>
+Piotr Wilkin (ilintar) <piotr.wilkin@syndatis.com>
+pl752 <pl752@mail.ru>
 Plamen Minev <pacominev@gmail.com>
+pmysl <piotr.myslinski@outlook.com>
+pockers21 <134406831+pockers21@users.noreply.github.com>
+postmasters <namnguyen@google.com>
+Pouya <PooyaGhahramanian@Gmail.com>
+pqnet <119850+pqnet@users.noreply.github.com>
+Prabod <prabod@maincode.com>
+Prajwal B Mehendarkar <prajwal.b.mehendarkar@ibm.com>
 Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
 Przemysław Pawełczyk <przemoc@gmail.com>
+psocolovsky <50770545+psocolovsky@users.noreply.github.com>
+pudepiedj <pudepiedj@gmail.com>
 PureJourney <edward.pong@qq.com>
+QDelta <60222316+QDelta@users.noreply.github.com>
+Qeeweew <68716978+Qeeweew@users.noreply.github.com>
 Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
+qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qingy1337 <qxli2@students.everettcc.edu>
 Qingyou Meng <meng.qingyou@gmail.com>
+qouoq <qouoq@fastmail.com>
 Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+Quentin Bramas <quentin.bramas@gmail.com>
+qunash <anzoria@gmail.com>
+R <github@00b.tech>
+R <reg@00b.tech>
 R0CKSTAR <xiaodong.ye@mthreads.com>
 R0CKSTAR <yeahdongcn@gmail.com>
-RJ Adriaansen <adriaansen@eshcc.eur.nl>
+rabidcopy <rabidcopy@yahoo.com>
+RachelMantel <rrm85040@gmail.com>
 Radoslav Gerganov <rgerganov@gmail.com>
 Radosław Gryta <radek.gryta@gmail.com>
+Rafal Lewczuk <rafal.lewczuk@gmail.com>
+Rahul Sathe <150351592+rrsathe@users.noreply.github.com>
 Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
+rainred <107027757+gryffindor-rr@users.noreply.github.com>
 Raj Hammeer Singh Hada <hammeerraj@gmail.com>
 Ralph Soika <ralph.soika@imixs.com>
 Rand Xie <randxiexyy29@gmail.com>
 Randall Fitzgerald <randall@dasaku.net>
 Random Fly <renfei8@live.cn>
+rankaiyx <rankaiyx@rankaiyx.com>
+Raul Torres <138264735+rauletorresc@users.noreply.github.com>
+redbeard <bharrington@alticon.net>
+Reese Levine <reeselevine1@gmail.com>
 Reinforce-II <fate@eastal.com>
 Rémy O <remyoudompheng@gmail.com>
 Rémy Oudompheng <oudomphe@phare.normalesup.org>
 Ren Xuancheng <jklj077@users.noreply.github.com>
+Renat <rntk@users.noreply.github.com>
 Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
 Reza Kakhki <rezakakhki.de@gmail.com>
 Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com>
 RhinoDevel <RhinoDevel@users.noreply.github.com>
+rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
+rhuddleston <ryan.huddleston@percona.com>
+Rhys-T <108157737+Rhys-T@users.noreply.github.com>
 Riccardo Orlando <Riccorl@users.noreply.github.com>
 Riceball LEE <snowyu.lee@gmail.com>
 Rich Dougherty <rich@rd.nz>
@@ -611,14 +1112,22 @@ Rickard Edén <rickardeden@gmail.com>
 Rickard Hallerbäck <rickard.hallerback@gmail.com>
 Rickey Bowers Jr <bitRAKE@gmail.com>
 Riley Stewart <ristew@users.noreply.github.com>
+rimoliga <53384203+rimoliga@users.noreply.github.com>
 Rinne <AsakusaRinne@gmail.com>
 Rinne <liu_yaohui1998@126.com>
+RJ Adriaansen <adriaansen@eshcc.eur.nl>
+rmatif <66360289+rmatif@users.noreply.github.com>
+rmatif <kingrealriadh@gmail.com>
+rmatif <rmatif@proton.me>
 Robert Brisita <986796+rbrisita@users.noreply.github.com>
 Robert Collins <roberto.tomas.cuentas@gmail.com>
 Robert Ormandi <52251610+ormandi@users.noreply.github.com>
 Robert Sung-wook Shin <edp1096@users.noreply.github.com>
 Robey Holderith <robey@flaminglunchbox.net>
+Robin Davidsson <40024429+R-Dson@users.noreply.github.com>
 Robyn <robyngraf@users.noreply.github.com>
+Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+RodriMora <bullerwins@gmail.com>
 Roger Meier <r.meier@siemens.com>
 Rohanjames1997 <rohan.james4@gmail.com>
 Roland <14355895+rbur0425@users.noreply.github.com>
@@ -629,68 +1138,133 @@ Roman Parykin <donderom@gmail.com>
 Ron Evans <ron@hybridgroup.com>
 Ron Jailall <rojailal@gmail.com>
 Roni <sulpher@gmx.net>
+Ronny Brendel <ronnyb@nvidia.com>
 Ronny Brendel <ronnybrendel@gmail.com>
 Ronsor <ronsor@ronsor.pw>
+Rotem Dan <rotemdan@gmail.com>
 Rowan Hart <rowanbhart@gmail.com>
+rspOverflow <217881046+rspOverflow@users.noreply.github.com>
+rtaluyev <taluyev@gmail.com>
 Ruan <47767371+ruanych@users.noreply.github.com>
+Ruben Ortlam <picard12@live.de>
+Ruben Ortlam <rortlam@redhat.com>
 Ruchira Hasaranga <ruchira66@gmail.com>
 Rudi Servo <rudiservo@gmail.com>
+Ruikai Peng <retr0@retr0.blog>
 Ruixin Huang <18860020911@163.com>
 Rune <43761327+Rune-AI@users.noreply.github.com>
+runfuture <runfuture@users.noreply.github.com>
 RunningLeon <maningsheng@sensetime.com>
 RunningLeon <mnsheng@yeah.net>
+Russyyds <161207317+Russyyds@users.noreply.github.com>
 Ryan Landay <rlanday@gmail.com>
+Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com>
 Ryder Wishart <ryderwishart@gmail.com>
 Ryuei <louixs@users.noreply.github.com>
-Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
-SAMI <samuel.koesnadi@stud.uni-due.de>
-SRHMorris <69468379+SRHMorris@users.noreply.github.com>
-SXX <sxx1136965276@gmail.com>
+s-goto-11 <206795233+s-goto-11@users.noreply.github.com>
+s8322 <s0527684199@gmail.com>
+Saba Fallah <10401143+sfallah@users.noreply.github.com>
+Sachin Desai <smdesai@gmail.com>
+safranowith <bsh155762@gmail.com>
 SakuraUmi <yukinon244@gmail.com>
 Salvador E. Tropea <stropea@inti.gob.ar>
 Salvatore Mesoraca <s.mesoraca16@gmail.com>
+Sam <sammcj@users.noreply.github.com>
+Sam Malayek <12037535+SamMalayek@users.noreply.github.com>
 Sam Spilsbury <smspillaz@gmail.com>
+Sam/Samuel <57896620+cern1710@users.noreply.github.com>
+SAMI <samuel.koesnadi@stud.uni-due.de>
 Sami Farin <3876865+Safari77@users.noreply.github.com>
 Samuel Maynard <samwmaynard@gmail.com>
+Sandro Hanea <40202887+sandrohanea@users.noreply.github.com>
+sandyiscool <sandyiscool@gmail.com>
 Sang-Kil Park <sang.park@42dot.ai>
+Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
+sasha0552 <admin@sasha0552.org>
+SavicStefan <50296686+SavicStefan@users.noreply.github.com>
+Scott Fudally <sfudally@nvidia.com>
 Seb C <47074056+Sebby37@users.noreply.github.com>
 Sebastián A <sebastian.aedo29@gmail.com>
 SebastianApel <13675545+SebastianApel@users.noreply.github.com>
+semidark <me@semidark.net>
 Senemu <10880819+Senemu@users.noreply.github.com>
+senhtry <w169q169@gmail.com>
+Sergei Vorobyov <sergei.vorobyov01@gmail.com>
+Sergey Alirzaev <l29ah@riseup.net>
 Sergey Alirzaev <zl29ah@gmail.com>
+Sergey Fedorov <vital.had@gmail.com>
 Sergio López <slp@redhat.com>
 Sergio López <slp@sinrega.org>
+serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
 Sertaç Özercan <852750+sozercan@users.noreply.github.com>
 SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
 ShadovvBeast <ShadovvBeast@gmail.com>
+Shagun Bera <141054835+notV3NOM@users.noreply.github.com>
 Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shakil Ahmed <44522075+ahmedshakill@users.noreply.github.com>
+shalinib-ibm <Shalini.Salomi.Bodapati@ibm.com>
 Shane A <shanea@allenai.org>
 Shangning Xu <32517059+xushangning@users.noreply.github.com>
+shani-f <s0556787439@gmail.com>
 Shankar <gshankar.87@gmail.com>
 Shanshan Shen <467638484@qq.com>
+shaofeiqi <109865877+shaofeiqi@users.noreply.github.com>
+shaofeiqi <shaoqi@qti.qualcomm.com>
+sharpHL <132747147+sharpHL@users.noreply.github.com>
+Shawn Gu <shawngu@qti.qualcomm.com>
+Shawn yang <137684499+Yangxiaoz@users.noreply.github.com>
 Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com>
 Sheldon Robinson <sheldon.robinson@live.com>
+shibe2 <shibe@tuta.io>
 Shijie <821898965@qq.com>
+Shin-myoung-serp <relent95@naver.com>
 Shintarou Okada <kokuzen@gmail.com>
+Shouyu <65317431+joeldushouyu@users.noreply.github.com>
 Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
 Shouzheng Liu <lshzh.hi@gmail.com>
+SHUAI YANG <shuaiyang047@163.com>
 Shuichi Tsutsumi <shuichi0526@gmail.com>
+shun095 <8069181+shun095@users.noreply.github.com>
+Shunta Saito <shunta.saito@gmail.com>
 Shupei Fan <dymarkfan@outlook.com>
+Si1w <139008732+Si1w@users.noreply.github.com>
 Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
+simevo <github@simevo.com>
+Simon Redman <simon@ergotech.com>
 Simon Willison <swillison@gmail.com>
+simon886212 <37953122+simon886212@users.noreply.github.com>
+Simranjeet Singh <105192966+simrnsingh@users.noreply.github.com>
+singularity <12184989+singularity-s0@users.noreply.github.com>
+sirus20x6 <sirus20x6@users.noreply.github.com>
 Siwen Yu <yusiwen@gmail.com>
+sjinzh <sjinzh@gmail.com>
+sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
+Sky <Iflyinskyin2013@gmail.com>
 Sky Yan <skyan83@gmail.com>
+slaren <2141330+slaren@users.noreply.github.com>
 Slaren <2141330+slaren@users.noreply.github.com>
+slaren <slarengh@gmail.com>
 Slava Primenko <primenko.s@gmail.com>
+Slobodan Josic <127323561+slojosic-amd@users.noreply.github.com>
 Small Grass Forest <zixuanxcl@gmail.com>
+SmartestWashingMachine <ottobizness@gmail.com>
+SnA1lGo <44647694+skrandy@users.noreply.github.com>
+snadampal <87143774+snadampal@users.noreply.github.com>
 SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
 Someone <sergei.kozlukov@aalto.fi>
 Someone Serge <sergei.kozlukov@aalto.fi>
+someone13574 <81528246+someone13574@users.noreply.github.com>
 Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
 Spencer Sutton <spencersutton@users.noreply.github.com>
+SRHMorris <69468379+SRHMorris@users.noreply.github.com>
 Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
 Srinivas Billa <nivibilla@gmail.com>
+ssweens <1149151+ssweens@users.noreply.github.com>
+standby24x7 <standby24x7@gmail.com>
+staviq <staviq@gmail.com>
+stduhpf <stephduh@live.fr>
 Stefan Sydow <stefan@sydow.email>
+Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
 Steffen Röcker <sroecker@gmail.com>
 Stephan Walter <stephan@walter.name>
 Stephen Nichols <snichols@users.noreply.github.com>
@@ -698,46 +1272,100 @@ Steve Bonds <sbonds@gmail.com>
 Steve Grubb <ausearch.1@gmail.com>
 Steven Prichard <spprichard20@gmail.com>
 Steven Roussey <sroussey@gmail.com>
+stevenkuang <stevenkuang@tencent.com>
 Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
 StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com>
+strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
 Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+sudhiarm <sudhi.sathyavathy@arm.com>
 Sukriti Sharma <Ssukriti@users.noreply.github.com>
 SuperUserNameMan <yoann@terminajones.com>
 Sutou Kouhei <kou@cozmixng.org>
+Svetlozar Georgiev <55534064+sgeor255@users.noreply.github.com>
+swittk <switt1995@gmail.com>
+SXX <sxx1136965276@gmail.com>
 Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
 Taikono-Himazin <kazu@po.harenet.ne.jp>
+Taimur Ahmad <taimur.ahmad@10xengineers.ai>
+Tak-RS <snosk.t@gmail.com>
+takasurazeem <takasurazeem@gmail.com>
+takov751 <40316768+takov751@users.noreply.github.com>
+takuya kodama <a.s.takuya1026@gmail.com>
+takuya kodama <otegami@clear-code.com>
+tamarPal <tamarp3385@gmail.com>
 Tameem <113388789+AhmadTameem@users.noreply.github.com>
 Tamotsu Takahashi <ttakah+github@gmail.com>
+tarcey <cey.tarik@gmail.com>
+Tarek Dakhran <t.dakhran@gmail.com>
+Tarek Dakhran <tarek@liquid.ai>
+tastelikefeet <58414341+tastelikefeet@users.noreply.github.com>
+Tatsuya Tanaka <tanakasan2525@gmail.com>
+Taylor <quantumtraveling@gmail.com>
+tc-mb <157115220+tc-mb@users.noreply.github.com>
+TecJesh <qdvm5gl@163.com>
 Tei Home <taiteitonghome@proton.me>
+tempstudio <49735574+tempstudio@users.noreply.github.com>
+teo <TeoZosa@users.noreply.github.com>
+texmex76 <40733439+texmex76@users.noreply.github.com>
 Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
+Thammachart Chinvarapon <1731496+Thammachart@users.noreply.github.com>
 Thatcher Chamberlin <j.thatcher.c@gmail.com>
 Theia Vogel <theia@vgel.me>
+thement <40525767+thement@users.noreply.github.com>
+theo77186 <theo77186@users.noreply.github.com>
+theraininsky <76763719+theraininsky@users.noreply.github.com>
 Thérence <13496987+Royalphax@users.noreply.github.com>
+thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
 Thibault Terrasson <thibault.terrasson@gmail.com>
+thom-dev-fr <161708450+thom-dev-fr@users.noreply.github.com>
+Thomas Germer <99991@users.noreply.github.com>
+Thomas Jarosch <thomas.jarosch@intra2net.com>
 Thomas Klausner <wiz@gatalith.at>
+Thore Koritzius <thorekoritzius@outlook.de>
 Thorsten Sommer <SommerEngineering@users.noreply.github.com>
+TianHao324 <854531745@qq.com>
+TianHao324 <tianhao42@huawei.com>
+Tianyue-Zhao <zhaotianyue@outlook.com>
 Tim Miller <drasticactions@users.noreply.github.com>
+Tim Neumann <mail@timnn.me>
 Tim Wang <overocean@gmail.com>
 Timmy Knight <r2d2fish@gmail.com>
 Timothy Cronin <40186632+4imothy@users.noreply.github.com>
 Ting Lou <louting@189.cn>
 Ting Lou <ting.lou@gmail.com>
 Ting Sun <suntcrick@gmail.com>
+tjohnman <tjohnman@users.noreply.github.com>
 Tobias Lütke <tobi@shopify.com>
+Todor Boinovski <todorb@qti.qualcomm.com>
 Tom C <tom.corelis@gmail.com>
 Tom Jobbins <784313+TheBloke@users.noreply.github.com>
 Tomas <tom.tomas.36478119@gmail.com>
 Tomáš Pazdiora <tomas.pazdiora@gmail.com>
 Tony Wasserka <4840017+neobrain@users.noreply.github.com>
+toyer <2042519524@qq.com>
+TrevorS <trevor@strieber.org>
+triplenom <79777178+triplenom@users.noreply.github.com>
 Tristan Druyen <tristan@vault81.mozmail.com>
 Tristan Ross <rosscomputerguy@protonmail.com>
 Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
+tslmy <tslmy@users.noreply.github.com>
+tt <291400568@qq.com>
 Tungsten842 <886724vf@anonaddy.me>
 Tungsten842 <quantmint@protonmail.com>
 Tushar <ditsuke@protonmail.com>
+tv1wnd <55383215+tv1wnd@users.noreply.github.com>
+ubergarm <leimgrub@gmail.com>
+ubik2 <ubik2@users.noreply.github.com>
 UEXTM.com <84163508+uextm@users.noreply.github.com>
+Uilian Ries <uilianries@gmail.com>
+uint256_t <konndennsa@gmail.com>
+uint256_t <maekawatoshiki1017@gmail.com>
 Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com>
 Ulrich Drepper <drepper@gmail.com>
+unbounded <haakon@likedan.net>
+uvos <carl@uvos.xyz>
+uvos <devnull@uvos.xyz>
+uvos <philipp@uvos.xyz>
 Uzo Nweke <uzoechi@gmail.com>
 Vaibhav Srivastav <vaibhavs10@gmail.com>
 Val Kharitonov <mail@kharvd.com>
@@ -745,10 +1373,22 @@ Valentin Konovalov <valle.ketsujin@gmail.com>
 Valentin Mamedov <45292985+Inf1delis@users.noreply.github.com>
 Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
 Vali Malinoiu <0x4139@gmail.com>
+valiray <133289098+valiray@users.noreply.github.com>
+vb <vaibhavs10@gmail.com>
+Vedran Miletić <vedran@miletic.net>
+Victor <194116445+dodekapod@users.noreply.github.com>
 Victor Nogueira <felladrin@gmail.com>
 Victor Z. Peng <ziliangdotme@gmail.com>
 Viet-Anh NGUYEN (Andrew) <vietanh.dev@gmail.com>
+vik <vikhyatk@gmail.com>
+Ville Vesilehto <ville@vesilehto.fi>
+Vineel Abhinav <131174187+vineelabhinav@users.noreply.github.com>
 Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com>
+Vinkal <vinkal-chudgar@users.noreply.github.com>
+virajwad <84867530+virajwad@users.noreply.github.com>
+viric <viric@viric.name>
+Vishal Agarwal <vishalagarwal.jss@gmail.com>
+Vishal Singh <vishal@zettabolt.com>
 Vitali Lovich <vlovich+github@gmail.com>
 Vivian <vynride@gmail.com>
 Vlad <spitfireage@gmail.com>
@@ -756,351 +1396,124 @@ Vladimir <bogdad@gmail.com>
 Vladimir Malyutin <first-leon@yandex.ru>
 Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
 Vladimir Zorin <vladimir@deviant.guru>
+Vladislav Sayapin <70110788+v-sayapin@users.noreply.github.com>
+vmobilis <75476228+vmobilis@users.noreply.github.com>
+vodkaslime <646329483@qq.com>
 VoidIsVoid <343750470@qq.com>
 Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+vvhg1 <94630311+vvhg1@users.noreply.github.com>
+vxiiduu <73044267+vxiiduu@users.noreply.github.com>
 Wagner Bruna <wbruna@users.noreply.github.com>
 Wang Qin <37098874+wangqin0@users.noreply.github.com>
 Wang Ran (汪然) <wangr@smail.nju.edu.cn>
+Wang Weixuan <wangweixvan@gmail.com>
 WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
+wangshuai09 <391746016@qq.com>
+wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
+wbtek <171302111+wbtek@users.noreply.github.com>
 Weird Constructor <weirdconstructor@gmail.com>
 Weizhao Ouyang <o451686892@gmail.com>
+Weizhao Ouyang <weizhao.ouyang@arm.com>
 Welby Seely <welbyseely@gmail.com>
+welix <taichitary@gmail.com>
 Wentai Zhang <rchardx@gmail.com>
+whoreson <139810751+whoreson@users.noreply.github.com>
 Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com>
 WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
+william pan <61359596+wp4032@users.noreply.github.com>
 William Tambellini <william.tambellini@gmail.com>
 William Tambellini <wtambellini@sdl.com>
 Willy Tarreau <w@1wt.eu>
-Woof Dog <197125663+woof-dog@users.noreply.github.com>
-Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
-Wu Jian Ping <wujjpp@hotmail.com>
-Wu Jian Ping <wujp@greatld.com>
-Xiake Sun <xiake.sun@intel.com>
-Xiang (Kevin) Li <kevinli020508@gmail.com>
-Xiao-Yong Jin <jinxiaoyong@gmail.com>
-XiaotaoChen <chenxiaotao1234@gmail.com>
-Xiaoyi Chen <cxychina@gmail.com>
-Xie Yanbo <xieyanbo@gmail.com>
-Xingchen Song(宋星辰) <xingchensong1996@163.com>
-Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
-Xuan Son Nguyen <thichthat@gmail.com>
-Xuan-Son Nguyen <thichthat@gmail.com>
-Yaiko <elyaiko@hotmail.com>
-Yann Follet <131855179+YannFollet@users.noreply.github.com>
-Yaroslav <yaroslav.yashin@me.com>
-Yazan Agha-Schrader <mountaiin@icloud.com>
-Yiming Cui <conandiy@vip.qq.com>
-Yishuo Wang <MeouSker77@outlook.com>
-Yoshi Suhara <y.suhara@gmail.com>
-Yoshi Suhara <ysuhara@nvidia.com>
-Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
-Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
-Yüg <eugeniosegalaweb@gmail.com>
-Yui <dev@sleepyyui.com>
-Yun Dou <dixyes@gmail.com>
-Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
-Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
-Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
-ZHAOKAI WANG <sanxianwei@163.com>
-Zane Shannon <z@zcs.me>
-Zay <95888118+isaiahbjork@users.noreply.github.com>
-Zenix <zenixls2@gmail.com>
-Zhang Peiyuan <a1286225768@gmail.com>
-Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
-Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
-Zhiyuan Li <lizhiyuan@uniartisan.com>
-Zhiyuan Li <uniartisan2017@gmail.com>
-ZhouYuChen <zhouyuchen@naver.com>
-Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
-Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
-Zsapi <martin1.zsapka@gmail.com>
-a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
-a3sh <38979186+A3shTnT@users.noreply.github.com>
-adel boussaken <netdur@gmail.com>
-afrideva <95653597+afrideva@users.noreply.github.com>
-ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
-agray3 <agray3@users.noreply.github.com>
-akawrykow <142945436+akawrykow@users.noreply.github.com>
-alek3y <44779186+alek3y@users.noreply.github.com>
-alexpinel <93524949+alexpinel@users.noreply.github.com>
-alonfaraj <alonfaraj@gmail.com>
-alwqx <kenan3015@gmail.com>
-amd-dwang <dong.wang@amd.com>
-amd-lalithnc <lalithnc@amd.com>
-amritahs-ibm <amritahs@linux.vnet.ibm.com>
-andrijdavid <david@geek.mg>
-anon998 <131767832+anon998@users.noreply.github.com>
-anzz1 <anzz1@live.com>
-apaz <aarpazdera@gmail.com>
-apcameron <37645737+apcameron@users.noreply.github.com>
-arch-btw <57669023+arch-btw@users.noreply.github.com>
-arcrank <arcrank@gmail.com>
-ardfork <134447697+ardfork@users.noreply.github.com>
-arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
-aryantandon01 <80969509+aryantandon01@users.noreply.github.com>
-at8u <129688334+at8u@users.noreply.github.com>
-automaticcat <daogiatuank54@gmail.com>
-awatuna <23447591+awatuna@users.noreply.github.com>
-b4b4o <zwbao@foxmail.com>
-bandoti <141645996+bandoti@users.noreply.github.com>
-beiller <beiller@gmail.com>
-bhubbb <79117352+bhubbb@users.noreply.github.com>
-bmwl <brian.marshall@tolko.com>
-bobqianic <129547291+bobqianic@users.noreply.github.com>
-brucepro <git@brucepro.net>
-bryanSwk <93190252+bryanSwk@users.noreply.github.com>
-bsilvereagle <bsilvereagle@users.noreply.github.com>
-bssrdf <merlintiger@hotmail.com>
-byte-6174 <88070277+byte-6174@users.noreply.github.com>
-cduk <19917266+cduk@users.noreply.github.com>
-cebtenzzre <cebtenzzre@gmail.com>
-chaihahaha <chai836275709@gmail.com>
-chiranko <96988916+chiranko@users.noreply.github.com>
-clibdev <52199778+clibdev@users.noreply.github.com>
-clyang <clyang@clyang.net>
-cmdr2 <secondary.cmdr2@gmail.com>
-cmdr2 <shashank.shekhar.global@gmail.com>
-cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
-codezjx <code.zjx@gmail.com>
-coezbek <c.oezbek@gmail.com>
-comex <comexk@gmail.com>
-compilade <113953597+compilade@users.noreply.github.com>
-compilade <git@compilade.net>
-cpumaxx <163466046+cpumaxx@users.noreply.github.com>
-crasm <crasm@git.vczf.net>
-crasm <crasm@git.vczf.us>
-daboe01 <daboe01@googlemail.com>
-daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com>
-daminho <37615795+daminho@users.noreply.github.com>
-david raistrick <keen99@users.noreply.github.com>
-ddh0 <dylanhalladay02@icloud.com>
-ddpasa <112642920+ddpasa@users.noreply.github.com>
-deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
-devojony <61173062+devojony@users.noreply.github.com>
-ditsuke <ditsuke@protonmail.com>
-divinity76 <divinity76@gmail.com>
-dm4 <dm4@secondstate.io>
-dm4 <sunrisedm4@gmail.com>
-dotpy314 <33351922+dotpy314@users.noreply.github.com>
-drbh <david.richard.holtz@gmail.com>
-ds5t5 <145942675+ds5t5@users.noreply.github.com>
-dylan <canardleteer@users.noreply.github.com>
-eastriver <lee@eastriver.dev>
-ebraminio <ebrahim@gnu.org>
-ebraminio <ebraminio@gmail.com>
-eiery <19350831+eiery@users.noreply.github.com>
-eric8607242 <e0928021388@gmail.com>
-fairydreaming <166155368+fairydreaming@users.noreply.github.com>
-fengerhu1 <2748250768@qq.com>
-fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
-fraxy-v <65565042+fraxy-v@users.noreply.github.com>
-fxzjshm <11426482+fxzjshm@users.noreply.github.com>
-github-actions[bot] <github-actions[bot]@users.noreply.github.com>
-gliptic <gliptic@users.noreply.github.com>
-gn64 <yukikaze.jp@gmail.com>
-goerch <jhr.walter@t-online.de>
-grahameth <96447521+grahameth@users.noreply.github.com>
-gtygo <gtydoit@gmail.com>
-gwjr <502526+gwjr@users.noreply.github.com>
-h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
-hankcs <cnhankmc@gmail.com>
-haopeng <657407891@qq.com>
-hipudding <huafengchun@gmail.com>
-hoangmit <hoangmit@users.noreply.github.com>
-hongbo.mo <352280764@qq.com>
-hopkins385 <98618192+hopkins385@users.noreply.github.com>
-howlger <eclipse@voormann.de>
-howlger <github@voormann.de>
-hutli <6594598+hutli@users.noreply.github.com>
-hutli <hutli@hutli.hu>
-hutli <jensstaermose@hotmail.com>
-hxer7963 <hxer7963@gmail.com>
-hydai <z54981220@gmail.com>
-iSma <ismail.senhaji@gmail.com>
-iacore <74560659+iacore@users.noreply.github.com>
-icppWorld <124377669+icppWorld@users.noreply.github.com>
-igardev <49397134+igardev@users.noreply.github.com>
-igarnier <igarnier@protonmail.com>
-intelmatt <61025942+intelmatt@users.noreply.github.com>
-iohub <rickyang.pro@gmail.com>
-issixx <46835150+issixx@users.noreply.github.com>
-jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
-jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
-jameswu2014 <545426914@qq.com>
-jason_w <jason.wang@126.com>
-jdomke <28772296+jdomke@users.noreply.github.com>
-jiahao su <damow890@gmail.com>
-jiez <373447296@qq.com>
-jneem <joeneeman@gmail.com>
-joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
-johnson442 <56517414+johnson442@users.noreply.github.com>
-jojorne <jojorne@users.noreply.github.com>
-jon-chuang <9093549+jon-chuang@users.noreply.github.com>
-jp-x-g <jpxg-dev@protonmail.com>
-jukofyork <69222624+jukofyork@users.noreply.github.com>
-junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
-junchao-zhao <68935141+junchao-loongson@users.noreply.github.com>
-jwj7140 <32943891+jwj7140@users.noreply.github.com>
-k.h.lai <adrian.k.h.lai@outlook.com>
-kaizau <kaizau@users.noreply.github.com>
-kallewoof <kalle.alm@gmail.com>
-kalomaze <66376113+kalomaze@users.noreply.github.com>
-kang <tpdns9032100@gmail.com>
-katsu560 <118887472+katsu560@users.noreply.github.com>
-kchro3 <62481661+kchro3@users.noreply.github.com>
-khimaros <me@khimaros.com>
-kiltyj <kiltyj@gmail.com>
-klosax <131523366+klosax@users.noreply.github.com>
-krystiancha <krystian@krystianch.com>
-kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
-kunnis <kunnis@users.noreply.github.com>
-kuronekosaiko <EvanChanJ@163.com>
-kustaaya <58045274+kustaaya@users.noreply.github.com>
-kuvaus <22169537+kuvaus@users.noreply.github.com>
-kwin1412 <42286931+kwin1412@users.noreply.github.com>
-l3utterfly <gc.pthzfoldr@gmail.com>
-laik <laik.lj@me.com>
-ldwang <ftgreat@163.com>
-le.chang <cljs118@126.com>
-leejet <leejet714@gmail.com>
-leo-pony <nengjunma@outlook.com>
-lexasub <lexakopp2212@gmail.com>
-lhez <quic_lih@quicinc.com>
-limitedAtonement <limitedAtonement@users.noreply.github.com>
-liuwei-git <14815172+liuwei-git@users.noreply.github.com>
-lon <114724657+longregen@users.noreply.github.com>
-loonerin <132926317+loonerin@users.noreply.github.com>
-ltoniazzi <61414566+ltoniazzi@users.noreply.github.com>
-luoyu-intel <yu.luo@intel.com>
-m3ndax <adrian.goessl@outlook.com>
-maddes8cht <55592906+maddes8cht@users.noreply.github.com>
-magicse <magicse@users.noreply.github.com>
-mahorozte <41834471+mahorozte@users.noreply.github.com>
-makomk <makosoft@googlemail.com>
-manikbhandari <mbbhandarimanik2@gmail.com>
-maor-ps <154728172+maor-ps@users.noreply.github.com>
-mashdragon <122402293+mashdragon@users.noreply.github.com>
-matiaslin <45382001+matiaslin@users.noreply.github.com>
-matt23654 <matthew.webber@protonmail.com>
-matteo <matteogeniaccio@yahoo.it>
-mdrokz <mohammadmunshi@gmail.com>
-mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
-midnight <midnightmagic@users.noreply.github.com>
-minarchist <minarchist@users.noreply.github.com>
-mj-shifu <77107165+mj-shifu@users.noreply.github.com>
-mmyjona <jonathan.gonse@gmail.com>
-momonga <115213907+mmnga@users.noreply.github.com>
-momonga <146910567+mmngays@users.noreply.github.com>
-moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
-musoles <135031143+musoles@users.noreply.github.com>
-mzcu <milos.cubrilo@gmail.com>
-nanahi <130121847+na-na-hi@users.noreply.github.com>
-ngc92 <7938269+ngc92@users.noreply.github.com>
-nhamanasu <45545786+nhamanasu@users.noreply.github.com>
-niansa/tuxifan <anton-sa@web.de>
-niansa/tuxifan <tuxifan@posteo.de>
-nickp27 <nb.porter@gmail.com>
-ningshanwutuobang <ningshanwutuobang@gmail.com>
-nold <Nold360@users.noreply.github.com>
-nopperl <54780682+nopperl@users.noreply.github.com>
-nusu-github <29514220+nusu-github@users.noreply.github.com>
-olexiyb <olexiyb@gmail.com>
-omahs <73983677+omahs@users.noreply.github.com>
-oobabooga <112222186+oobabooga@users.noreply.github.com>
-opparco <parco.opaai@gmail.com>
-ostix360 <55257054+ostix360@users.noreply.github.com>
-pascal-lc <49066376+pascal-lc@users.noreply.github.com>
-pculliton <phillipculliton@gmail.com>
-peidaqi <peidaqi@gmail.com>
-pengxin99 <pengxin.yuan@intel.com>
-perserk <perserk@gmail.com>
-petterreinholdtsen <pere-github@hungry.com>
-piDack <104877312+piDack@users.noreply.github.com>
-pmysl <piotr.myslinski@outlook.com>
-postmasters <namnguyen@google.com>
-pudepiedj <pudepiedj@gmail.com>
-qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
-qingy1337 <qxli2@students.everettcc.edu>
-qouoq <qouoq@fastmail.com>
-qunash <anzoria@gmail.com>
-rabidcopy <rabidcopy@yahoo.com>
-rankaiyx <rankaiyx@rankaiyx.com>
-redbeard <bharrington@alticon.net>
-rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
-rhuddleston <ryan.huddleston@percona.com>
-rimoliga <53384203+rimoliga@users.noreply.github.com>
-runfuture <runfuture@users.noreply.github.com>
-sandyiscool <sandyiscool@gmail.com>
-sasha0552 <admin@sasha0552.org>
-semidark <me@semidark.net>
-serhii-nakon <57632032+serhii-nakon@users.noreply.github.com>
-sharpHL <132747147+sharpHL@users.noreply.github.com>
-shibe2 <shibe@tuta.io>
-simon886212 <37953122+simon886212@users.noreply.github.com>
-singularity <12184989+singularity-s0@users.noreply.github.com>
-sjinzh <sjinzh@gmail.com>
-sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
-slaren <2141330+slaren@users.noreply.github.com>
-slaren <slarengh@gmail.com>
-snadampal <87143774+snadampal@users.noreply.github.com>
-someone13574 <81528246+someone13574@users.noreply.github.com>
-standby24x7 <standby24x7@gmail.com>
-staviq <staviq@gmail.com>
-stduhpf <stephduh@live.fr>
-strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
-swittk <switt1995@gmail.com>
-takov751 <40316768+takov751@users.noreply.github.com>
-tarcey <cey.tarik@gmail.com>
-tc-mb <157115220+tc-mb@users.noreply.github.com>
-texmex76 <40733439+texmex76@users.noreply.github.com>
-thement <40525767+thement@users.noreply.github.com>
-theraininsky <76763719+theraininsky@users.noreply.github.com>
-thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
-tjohnman <tjohnman@users.noreply.github.com>
-toyer <2042519524@qq.com>
-tslmy <tslmy@users.noreply.github.com>
-tv1wnd <55383215+tv1wnd@users.noreply.github.com>
-ubik2 <ubik2@users.noreply.github.com>
-uint256_t <konndennsa@gmail.com>
-uint256_t <maekawatoshiki1017@gmail.com>
-unbounded <haakon@likedan.net>
-uvos <devnull@uvos.xyz>
-uvos <philipp@uvos.xyz>
-valiray <133289098+valiray@users.noreply.github.com>
-vb <vaibhavs10@gmail.com>
-vik <vikhyatk@gmail.com>
-viric <viric@viric.name>
-vmobilis <75476228+vmobilis@users.noreply.github.com>
-vodkaslime <646329483@qq.com>
-vvhg1 <94630311+vvhg1@users.noreply.github.com>
-vxiiduu <73044267+vxiiduu@users.noreply.github.com>
-wangshuai09 <391746016@qq.com>
-wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
-whoreson <139810751+whoreson@users.noreply.github.com>
 woachk <24752637+woachk@users.noreply.github.com>
 wonjun Jang <strutive07@gmail.com>
 woodx <124784234+woodx9@users.noreply.github.com>
+Woof Dog <197125663+woof-dog@users.noreply.github.com>
+wooksong <wook16.song@samsung.com>
+Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
+Wroclaw <wroclaw223@outlook.com>
+wsbagnsv1 <sclumpfpapa36@gmail.com>
+Wu Jian Ping <wujjpp@hotmail.com>
+Wu Jian Ping <wujp@greatld.com>
 wwoodsTM <104587230+wwoodsTM@users.noreply.github.com>
 wzy <32936898+Freed-Wu@users.noreply.github.com>
 xaedes <xaedes@gmail.com>
 xaedes <xaedes@googlemail.com>
 xctan <axunlei@gmail.com>
+xctan <xc-tan@outlook.com>
+Xiake Sun <xiake.sun@intel.com>
+Xiang (Kevin) Li <kevinli020508@gmail.com>
+Xiangyan Sun <wishstudio@gmail.com>
+Xiao-Yong Jin <jinxiaoyong@gmail.com>
 xiaobing318 <71554036+xiaobing318@users.noreply.github.com>
 xiaofei <hbuxiaofei@gmail.com>
+XiaotaoChen <chenxiaotao1234@gmail.com>
+Xiaoyi Chen <cxychina@gmail.com>
+Xie Yanbo <xieyanbo@gmail.com>
+Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xinpeng Dou <15529241576@163.com>
+Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
 xloem <0xloem@gmail.com>
+Xuan Son Nguyen <thichthat@gmail.com>
+Xuan-Son Nguyen <son@huggingface.co>
+Xuan-Son Nguyen <thichthat@gmail.com>
+yael-works <106673277+yael-works@users.noreply.github.com>
+YaelGitAccount <38328157276@mby.co.il>
+YaelLogic <y0548591250@gmail.com>
+Yaiko <elyaiko@hotmail.com>
+YangLe <smilingpoplar@gmail.com>
 yangli2 <yangli2@gmail.com>
+Yann Follet <131855179+YannFollet@users.noreply.github.com>
+Yaroslav <yaroslav.yashin@me.com>
+Yavor Ivanov <yavorgenadiev@gmail.com>
+Yazan Agha-Schrader <mountaiin@icloud.com>
+Ycros <18012+ycros@users.noreply.github.com>
+YehuditE <y8703470@gmail.com>
+Yibo Cai <cyb70289@gmail.com>
+Yibo Cai <yibo.cai@arm.com>
+yifant-code <tian.yifan123@gmail.com>
+Yiming Cui <conandiy@vip.qq.com>
+Yishuo Wang <MeouSker77@outlook.com>
 ymcki <84055651+ymcki@users.noreply.github.com>
+Yoshi Suhara <y.suhara@gmail.com>
+Yoshi Suhara <ysuhara@nvidia.com>
+Yoshi_likes_e4 <104140648+pt13762104@users.noreply.github.com>
+Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
+Yuanhao Ji <jiyuanhao@apache.org>
+Yuannan <yuannan@users.noreply.github.com>
+Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yüg <eugeniosegalaweb@gmail.com>
+Yui <dev@sleepyyui.com>
+Yuichiro Utsumi <81412151+utsumi-fj@users.noreply.github.com>
 yuiseki <yuiseki@gmail.com>
+yulo <77381088+zhang-hui-yulo@users.noreply.github.com>
+yumeyao <yumeyao@gmail.com>
+yummy <57988893+jk3456a@users.noreply.github.com>
+Yun Dou <dixyes@gmail.com>
+Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
 yuri@FreeBSD <yurivict@users.noreply.github.com>
+Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
+Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
+Yuxuan Zhang <2448370773@qq.com>
+Z <coffeevampirebusiness@gmail.com>
+Zagaj <m.zagajewska@gmail.com>
 zakkor <edward.partenie@gmail.com>
+Zane Shannon <z@zcs.me>
+Zay <95888118+isaiahbjork@users.noreply.github.com>
+Zenix <zenixls2@gmail.com>
+Zhang Peiyuan <a1286225768@gmail.com>
 zhangkaihuo <zhangkaihuo@gmail.com>
+ZHAOKAI WANG <sanxianwei@163.com>
+Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
 zhentaoyu <zhentao.yu@intel.com>
+Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
+Zheyuan Chen <sephirotheca17@gmail.com>
+Zhiyong Wang <85110830+ravenouse@users.noreply.github.com>
+Zhiyuan Li <lizhiyuan@uniartisan.com>
+Zhiyuan Li <uniartisan2017@gmail.com>
 zhouwg <6889919+zhouwg@users.noreply.github.com>
 zhouwg <zhouwg2000@gmail.com>
+ZhouYuChen <zhouyuchen@naver.com>
+Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
+Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
 zrm <trustiosity.zrm@gmail.com>
-Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
-杨朱 · Kiki <baofa.fan@daocloud.io>
-源文雨 <41315874+fumiama@users.noreply.github.com>
-蕭澧邦 <45505768+shou692199@users.noreply.github.com>
-谢乃闻 <sienaiwun@users.noreply.github.com>
-Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
+Zsapi <martin1.zsapka@gmail.com>

From 59377a6c870be95e4c71715933e4e9ada71b8356 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Mon, 2 Feb 2026 10:00:05 +0100
Subject: [PATCH 08/18] ggml-backend: fix async set/get fallback sync (#19179)

---
 ggml/src/ggml-backend.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 354876574a..22c656996c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
 
     if (backend->iface.set_tensor_async == NULL) {
+        ggml_backend_synchronize(backend);
         ggml_backend_tensor_set(tensor, data, offset, size);
     } else {
         backend->iface.set_tensor_async(backend, tensor, data, offset, size);
@@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
     GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
 
     if (backend->iface.get_tensor_async == NULL) {
+        ggml_backend_synchronize(backend);
         ggml_backend_tensor_get(tensor, data, offset, size);
     } else {
         backend->iface.get_tensor_async(backend, tensor, data, offset, size);

From 6156ae51114337ffb6bb46cb65f99227b255089f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 2 Feb 2026 11:29:57 +0100
Subject: [PATCH 09/18] model-conversion : add debug option to conversion
 script (#19265)

This commit adds a debug option to the model conversion script to enable
using the Python debugger (pdb) during model conversion.

The motivation for this is that I've found myself adding this a few
times now and it would be quicker to have this flag as an option and a
makefile target/recipe for it.
---
 examples/model-conversion/Makefile                   |  5 ++++-
 .../model-conversion/scripts/causal/convert-model.sh | 12 +++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/examples/model-conversion/Makefile b/examples/model-conversion/Makefile
index 3b0505911d..342de63bd0 100644
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -33,11 +33,14 @@ DEVICE ?= auto
 causal-convert-model-bf16: OUTTYPE=bf16
 causal-convert-model-bf16: causal-convert-model
 
+causal-convert-model-debug: DEBUG=--debug
+causal-convert-model-debug: causal-convert-model
+
 causal-convert-model:
 	$(call validate_model_path,causal-convert-model)
 	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
-	./scripts/causal/convert-model.sh
+	./scripts/causal/convert-model.sh $(DEBUG)
 
 causal-convert-mm-model-bf16: OUTTYPE=bf16
 causal-convert-mm-model-bf16: MM_OUTTYPE=f16
diff --git a/examples/model-conversion/scripts/causal/convert-model.sh b/examples/model-conversion/scripts/causal/convert-model.sh
index 32ffe132e7..a5865f6acd 100755
--- a/examples/model-conversion/scripts/causal/convert-model.sh
+++ b/examples/model-conversion/scripts/causal/convert-model.sh
@@ -4,12 +4,17 @@ set -e
 
 # Parse command line arguments
 MMPROJ=""
+DEBUG=""
 while [[ $# -gt 0 ]]; do
     case $1 in
         --mmproj)
             MMPROJ="--mmproj"
             shift
             ;;
+        --debug)
+            DEBUG="1"
+            shift
+            ;;
         *)
             shift
             ;;
@@ -28,7 +33,12 @@ echo "Data  type: ${TYPE}"
 echo "Converted model path:: ${CONVERTED_MODEL}"
 echo "Metadata override: ${METADATA_OVERRIDE}"
 
-CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
+if [[ -n "$DEBUG" ]]; then
+    CMD_ARGS=("python" "-m" "pdb")
+else
+    CMD_ARGS=("python")
+fi
+CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
 CMD_ARGS+=("${MODEL_PATH}")
 CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
 CMD_ARGS+=("--outtype" "${TYPE}")

From 6fdddb498780dbda2a14f8b49b92d25601e14764 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 2 Feb 2026 14:29:44 +0200
Subject: [PATCH 10/18] metal : support virtual devices (#18919)

* metal : support virtual devices

* cont : manage buffer type context memory

* metal : add events

* cont : implement cpy_tensor_async
---
 ggml/src/ggml-metal/ggml-metal-context.h  |   8 +
 ggml/src/ggml-metal/ggml-metal-context.m  | 105 +++++-
 ggml/src/ggml-metal/ggml-metal-device.cpp |   8 +-
 ggml/src/ggml-metal/ggml-metal-device.h   |  16 +-
 ggml/src/ggml-metal/ggml-metal-device.m   |  71 +++-
 ggml/src/ggml-metal/ggml-metal.cpp        | 426 ++++++++++++++++------
 src/llama-context.cpp                     |   1 +
 7 files changed, 508 insertions(+), 127 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-context.h b/ggml/src/ggml-metal/ggml-metal-context.h
index ec2b686b73..abf4b06ed2 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.h
+++ b/ggml/src/ggml-metal/ggml-metal-context.h
@@ -15,14 +15,22 @@ typedef struct ggml_metal * ggml_metal_t;
 ggml_metal_t ggml_metal_init(ggml_metal_device_t dev);
 void ggml_metal_free(ggml_metal_t ctx);
 
+const char * ggml_metal_get_name(ggml_metal_t ctx);
+
 void ggml_metal_synchronize(ggml_metal_t ctx);
 
 void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
 enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
 void             ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);
 
+void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev);
+void ggml_metal_event_wait  (ggml_metal_t ctx, ggml_metal_event_t ev);
+
+ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx);
+
 void ggml_metal_set_n_cb            (ggml_metal_t ctx, int n_cb);
 void ggml_metal_set_abort_callback  (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
 bool ggml_metal_supports_family     (ggml_metal_t ctx, int family);
diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m
index 42a35736ee..a412d70aed 100644
--- a/ggml/src/ggml-metal/ggml-metal-context.m
+++ b/ggml/src/ggml-metal/ggml-metal-context.m
@@ -24,9 +24,13 @@ struct ggml_metal_command_buffer {
 };
 
 struct ggml_metal {
+    char name[128];
+
     ggml_metal_device_t  dev;
     ggml_metal_library_t lib;
 
+    ggml_metal_event_t ev_cpy; // for async copies
+
     dispatch_queue_t d_queue;
 
     // additional, inference-time compiled pipelines
@@ -117,7 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
         }
     }
 
-    //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
+    res->ev_cpy = ggml_metal_device_event_init(dev);
+
+    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
+
+    snprintf(res->name, sizeof(res->name), "%s", props_dev->name);
 
     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
@@ -206,9 +214,15 @@ void ggml_metal_free(ggml_metal_t ctx) {
 
     dispatch_release(ctx->d_queue);
 
+    ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy);
+
     free(ctx);
 }
 
+const char * ggml_metal_get_name(ggml_metal_t ctx) {
+    return ctx->name; // name buffer stored in the context (filled from the device props name in ggml_metal_init)
+}
+
 void ggml_metal_synchronize(ggml_metal_t ctx) {
     // wait for any backend operations to finish
     if (ctx->cmd_buf_last) {
@@ -273,8 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
         // wrap the source data into a Metal buffer
         id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
         id<MTLBuffer> buf_src = [device newBufferWithBytes:data
-                                                         length:size
-                                                        options:MTLResourceStorageModeShared];
+                                                    length:size
+                                                   options:MTLResourceStorageModeShared];
 
         GGML_ASSERT(buf_src);
 
@@ -316,9 +330,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
     @autoreleasepool {
         id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
         id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
-                                                               length:size
-                                                              options:MTLResourceStorageModeShared
-                                                          deallocator:nil];
+                                                          length:size
+                                                         options:MTLResourceStorageModeShared
+                                                     deallocator:nil];
 
         GGML_ASSERT(buf_dst);
 
@@ -356,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
     }
 }
 
+bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+    // copy src into dst with a blit queued on ctx_src, without blocking the calling thread
+    // returns false (and queues nothing) if either tensor has no Metal buffer, true otherwise
+    @autoreleasepool {
+        struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src);
+        struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(dst);
+
+        if (bid_src.metal == nil || bid_dst.metal == nil) {
+            return false;
+        }
+
+        // queue the copy operation into the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx_src->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:bid_src.metal
+                   sourceOffset:bid_src.offs
+                       toBuffer:bid_dst.metal
+              destinationOffset:bid_dst.offs
+                           size:ggml_nbytes(src)];
+
+        [encoder endEncoding];
+
+        // signal the copy event from the same command buffer so that it cannot fire before the blit has executed
+        ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src);
+        ggml_metal_event_encode_signal(ev_cpy, cmd_buf);
+
+        [cmd_buf commit];
+
+        // do not wait for completion here - keep a reference to the command buffer and wait for it later if needed
+        [ctx_src->cmd_bufs_ext addObject:cmd_buf];
+        ctx_src->cmd_buf_last = cmd_buf;
+        [cmd_buf retain];
+
+        // queue a wait for the copy event on the destination context
+        ggml_metal_event_wait(ctx_dst, ev_cpy);
+
+        return true;
+    }
+}
+
 enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
     // number of nodes encoded by the main thread (empirically determined)
     const int n_main = 64;
@@ -530,6 +587,42 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
     //printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
 }
 
+void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) { // signal ev from a new command buffer on ctx's device queue
+    @autoreleasepool {
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
+
+        ggml_metal_event_encode_signal(ev, cmd_buf); // the new command buffer carries only the event signal
+
+        [cmd_buf commit];
+        // no blocking wait here: the buffer is appended to cmd_bufs_ext and becomes cmd_buf_last
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_last = cmd_buf;
+
+        [cmd_buf retain]; // manual retain; the matching release is not visible in this hunk
+    }
+}
+
+void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) { // queue a wait for ev on ctx's device queue
+    @autoreleasepool {
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
+
+        ggml_metal_event_encode_wait(ev, cmd_buf); // the new command buffer carries only the event wait
+
+        [cmd_buf commit];
+        // the calling thread is not blocked: the buffer is appended to cmd_bufs_ext and becomes cmd_buf_last
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_last = cmd_buf;
+
+        [cmd_buf retain]; // manual retain; the matching release is not visible in this hunk
+    }
+}
+
+ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
+    return ctx->ev_cpy; // per-context event for async copies, created in ggml_metal_init
+}
+
 void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
     if (ctx->n_cb != n_cb) {
         ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index 04c6137c5a..377b0d3eb8 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -17,10 +17,12 @@ struct ggml_metal_device_deleter {
 
 typedef std::unique_ptr<ggml_metal_device, ggml_metal_device_deleter> ggml_metal_device_ptr;
 
-ggml_metal_device_t ggml_metal_device_get(void) {
-    static ggml_metal_device_ptr ctx { ggml_metal_device_init() };
+ggml_metal_device_t ggml_metal_device_get(int device) {
+    static std::vector<ggml_metal_device_ptr> devs;
 
-    return ctx.get();
+    devs.emplace_back(ggml_metal_device_init(device));
+
+    return devs.back().get();
 }
 
 struct ggml_metal_pipelines {
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 3d01c56fb8..afb091e725 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -205,7 +205,9 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
 //
 
 struct ggml_metal_device_props {
+    int device;
     char name[128];
+    char desc[128];
 
     size_t max_buffer_size;
     size_t max_working_set_size;
@@ -224,11 +226,15 @@ struct ggml_metal_device_props {
     int op_offload_min_batch_size;
 };
 
-ggml_metal_device_t ggml_metal_device_init(void);
+typedef struct ggml_metal_event * ggml_metal_event_t;
+
+void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
+void ggml_metal_event_encode_wait  (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
+
+ggml_metal_device_t ggml_metal_device_init(int device);
 void ggml_metal_device_free(ggml_metal_device_t dev);
 
-// return a singleton that is automatically destroyed when the program exits
-ggml_metal_device_t ggml_metal_device_get(void);
+ggml_metal_device_t ggml_metal_device_get(int device);
 
 void * ggml_metal_device_get_obj  (ggml_metal_device_t dev); // id<MTLDevice>
 void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQueue>
@@ -240,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset
 
 void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
 
+ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev);
+void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev);
+void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev);
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
 bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 7f9c384c34..285dd1630e 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -24,9 +24,6 @@
 static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
 
-// virtual address for GPU memory allocations
-static atomic_uintptr_t g_addr_device = 0x000000400ULL;
-
 #if !GGML_METAL_EMBED_LIBRARY
 // Here to assist with NSBundle Path Hack
 @interface GGMLMetalClass : NSObject
@@ -523,6 +520,9 @@ struct ggml_metal_device {
     ggml_metal_library_t library;
 
     struct ggml_metal_device_props props;
+
+    // virtual address for GPU memory allocations
+    atomic_uintptr_t addr_virt;
 };
 
 //
@@ -618,7 +618,7 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
     free(rsets);
 }
 
-ggml_metal_device_t ggml_metal_device_init(void) {
+ggml_metal_device_t ggml_metal_device_init(int device) {
     ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
 
     assert(dev != NULL);
@@ -632,6 +632,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
                 GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
             }
 
+            dev->addr_virt = 0x000000400ULL;
+
+            dev->props.device = device;
             dev->props.has_simdgroup_reduction  = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
             dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
@@ -792,7 +795,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
                 dev->props.max_working_set_size   = dev->mtl_device.maxBufferLength;
             }
 
-            strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
+            snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
+            snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
 
             dev->library = ggml_metal_library_init(dev);
             if (!dev->library) {
@@ -922,6 +926,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
     atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
 }
 
+struct ggml_metal_event { // an MTLEvent plus a counter of the signal values issued for it
+    void * obj; // id<MTLEvent>
+
+    atomic_int value; // value of the most recently encoded signal (0 until the first signal is encoded)
+};
+
+void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+
+    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
+    // bump the counter and signal the incremented value, so every encoded signal gets a new, larger value
+    [cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
+}
+
+void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
+    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+
+    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
+    // wait for the counter's current value, i.e. the value of the most recently encoded signal
+    [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
+}
+
+ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
+    id<MTLEvent> event = [dev->mtl_device newEvent]; // released in ggml_metal_device_event_free
+
+    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
+    assert(ev != NULL); // same allocation check as in ggml_metal_device_init
+    ev->obj = (__bridge void *)event;
+    ev->value = 0; // no signal has been encoded yet
+
+    return ev;
+}
+
+void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
+    id<MTLEvent> event = ev->obj;
+    [event release]; // drops the reference obtained from newEvent in ggml_metal_device_event_init
+
+    free(ev); // the wrapper itself was allocated with calloc
+
+    GGML_UNUSED(dev);
+}
+
+void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
+    @autoreleasepool {
+        id<MTLEvent> event = ev->obj;
+        // block the caller: commit a command buffer that waits for the event's current counter value, then wait for it
+        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
+        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+}
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         *total = dev->mtl_device.recommendedMaxWorkingSetSize;
@@ -1344,8 +1401,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
         res->all_data = ggml_metal_host_malloc(size_aligned);
         res->is_shared = true;
     } else {
-        // use virtual address from g_addr_device counter
-        res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
+        // use virtual address
+        res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
         res->is_shared = false;
     }
     res->all_size = size_aligned;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp
index 56b59f0afd..a616dcdb46 100644
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -7,11 +7,12 @@
 #include "ggml-metal-context.h"
 #include "ggml-metal-ops.h"
 
-// globals
+#define GGML_METAL_NAME "MTL"
+#define GGML_METAL_MAX_DEVICES 16
 
-// initialized in ggml_backend_metal_reg
-static ggml_backend_reg    g_ggml_metal_reg;
-static ggml_backend_device g_ggml_metal_device;
+// number of Metal devices
+// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
+static int g_devices = 1;
 
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
@@ -165,10 +166,28 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
     /* .reset           = */ NULL,
 };
 
+static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) { // true if free_buffer is one of the two Metal callbacks
+    return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer ||
+           buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer;
+}
+
 //
 // buffer types
 //
 
+struct ggml_backend_metal_buffer_type { // context stored in buft->context of each Metal buffer type
+    int device; // index of the Metal device the buffer type is created for
+    std::string name; // string returned by the buffer type get_name callbacks
+};
+
+struct ggml_backend_metal_buffer_type_deleter {
+    void operator()(ggml_backend_metal_buffer_type * ctx) const {
+        delete ctx; // the contexts are allocated with new
+    }
+};
+
+typedef std::unique_ptr<ggml_backend_metal_buffer_type, ggml_backend_metal_buffer_type_deleter> ggml_backend_metal_buffer_type_ptr;
+
 // common method for allocating shread or private Metal buffers
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) {
     ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
@@ -218,9 +237,9 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
 // default (shared) buffer type
 
 static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) {
-    return "Metal";
+    ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
 
-    GGML_UNUSED(buft);
+    return ctx->name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -249,29 +268,54 @@ static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) {
-    static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_metal_buffer_type_shared_get_name,
-            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_metal_buffer_type_shared_get_alignment,
-            /* .get_max_size     = */ ggml_backend_metal_buffer_type_shared_get_max_size,
-            /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
-            /* .is_host          = */ ggml_backend_metal_buffer_type_shared_is_host,
-        },
-        /* .device  = */ &g_ggml_metal_device,
-        /* .context = */ NULL,
-    };
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(int device) {
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
 
-    return &ggml_backend_buffer_type_metal;
+    static std::vector<ggml_backend_buffer_type> bufts;
+    static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
+
+    static bool initialized = false;
+    if (!initialized) {
+        bufts.reserve(g_devices);
+        ctxs.reserve(g_devices);
+
+        for (int i = 0; i < g_devices; ++i) {
+            ggml_backend_metal_buffer_type * raw_ctx =
+                new ggml_backend_metal_buffer_type {
+                    /* .device = */ i,
+                    /* .name   = */ GGML_METAL_NAME + std::to_string(i),
+                };
+            ctxs.emplace_back(raw_ctx);
+
+            ggml_backend_buffer_type buft = {
+                /* .iface = */ {
+                    /* .get_name         = */ ggml_backend_metal_buffer_type_shared_get_name,
+                    /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
+                    /* .get_alignment    = */ ggml_backend_metal_buffer_type_shared_get_alignment,
+                    /* .get_max_size     = */ ggml_backend_metal_buffer_type_shared_get_max_size,
+                    /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
+                    /* .is_host          = */ ggml_backend_metal_buffer_type_shared_is_host,
+                },
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
+                /* .context = */ raw_ctx,
+            };
+
+            bufts.emplace_back(buft);
+        }
+
+        initialized = true;
+    }
+
+    return &bufts[device];
 }
 
 // default (private) buffer type
 
 static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) {
-    return "Metal_Private";
+    ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
 
-    GGML_UNUSED(buft);
+    return ctx->name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -300,29 +344,53 @@ static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_t
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) {
-    static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_metal_buffer_type_private_get_name,
-            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_metal_buffer_type_private_get_alignment,
-            /* .get_max_size     = */ ggml_backend_metal_buffer_type_private_get_max_size,
-            /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
-            /* .is_host          = */ ggml_backend_metal_buffer_type_private_is_host,
-        },
-        /* .device  = */ &g_ggml_metal_device,
-        /* .context = */ NULL,
-    };
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(int device) { // returns the "private" buffer type of Metal device index `device`
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex); // serializes the one-time construction of the table below
 
-    return &ggml_backend_buffer_type_metal;
+    static std::vector<ggml_backend_buffer_type> bufts;
+    static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs; // holds the per-device contexts that bufts[i].context points to
+
+    static bool initialized = false;
+    if (!initialized) {
+        bufts.reserve(g_devices);
+        ctxs.reserve(g_devices);
+
+        for (int i = 0; i < g_devices; ++i) {
+            ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
+                /* .device = */ i,
+                /* .name   = */ GGML_METAL_NAME + std::to_string(i) + "_Private"
+            };
+            ctxs.emplace_back(raw_ctx);
+
+            ggml_backend_buffer_type buft = {
+                /* .iface = */ {
+                    /* .get_name         = */ ggml_backend_metal_buffer_type_private_get_name,
+                    /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
+                    /* .get_alignment    = */ ggml_backend_metal_buffer_type_private_get_alignment,
+                    /* .get_max_size     = */ ggml_backend_metal_buffer_type_private_get_max_size,
+                    /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
+                    /* .is_host          = */ ggml_backend_metal_buffer_type_private_is_host,
+                },
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
+                /* .context = */ raw_ctx,
+            };
+
+            bufts.emplace_back(buft);
+        }
+        initialized = true;
+    }
+
+    GGML_ASSERT(device >= 0 && (size_t) device < bufts.size()); // fail loudly instead of indexing outside the table
+    return &bufts[device];
 }
 
 // mapped buffer type
 
 static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) {
-    return "Metal_Mapped";
+    ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
 
-    GGML_UNUSED(buft);
+    return ctx->name.c_str();
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -352,31 +420,55 @@ static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
-    // note: not obvious, but this buffer type still needs to implement .alloc_buffer:
-    //       https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
-    static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = {
-        /* .iface = */ {
-            /* .get_name         = */ ggml_backend_metal_buffer_type_mapped_get_name,
-            /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
-            /* .get_max_size     = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
-            /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
-            /* .is_host          = */ ggml_backend_metal_buffer_type_mapped_is_host,
-        },
-        /* .device  = */ &g_ggml_metal_device,
-        /* .context = */ NULL,
-    };
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(int device) { // returns the "mapped" buffer type of Metal device index `device`
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex); // serializes the one-time construction of the table below
 
-    return &ggml_backend_buffer_type_mapped_metal;
+    static std::vector<ggml_backend_buffer_type> bufts;
+    static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs; // holds the per-device contexts that bufts[i].context points to
+
+    static bool initialized = false;
+    if (!initialized) {
+        bufts.reserve(g_devices);
+        ctxs.reserve(g_devices);
+
+        for (int i = 0; i < g_devices; ++i) {
+            ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
+                /* .device = */ i,
+                /* .name   = */ GGML_METAL_NAME + std::to_string(i) + "_Mapped"
+            };
+            ctxs.emplace_back(raw_ctx);
+
+            // note: not obvious, but this buffer type still needs to implement .alloc_buffer:
+            //       https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
+            ggml_backend_buffer_type buft = {
+                /* .iface = */ {
+                    /* .get_name         = */ ggml_backend_metal_buffer_type_mapped_get_name,
+                    /* .alloc_buffer     = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
+                    /* .get_alignment    = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
+                    /* .get_max_size     = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
+                    /* .get_alloc_size   = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
+                    /* .is_host          = */ ggml_backend_metal_buffer_type_mapped_is_host,
+                },
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
+                /* .context = */ raw_ctx,
+            };
+
+            bufts.emplace_back(buft);
+        }
+        initialized = true;
+    }
+
+    GGML_ASSERT(device >= 0 && (size_t) device < bufts.size()); // fail loudly instead of indexing outside the table
+    return &bufts[device];
 }
 
 // backend
 
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
-    return "Metal";
+    ggml_metal_t ctx = (ggml_metal_t)backend->context;
 
-    GGML_UNUSED(backend);
+    return ggml_metal_get_name(ctx);
 }
 
 static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -409,12 +501,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg
 }
 
 static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    return false;
+    if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) {
+        return false; // at least one side is not a Metal backend: decline the copy
+    }
 
-    GGML_UNUSED(backend_src);
-    GGML_UNUSED(backend_dst);
-    GGML_UNUSED(src);
-    GGML_UNUSED(dst);
+    if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) {
+        return false; // at least one tensor does not live in a Metal buffer: decline the copy
+    }
+
+    ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context;
+    ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context;
+
+    //ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+    //ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
+
+    //ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context;
+    //ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context;
+
+    return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst); // both backends and both buffers are Metal: delegate and forward the result
 }
 
 static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
@@ -423,6 +527,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
     return ggml_metal_graph_compute(ctx, cgraph);
 }
 
+static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) { // backend .event_record hook
+    ggml_metal_t ctx = (ggml_metal_t)backend->context;
+    ggml_metal_event_t ev = (ggml_metal_event_t)event->context; // the Metal event carried in the generic wrapper's context
+
+    ggml_metal_event_record(ctx, ev);
+}
+
+static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { // backend .event_wait hook
+    ggml_metal_t ctx = (ggml_metal_t)backend->context;
+    ggml_metal_event_t ev = (ggml_metal_event_t)event->context; // the Metal event carried in the generic wrapper's context
+
+    ggml_metal_event_wait(ctx, ev);
+}
+
 static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_metal_t ctx = (ggml_metal_t)backend->context;
 
@@ -435,7 +553,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
     ggml_metal_t ctx = (ggml_metal_t)backend->context;
 
     ggml_metal_set_n_cb(ctx, n_cb);
-
 }
 
 static ggml_backend_i ggml_backend_metal_i = {
@@ -450,12 +567,8 @@ static ggml_backend_i ggml_backend_metal_i = {
     /* .graph_plan_update       = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
-
-    // the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
-    // in any case, these docs seem relevant if we ever decide to implement it:
-    // https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
+    /* .event_record            = */ ggml_backend_metal_event_record,
+    /* .event_wait              = */ ggml_backend_metal_event_wait,
     /* .graph_optimize          = */ ggml_backend_metal_graph_optimize,
 };
 
@@ -519,15 +632,17 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
 // backend device
 
 static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
-    return "Metal";
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
-    GGML_UNUSED(dev);
+    const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
+
+    return props_dev->name;
 }
 
 static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
     ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
-    return ggml_metal_device_get_props(ctx_dev)->name;
+    return ggml_metal_device_get_props(ctx_dev)->desc;
 }
 
 static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@@ -550,14 +665,14 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
 
     props->caps = {
-        /* .async                 = */ true,
-        /* .host_buffer           = */ false,
-        /* .buffer_from_host_ptr  = */ true,
-        /* .events                = */ false,
+        /* .async                = */ true,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ true,
     };
 }
 
-static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t dev, const char * params) {
     ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
 
     ggml_metal_t ctx = ggml_metal_init(ctx_dev);
@@ -587,7 +702,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml
 
     const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
 
-    return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
+    return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared(props_dev->device) : ggml_backend_metal_buffer_type_private(props_dev->device);
 }
 
 static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -595,7 +710,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
 
     ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size);
 
-    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size);
+    const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(props_dev->device), ggml_backend_metal_buffer_shared_i, res, size);
 }
 
 static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@@ -606,9 +723,10 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
 
 static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     return
+        buft->device == dev && ( // the buffer type must belong to this device and be one of the three Metal buffer types
         buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name ||
         buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name ||
-        buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name;
+        buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name);
 
     GGML_UNUSED(dev);
 }
@@ -632,45 +750,97 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g
             get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
 }
 
+static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) { // device .event_new hook: creates a Metal event and wraps it in a generic ggml_backend_event
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t event = ggml_metal_device_event_init(ctx_dev);
+    GGML_ASSERT(event); // the assertion fails if no Metal event was returned
+
+    ggml_backend_event_t ev = new ggml_backend_event { // heap-allocated wrapper; deleted in ggml_backend_metal_device_event_free()
+        /* .device  = */ dev,
+        /* .context = */ event,
+    };
+
+    return ev;
+}
+
+static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { // device .event_free hook
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
+
+    ggml_metal_device_event_free(ctx_dev, ev); // release the Metal event first ...
+
+    delete event; // ... then the generic wrapper allocated with `new` in ggml_backend_metal_device_event_new()
+}
+
+static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { // device .event_synchronize hook
+    ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
+
+    ggml_metal_event_t evt = (ggml_metal_event_t)event->context; // the Metal event carried in the generic wrapper's context
+
+    ggml_metal_device_event_synchronize(ctx_dev, evt);
+}
+
 static ggml_backend_device_i ggml_backend_metal_device_i = {
     /* .get_name             = */ ggml_backend_metal_device_get_name,
     /* .get_description      = */ ggml_backend_metal_device_get_description,
     /* .get_memory           = */ ggml_backend_metal_device_get_memory,
     /* .get_type             = */ ggml_backend_metal_device_get_type,
     /* .get_props            = */ ggml_backend_metal_device_get_props,
-    /* .init_backend         = */ ggml_backend_metal_device_init,
+    /* .init_backend         = */ ggml_backend_metal_device_init_backend,
     /* .get_buffer_type      = */ ggml_backend_metal_device_get_buffer_type,
     /* .get_host_buffer_type = */ NULL,
     /* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped,
     /* .supports_op          = */ ggml_backend_metal_device_supports_op,
     /* .supports_buft        = */ ggml_backend_metal_device_supports_buft,
     /* .offload_op           = */ ggml_backend_metal_device_offload_op,
-    /* .event_new            = */ NULL,
-    /* .event_free           = */ NULL,
-    /* .event_synchronize    = */ NULL,
+    /* .event_new            = */ ggml_backend_metal_device_event_new,
+    /* .event_free           = */ ggml_backend_metal_device_event_free,
+    /* .event_synchronize    = */ ggml_backend_metal_device_event_synchronize,
 };
 
 // backend registry
 
+struct ggml_backend_metal_reg {
+    std::vector<ggml_backend_dev_t> devices;
+};
+
+typedef struct ggml_backend_metal_reg * ggml_backend_metal_reg_t;
+
+static ggml_backend_metal_reg_t ggml_backend_metal_reg_init(void) {
+    ggml_backend_metal_reg_t ctx = new struct ggml_backend_metal_reg;
+
+    return ctx;
+}
+
+static void ggml_backend_metal_reg_free(ggml_backend_metal_reg_t ctx) {
+    delete ctx;
+}
+
+struct ggml_backend_metal_reg_deleter {
+    void operator()(ggml_backend_metal_reg_t ctx) {
+        ggml_backend_metal_reg_free(ctx);
+    }
+};
+
+typedef std::unique_ptr<struct ggml_backend_metal_reg, ggml_backend_metal_reg_deleter> ggml_backend_metal_reg_ptr;
+
 static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
-    return "Metal";
+    return GGML_METAL_NAME;
 
     GGML_UNUSED(reg);
 }
 
 static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
-    return 1;
-
-    GGML_UNUSED(reg);
+    ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
+    return ctx->devices.size();
 }
 
 static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index == 0);
-
-    return &g_ggml_metal_device;
-
-    GGML_UNUSED(reg);
-    GGML_UNUSED(index);
+    ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
+    GGML_ASSERT(index < ctx->devices.size());
+    return ctx->devices[index];
 }
 
 static ggml_backend_feature g_ggml_backend_metal_features[] = {
@@ -698,27 +868,67 @@ static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const
 
 static ggml_backend_reg_i ggml_backend_metal_reg_i = {
     /* .get_name         = */ ggml_backend_metal_reg_get_name,
-    /* .device_count     = */ ggml_backend_metal_reg_device_count,
-    /* .device_get       = */ ggml_backend_metal_reg_device_get,
+    /* .get_device_count = */ ggml_backend_metal_reg_device_count,
+    /* .get_device       = */ ggml_backend_metal_reg_device_get,
     /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
 };
 
-ggml_backend_reg_t ggml_backend_metal_reg(void) {
-    {
-        g_ggml_metal_reg = {
-            /* .api_version = */ GGML_BACKEND_API_VERSION,
-            /* .iface       = */ ggml_backend_metal_reg_i,
-            /* .context     = */ NULL,
-        };
+static ggml_backend_dev_t ggml_backend_metal_device_init(ggml_backend_reg_t reg, int device) { // allocates the generic device wrapper for Metal device index `device`
+    return new ggml_backend_device { // heap-allocated; released with ggml_backend_metal_device_free()
+        /* .iface   = */ ggml_backend_metal_device_i,
+        /* .reg     = */ reg,
+        /* .context = */ ggml_metal_device_get(device),
+    };
+}
 
-        g_ggml_metal_device = {
-            /* .iface   = */ ggml_backend_metal_device_i,
-            /* .reg     = */ &g_ggml_metal_reg,
-            /* .context = */ ggml_metal_device_get(),
-        };
+static void ggml_backend_metal_device_free(ggml_backend_dev_t dev) {
+    delete dev;
+}
+
+struct ggml_backend_device_deleter {
+    void operator()(ggml_backend_dev_t ctx) {
+        ggml_backend_metal_device_free(ctx);
+    }
+};
+
+typedef std::unique_ptr<ggml_backend_device, ggml_backend_device_deleter> ggml_backend_device_ptr;
+
+ggml_backend_reg_t ggml_backend_metal_reg(void) { // returns the static Metal backend registry, building it and its devices on the first call
+    static ggml_backend_reg reg;
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex); // serializes the one-time initialization below
+
+        static std::vector<ggml_backend_device_ptr> devs; // holds the device objects whose raw pointers are stored in reg_ctx->devices
+
+        if (!initialized) {
+            const char * env = getenv("GGML_METAL_DEVICES"); // read the override once, so g_devices cannot change after the devices exist
+            if (env) {
+                g_devices = atoi(env);
+            }
+
+            static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
+
+            for (int i = 0; i < g_devices; ++i) {
+                auto * dev = ggml_backend_metal_device_init(&reg, i);
+                devs.emplace_back(dev);
+
+                reg_ctx->devices.push_back(dev);
+            }
+
+            reg = {
+                /* .api_version = */ GGML_BACKEND_API_VERSION,
+                /* .iface       = */ ggml_backend_metal_reg_i,
+                /* .context     = */ reg_ctx.get(),
+            };
+        }
+
+        initialized = true;
     }
 
-    return &g_ggml_metal_reg;
+    return &reg;
 }
 
 GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 10b306a853..203852d0f1 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -317,6 +317,7 @@ llama_context::llama_context(
                 auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
                 if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                     // ignore CPU backend
+                    // TODO: should we ignore ACCEL types too?
                     continue;
                 }
                 auto * dev = ggml_backend_get_device(backend.get());

From 4d5e97267386ba5a9fe4c26f26df10fe1c218534 Mon Sep 17 00:00:00 2001
From: Tamar <Tamar0812@outlook.co.il>
Date: Mon, 2 Feb 2026 15:05:51 +0200
Subject: [PATCH 11/18] sycl: implement GGML_OP_TOP_K (#19242)

---
 docs/ops.md                      |   2 +-
 docs/ops/SYCL.csv                | 450 +++++++++++++++----------------
 ggml/src/ggml-sycl/ggml-sycl.cpp | 140 ++++++++++
 3 files changed, 366 insertions(+), 226 deletions(-)

diff --git a/docs/ops.md b/docs/ops.md
index 2c7c60dcca..ef1ebff8b0 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -113,7 +113,7 @@ Legend:
 |                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+|                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv
index c1d22e65d4..2aa51304b3 100644
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
@@ -9677,168 +9677,168 @@
 "SYCL0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","SYCL"
 "SYCL0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9847,16 +9847,16 @@
 "SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9865,16 +9865,16 @@
 "SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9883,16 +9883,16 @@
 "SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9901,16 +9901,16 @@
 "SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9919,16 +9919,16 @@
 "SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","SYCL"
@@ -9937,51 +9937,51 @@
 "SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","SYCL"
 "SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","SYCL"
-"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","SYCL"
+"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","SYCL"
 "SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","SYCL"
 "SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","SYCL"
 "SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","1","yes","SYCL"
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 12f1e7717b..c5139fd3dd 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1840,6 +1840,110 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
     }
 }
 
+static void top_k_f32_sycl( // For every row of src, writes the column indices of its k largest values to dst_indices.
+    const float * src, // input values; row r starts at src + r * ncols
+    int32_t * dst_indices, // output indices; row r starts at dst_indices + r * k
+    const int64_t ncols, // number of elements in each input row
+    const int64_t nrows, // number of rows; one work-group is launched per row
+    const int k, // indices produced per row; the fixed 32-entry scratch arrays below bound it
+    dpct::queue_ptr main_stream // queue the kernel is submitted to
+) {
+    const int block_size = 128; // work-items per work-group
+
+    const sycl::range<1> block_dims(block_size);
+    const sycl::range<1> grid_dims(nrows);
+
+    main_stream->submit([&](sycl::handler &cgh) {
+        sycl::local_accessor<float, 1> shared_vals(sycl::range<1>(block_size * k), cgh); // local memory: k candidate values per work-item
+        sycl::local_accessor<int, 1> shared_idx(sycl::range<1>(block_size * k), cgh); // local memory: column index of each candidate
+
+        cgh.parallel_for(
+            sycl::nd_range<1>(grid_dims * block_dims, block_dims),
+            [=](sycl::nd_item<1> item_ct1) {
+                const int row = item_ct1.get_group(0); // work-group id selects the row
+                const int tid = item_ct1.get_local_id(0); // position of this work-item inside the group
+
+                if (row >= nrows) return;
+
+                const float * src_row = src + row * ncols;
+                int32_t * dst_idx_row = dst_indices + row * k;
+
+                float local_vals[32]; // this work-item's best values, kept in descending order; only the first k slots are used
+                int local_idx[32]; // column index belonging to each entry of local_vals
+
+                for (int i = 0; i < k; i++) {
+                    local_vals[i] = -FLT_MAX; // sentinel: a value must be strictly greater than this to be inserted
+                    local_idx[i] = -1; // -1 marks an unfilled slot; it reaches the output if fewer than k values pass the '>' test
+                }
+
+                for (int col = tid; col < ncols; col += block_size) { // each work-item scans columns tid, tid + block_size, ...
+                    float val = src_row[col];
+
+                    if (val > local_vals[k-1]) { // only values beating the current k-th best are inserted
+                        int pos = k - 1;
+                        while (pos > 0 && val > local_vals[pos - 1]) { // find the insertion slot; strict '>' keeps earlier entries ahead on ties
+                            pos--;
+                        }
+
+                        for (int i = k - 1; i > pos; i--) { // shift smaller entries down one slot, dropping the last one
+                            local_vals[i] = local_vals[i - 1];
+                            local_idx[i] = local_idx[i - 1];
+                        }
+                        local_vals[pos] = val;
+                        local_idx[pos] = col;
+                    }
+                }
+
+                for (int i = 0; i < k; i++) { // publish this work-item's candidate list to local memory
+                    shared_vals[tid * k + i] = local_vals[i];
+                    shared_idx[tid * k + i] = local_idx[i];
+                }
+                item_ct1.barrier(sycl::access::fence_space::local_space); // all candidate lists must be written before the merge reads them
+
+                if (tid == 0) { // a single work-item merges the block_size candidate lists serially
+                    float final_vals[32]; // merged best values for the whole row, descending; first k slots used
+                    int final_idx[32]; // column index belonging to each entry of final_vals
+
+                    for (int i = 0; i < k; i++) {
+                        final_vals[i] = -FLT_MAX;
+                        final_idx[i] = -1;
+                    }
+
+                    for (int t = 0; t < block_size; t++) { // visit every work-item's list
+                        for (int i = 0; i < k; i++) {
+                            float val = shared_vals[t * k + i];
+                            int idx = shared_idx[t * k + i];
+
+                            if (val > final_vals[k-1]) { // same sorted insertion as in the per-work-item scan above
+                                int pos = k - 1;
+                                while (pos > 0 && val > final_vals[pos - 1]) {
+                                    pos--;
+                                }
+
+                                for (int j = k - 1; j > pos; j--) {
+                                    final_vals[j] = final_vals[j - 1];
+                                    final_idx[j] = final_idx[j - 1];
+                                }
+                                final_vals[pos] = val;
+                                final_idx[pos] = idx;
+                            }
+                        }
+                    }
+
+                    for (int i = 0; i < k; i++) { // store the merged indices for this row
+                        dst_idx_row[i] = final_idx[i];
+                    }
+
+                    if (k > 1) { // NOTE(review): swaps the two leading indices, so output is not strictly descending; presumably mirrors a reference implementation that leaves result order unspecified - confirm
+                        int32_t temp = dst_idx_row[0];
+                        dst_idx_row[0] = dst_idx_row[1];
+                        dst_idx_row[1] = temp;
+                    }
+                }
+            });
+    });
+}
+
 static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
                                const int nrows, queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
@@ -2231,6 +2335,30 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
                          main_stream, ctx.device);
 }
 
+static void ggml_sycl_op_top_k(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // Validates the tensors of a top-k node and launches top_k_f32_sycl on the context's stream.
+    const ggml_tensor * src0 = dst->src[0]; // the input tensor is the node's first source
+
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32); // kernel reads the input as float
+    GGML_ASSERT(dst->type == GGML_TYPE_I32); // kernel writes int32_t indices
+    GGML_ASSERT(ggml_is_contiguous(src0)); // kernel addresses row r as src + r * ncols
+
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src0_dd = static_cast<const float *>(src0->data);
+    int32_t * dst_dd = static_cast<int32_t *>(dst->data);
+
+    const int k = dst->ne[0]; // k is the first dimension of the destination tensor
+    const int64_t ncols = src0->ne[0]; // row length is the first dimension of the input
+    const int64_t nrows = ggml_nrows(src0); // NOTE(review): presumably the product of the remaining dimensions - confirm
+
+    GGML_ASSERT(k > 0 && k <= 32); // the kernel's per-work-item scratch arrays hold 32 entries
+    GGML_ASSERT(k <= ncols);
+
+    top_k_f32_sycl(src0_dd, dst_dd, ncols, nrows, k, main_stream);
+}
+
 inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_I32);
@@ -4007,6 +4135,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_ARGSORT:
             ggml_sycl_argsort(ctx, dst);
             break;
+        case GGML_OP_TOP_K:
+            ggml_sycl_op_top_k(ctx, dst);
+            break;
         case GGML_OP_TIMESTEP_EMBEDDING:
             ggml_sycl_op_timestep_embedding(ctx, dst);
             break;
@@ -4710,6 +4841,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_ARGSORT:
             return op->src[0]->ne[0] * sizeof(int) <=
                    ggml_sycl_info().devices[device].smpbo;
+        case GGML_OP_TOP_K: {
+            const ggml_tensor * src0 = op->src[0];
+            const int k = op->ne[0];
+            return src0 &&
+                op->type == GGML_TYPE_I32 &&
+                src0->type == GGML_TYPE_F32 &&
+                ggml_is_contiguous(src0) &&
+                k > 0 && k <= 32;
+        }
         case GGML_OP_POOL_2D:
         case GGML_OP_ACC:
             return true;

From bf38346d136c6121732c36c60b079bde18d63a0e Mon Sep 17 00:00:00 2001
From: Neo Zhang <zhang.jianyu@outlook.com>
Date: Mon, 2 Feb 2026 21:06:21 +0800
Subject: [PATCH 12/18] Remove support for Nvidia & AMD GPU, because the oneAPI
 plugin for Nvidia & AMD GPU is unavailable: download/installation channels
 are out of work. (#19246)

User can't build up the software for Nvidia & AMD GPU.
rm the oneMath since it is only used in NV and AMD code path.
---
 docs/backend/SYCL.md               | 126 ++++-------------------------
 ggml/src/ggml-sycl/CMakeLists.txt  |  99 +++--------------------
 ggml/src/ggml-sycl/dpct/helper.hpp |  65 +++++----------
 ggml/src/ggml-sycl/ggml-sycl.cpp   |  35 +++-----
 ggml/src/ggml-sycl/outprod.cpp     |   6 +-
 ggml/src/ggml-sycl/rope.cpp        |   1 -
 ggml/src/ggml-sycl/wkv.cpp         |   2 +-
 7 files changed, 60 insertions(+), 274 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 10cb02ff2c..b3cff96604 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -22,12 +22,11 @@
 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
 - **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
-- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
 
 ### Llama.cpp + SYCL
 
 The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
-SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
+SYCL cross-platform capabilities enable support for other vendor GPUs as well.
 
 ## Recommended Release
 
@@ -42,6 +41,9 @@ The following releases are verified and recommended:
 
 ## News
 
+- 2026.02
+  - Remove support for Nvidia & AMD GPUs because the oneAPI plugins for Nvidia & AMD GPUs are no longer available: the download/installation channels no longer work, so users cannot build the software for these GPUs.
+
 - 2025.11
   - Support malloc memory on device more than 4GB.
 
@@ -111,8 +113,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc A-Series              | Support | Arc A770, Arc A730M, Arc A750         |
-| Intel Arc B-Series              | Support | Arc B580         |
+| Intel Arc A-Series            | Support | Arc A770, Arc A730M, Arc A750         |
+| Intel Arc B-Series            | Support | Arc B580                              |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
 | Intel iGPU                    | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7  |
 
@@ -127,20 +129,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 
 ### Other Vendor GPU
 
-**Verified devices**
-
-| Nvidia GPU               | Status    | Verified Model |
-|--------------------------|-----------|----------------|
-| Ampere Series            | Supported | A100, A4000    |
-| Ampere Series *(Mobile)* | Supported | RTX 40 Series  |
-
-| AMD GPU                  | Status       | Verified Model |
-|--------------------------|--------------|----------------|
-| Radeon Pro               | Experimental | W6800          |
-| Radeon RX                | Experimental | 6700 XT        |
-
-Note: AMD GPU support is highly experimental and is incompatible with F16.
-Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
+NA
 
 ## Docker
 
@@ -149,11 +138,11 @@ The docker build option is currently limited to *Intel GPU* targets.
 ### Build image
 
 ```sh
-# Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
-
 # Using FP32
 docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
+
+# Using FP16
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
 ```
 
 *Notes*:
@@ -212,14 +201,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
  `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```
 
-- **Nvidia GPU**
-
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
-
-- **AMD GPU**
-
-To target AMD GPUs with SYCL, the ROCm stack must be installed first.
-
 2. **Install Intel® oneAPI Base toolkit**
 
 SYCL backend depends on:
@@ -248,23 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
 |2025.1|
 |2024.1|
 
-- **Adding support to Nvidia GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-
-**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
-
-```sh
-git clone https://github.com/oneapi-src/oneDNN.git
-cd oneDNN
-cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-cmake --build build-nvidia --config Release
-```
-
-- **Adding support to AMD GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
-
 3. **Verify installation and environment**
 
 In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -285,25 +249,6 @@ When targeting an intel GPU, the user should expect one or more devices among th
 [opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO  [24.39.31294]
 ```
 
-- **Nvidia GPU**
-
-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
-
-```
-[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
-```
-
-- **AMD GPU**
-
-For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
-
-```
-[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
-[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
-```
-
 ### II. Build llama.cpp
 
 #### Intel GPU
@@ -332,47 +277,6 @@ It is possible to come across some precision issues when running tests that stem
 instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
 as `-cl-fp32-correctly-rounded-divide-sqrt`
 
-#### Nvidia GPU
-
-The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
-By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
-
-```sh
-# Build LLAMA with Nvidia BLAS acceleration through SYCL
-# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
-GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
-
-# Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
-
-# build all binary
-cmake --build build --config Release -j -v
-```
-
-It is possible to come across some precision issues when running tests that stem from using faster
-instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
-
-#### AMD GPU
-
-The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
-By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
-
-```sh
-# Build LLAMA with rocBLAS acceleration through SYCL
-
-## AMD
-# Use FP32, FP16 is not supported
-# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
-GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# build all binary
-cmake --build build --config Release -j -v
-```
-
 ### III. Run the inference
 
 #### Retrieve and prepare model
@@ -766,15 +670,15 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | Name               | Value                                 | Function                                    |
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL          | ON (mandatory)                        | Enable build with SYCL code path.           |
-| GGML_SYCL_TARGET   | INTEL *(default)* \| NVIDIA \| AMD    | Set the SYCL target device type.            |
-| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD)             | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
+| GGML_SYCL_TARGET   | INTEL *(default)*                     | Set the SYCL target device type.            |
+| GGML_SYCL_DEVICE_ARCH | Optional                           | Set the SYCL device architecture. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16      | OFF *(default)* \|ON *(optional)*     | Enable FP16 build with SYCL code path. (1.) |
-| GGML_SYCL_GRAPH    | ON *(default)* \|OFF *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
+| GGML_SYCL_GRAPH    | OFF *(default)* \|ON *(Optional)*     | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
 | GGML_SYCL_DNN      | ON *(default)* \|OFF *(Optional)*     | Enable build with oneDNN.                   |
 | CMAKE_C_COMPILER   | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path.      |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |
 
-1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
+1. FP32 and FP16 builds affect LLM performance differently. It is recommended to test both to find the better prompt processing performance for your models. You need to rebuild the code after changing `GGML_SYCL_F16=OFF/ON`.
 
 #### Runtime
 
@@ -782,7 +686,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
-| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
+| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graphs feature. Disabled by default because SYCL Graph is still under development and does not yet improve performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 5a89d8dd68..eefdd9725c 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -1,7 +1,7 @@
 message(STATUS  "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
 
-if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
-    message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
+    message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
 endif()
 
 check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
@@ -125,25 +125,22 @@ endif()
 target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
 
 if (GGML_SYCL_F16)
-    if (GGML_SYCL_TARGET STREQUAL "AMD")
-        message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
-    endif()
     add_compile_definitions(GGML_SYCL_F16)
 endif()
 
 if (GGML_SYCL_TARGET STREQUAL "INTEL")
     add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
     target_link_options(ggml-sycl PRIVATE  -Xs   -ze-intel-greater-than-4GB-buffer-required)
-elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
-elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-    # INFO: Allowed Sub_group_sizes are not consistent through all
-    # hip targets. For example, 64 is used for certain models, but the backend
-    # does not support it.
-    # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
-    add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+
+    # Link against Intel oneMKL
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(SYCL_COMPILER ON)
+    endif()
+    find_package(MKL REQUIRED)
+    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
 else()
     # default for other target
+    message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
     add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
 endif()
 
@@ -151,82 +148,6 @@ if (GGML_SYCL_GRAPH)
     target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
 endif()
 
-# Link against Intel oneMKL or oneMath
-if (GGML_SYCL_TARGET STREQUAL "INTEL")
-    # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
-    # See https://github.com/uxlfoundation/oneMath/issues/654
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        set(SYCL_COMPILER ON)
-    endif()
-    find_package(MKL REQUIRED)
-    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
-    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
-else()
-    find_package(oneMath QUIET)
-    if (NOT oneMath_FOUND)
-        message(STATUS "oneMath not found: oneMath will be automatically downloaded")
-        # Use FetchContent to automatically pull and build oneMath
-        include(FetchContent)
-        set(BUILD_FUNCTIONAL_TESTS False)
-        set(BUILD_EXAMPLES False)
-        set(TARGET_DOMAINS blas)
-        if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-            set(ENABLE_MKLCPU_BACKEND False)
-            set(ENABLE_MKLGPU_BACKEND False)
-            set(ENABLE_CUBLAS_BACKEND True)
-        elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-            set(ENABLE_MKLCPU_BACKEND False)
-            set(ENABLE_MKLGPU_BACKEND False)
-            set(ENABLE_ROCBLAS_BACKEND True)
-            # Ensure setting a string variable here is not overriden by oneMath CACHE variables
-            cmake_policy(SET CMP0126 NEW)
-            # Setting the device architecture is only needed and useful for AMD devices in oneMath
-            set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
-        endif()
-        FetchContent_Declare(
-            ONEMATH
-            GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
-            GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
-        )
-        FetchContent_MakeAvailable(ONEMATH)
-        # Create alias to match with find_package targets name
-        function(onemath_alias target)
-            if (TARGET ${target}_obj)
-                # Silence verbose warnings from external libraries
-                target_compile_options(${target}_obj PRIVATE -w)
-            endif()
-            if (TARGET ${target})
-                add_library(ONEMATH::${target} ALIAS ${target})
-            endif()
-        endfunction()
-        onemath_alias(onemath)
-        onemath_alias(onemath_blas_mklcpu)
-        onemath_alias(onemath_blas_mklgpu)
-        onemath_alias(onemath_blas_cublas)
-        onemath_alias(onemath_blas_rocblas)
-    endif()
-
-    # Below oneMath compile-time dispatching is used for better performance
-    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
-        target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
-        target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
-        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
-    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
-        if (NOT GGML_SYCL_DEVICE_ARCH)
-            message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
-        endif()
-        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
-        target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
-        target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
-        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
-    else()
-        # Fallback to oneMath runtime dispatcher
-        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
-        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
-    endif()
-endif()
-
 if (GGML_SYCL_DEVICE_ARCH)
     target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
     target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 8ae8098717..ece66a7ac1 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -15,17 +15,9 @@
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
-#include <map>
-
-#ifdef GGML_SYCL_USE_INTEL_ONEMKL
 #include <oneapi/mkl.hpp>
-// Allow to use the same namespace for Intel oneMKL and oneMath
-namespace oneapi {
-    namespace math = mkl;
-}
-#else
-#include <oneapi/math.hpp>
-#endif
+
+#include <map>
 
 #include "ggml.h"
 
@@ -91,32 +83,13 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
 }
 
 template <typename Ts> struct matrix_info_t {
-    oneapi::math::transpose transpose_info[2];
+    oneapi::mkl::transpose transpose_info[2];
     Ts                     value_info[2];
     std::int64_t           size_info[3];
     std::int64_t           ld_info[3];
     std::int64_t           groupsize_info;
 };
 
-inline auto get_onemath_backend(sycl::queue& queue)
-#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
-  -> sycl::queue&
-#endif
-{
-// If the backend is known at compile-time, use oneMath backend_selector to use
-// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
-// fallback to runtime dispatching.
-#if defined(GGML_SYCL_NVIDIA)
-    return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
-#elif defined(GGML_SYCL_AMD)
-    return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
-#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
-    return queue;
-#else
-    static_assert(false, "Unsupported backend");
-#endif
-}
-
 namespace dpct
 {
     typedef sycl::queue *queue_ptr;
@@ -1734,7 +1707,7 @@ namespace dpct
     namespace detail
     {
     template <class Ta, class Tb, class Tc, class Ts>
-    inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+    inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
                           int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
                           const void * beta, void * c, int ldc) {
         Ts   alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
@@ -1742,7 +1715,7 @@ namespace dpct
         auto data_a      = get_memory<const Ta>(a);
         auto data_b      = get_memory<const Tb>(b);
         auto data_c      = get_memory<Tc>(c);
-        oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
+        oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a,
                                                lda, data_b, ldb, beta_value, data_c, ldc);
     }
 
@@ -1774,7 +1747,7 @@ namespace dpct
         };
 
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
+        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
                                     int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
                                     int ldb, const void * beta, void ** c, int ldc, int batch_size,
                                     matrix_info_t<float> * matrix_info) {
@@ -1793,8 +1766,8 @@ namespace dpct
             matrix_info->ld_info[2] = ldc;
             matrix_info->groupsize_info = batch_size;
 
-            sycl::event e = oneapi::math::blas::column_major::gemm_batch(
-                get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
                 matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
                 reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
                 reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
@@ -1803,7 +1776,7 @@ namespace dpct
         }
 
         template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
+        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
                                     int m, int n, int k, const void * alpha, const void * a, int lda,
                                     long long int stride_a, const void * b, int ldb, long long int stride_b,
                                     const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
@@ -1812,7 +1785,7 @@ namespace dpct
             auto data_a = get_memory<const Ta>(a);
             auto data_b = get_memory<const Tb>(b);
             auto data_c = get_memory<Tc>(c);
-            oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
+            oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value,
                                                          data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
                                                          data_c, ldc, stride_c, batch_size);
         }
@@ -2299,7 +2272,7 @@ namespace dpct
                            sycl::range<3>(x, y, 1), direction);
     }
 
-    inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
+    inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n,
                      int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
                      library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
                      library_data_t scaling_type) {
@@ -2366,7 +2339,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
         {
-            detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
             break;
         }
@@ -2405,7 +2378,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
         {
-            detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
             break;
         }
@@ -2447,7 +2420,7 @@ namespace dpct
     /// \param [in] ldc Leading dimension of C.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+    inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
                            int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
                            const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
                            library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
@@ -2485,7 +2458,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
         {
-            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
             break;
         }
@@ -2493,7 +2466,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
         {
-            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
             break;
         }
@@ -2569,7 +2542,7 @@ namespace dpct
     /// \param [in] stride_c Stride between the different C matrices.
     /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
     /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
+    inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
                            int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
                            long long int stride_a, const void * b, library_data_t b_type, int ldb,
                            long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
@@ -2642,7 +2615,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_bfloat16, library_data_t::real_float):
         {
-            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
                 batch_size);
             break;
@@ -2651,7 +2624,7 @@ namespace dpct
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
         {
-            detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
                 q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
                 batch_size);
             break;
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index c5139fd3dd..a03d26d7f2 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2167,8 +2167,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
             const sycl::half alpha_f16 = 1.0f;
             const sycl::half beta_f16  = 0.0f;
             SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
-                *stream, oneapi::math::transpose::trans,
-                oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
+                *stream, oneapi::mkl::transpose::trans,
+                oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
                 &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
                 src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
                 dst_f16.get(), dpct::library_data_t::real_half, ldc,
@@ -2211,8 +2211,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
         {
             const float alpha = 1.0f;
             const float beta  = 0.0f;
-            SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
-                get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
+            SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+                *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
                 src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
                 dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
         }
@@ -3165,8 +3165,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
             const int64_t smb = ne12 == 1 ? s13       : s12;
 
             // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
-                                                        oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
+                                                        oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
                                                         src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
                                                         src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
                                                         mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
@@ -3190,7 +3190,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
             });
 
             SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-                *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+                *queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
                 (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
                 (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
                 (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
@@ -3524,12 +3524,11 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
-    // mmvq path is faster in the CUDA backend.
-    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
-        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
-        // is enabled takes precedence over DMMV, the current if-else implementation
-        // requires disabling DMMV if both conditions are met
-        || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
+    // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+    // is enabled takes precedence over DMMV, the current if-else implementation
+    // requires disabling DMMV if both conditions are met
+    if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
+                                          ggml_sycl_supports_reorder_mmvq(src0->type)))) {
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
     }
 
@@ -4189,16 +4188,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
     ggml_sycl_set_device(device);
 
-    /*
-    DPCT1009:218: SYCL uses exceptions to report errors and does not use the
-    error codes. The original code was commented out and a warning string was
-    inserted. You need to rewrite this code.
-    */
-    /*
-    DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
-    device information which may not be supported by all compilers or runtimes.
-    You may need to adjust the code.
-    */
     SYCL_CHECK(CHECK_TRY_ERROR(
         dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
 }
diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp
index 3a17f3a1b8..f52b11f0d6 100644
--- a/ggml/src/ggml-sycl/outprod.cpp
+++ b/ggml/src/ggml-sycl/outprod.cpp
@@ -32,12 +32,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
 
     // Handle transposition of src1
     const bool src1_T = ggml_is_transposed(src1);
-    const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
+    const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
     const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
 
     try {
-        // Perform matrix multiplication using oneMath GEMM
-        oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
+        // Perform matrix multiplication using oneMKL GEMM
+        oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
                                                ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
     }
     catch (sycl::exception const& exc) {
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 69140b19a4..aeaa58b95b 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -207,7 +207,6 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons
         const int p = sector;
         theta_base  = pos[channel_x] * sycl::pow(theta_scale, (float) p);
     } else {
-        // Simplified from CUDA backend code: if (sector >= sections.v[0] && sector < sec_w) which is just sector >= sections.v[0]
         const int p = sector - sections.v[0];
         theta_base  = pos[channel_x + ne2] * sycl::pow(theta_scale, (float) p);
     }
diff --git a/ggml/src/ggml-sycl/wkv.cpp b/ggml/src/ggml-sycl/wkv.cpp
index c10e2f7645..b56e0c2400 100644
--- a/ggml/src/ggml-sycl/wkv.cpp
+++ b/ggml/src/ggml-sycl/wkv.cpp
@@ -1,7 +1,7 @@
 #include <sycl/sycl.hpp>
 #include "wkv.hpp"
 
-constexpr int WKV_BLOCK_SIZE = 64;  // Matching CUDA_WKV_BLOCK_SIZE
+constexpr int WKV_BLOCK_SIZE = 64;
 
 // Helper function for the main kernel
 template <int block_size>

From 15818ac44c551d6d668c69bcfbf9a937c98f9e5a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 2 Feb 2026 22:40:28 +0800
Subject: [PATCH 13/18] ci: add test-backend-ops test for CPU (#19268)

---
 .github/workflows/build.yml |  4 ++--
 ci/run.sh                   | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f738edefc4..fd251ac4c2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1532,7 +1532,7 @@ jobs:
       - name: Test
         id: ggml-ci
         run: |
-          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-high-perf:
     runs-on: ubuntu-22.04-arm
@@ -1558,7 +1558,7 @@ jobs:
       - name: Test
         id: ggml-ci
         run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
   ggml-ci-arm64-cpu-high-perf-sve:
     runs-on: ubuntu-22.04-arm
diff --git a/ci/run.sh b/ci/run.sh
index dfcf959661..96755ea13e 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -635,6 +635,29 @@ function gg_check_build_requirements {
     fi
 }
 
+function gg_run_test_backend_ops_cpu {
+    cd ${SRC}
+
+    cd build-ci-release
+
+    set -e
+
+    (time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
+
+    set +e
+}
+
+function gg_sum_test_backend_ops_cpu {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test-backend-ops for CPU backend\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
 ## main
 
 export LLAMA_LOG_PREFIX=1
@@ -663,6 +686,10 @@ ret=0
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
+if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
+    test $ret -eq 0 && gg_run test_backend_ops_cpu
+fi
+
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
     test $ret -eq 0 && gg_run rerank_tiny

From a3fa03582240a4279ba019a3db2bb87311d5d485 Mon Sep 17 00:00:00 2001
From: Matthieu Coudron <886074+teto@users.noreply.github.com>
Date: Mon, 2 Feb 2026 16:55:27 +0100
Subject: [PATCH 14/18] server: print actual model name in "model not found"
 error (#19117)

Experimenting with AI, my environment gets messy fast and it's not
always easy to know what model my software is trying to load. This helps
with troubleshooting.

before:

Error: {
  code = 400,
  message = "model not found",
  type = "invalid_request_error"
}

After:

Error: {
  code = 400,
  message = "model 'toto' not found",
  type = "invalid_request_error"
}
---
 tools/server/server-models.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 803cb02e6e..57655476af 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -767,7 +767,7 @@ static bool router_validate_model(const std::string & name, server_models & mode
     }
     auto meta = models.get_meta(name);
     if (!meta.has_value()) {
-        res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     if (models_autoload) {

From 9f682fb640765ff79ee13a7a00cdbaa15c1ed07a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Tue, 3 Feb 2026 01:19:55 +0800
Subject: [PATCH 15/18] ggml-cpu: FA split across kv for faster TG (#19209)

* ggml-cpu: split across kv for faster TG

* simplify sinks application

* add ref impl
---
 ggml/include/ggml-cpu.h           |   5 +
 ggml/src/ggml-cpu/ggml-cpu-impl.h |   3 +
 ggml/src/ggml-cpu/ggml-cpu.c      |  22 ++-
 ggml/src/ggml-cpu/ggml-cpu.cpp    |  15 ++
 ggml/src/ggml-cpu/ops.cpp         | 237 ++++++++++++++++++++++--------
 tests/test-backend-ops.cpp        |   7 +
 6 files changed, 220 insertions(+), 69 deletions(-)

diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4f3b99c8d0..e3e067c916 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -19,6 +19,9 @@ extern "C" {
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // use only reference implementations
+        bool use_ref;
     };
 
     // numa strategies
@@ -132,6 +135,8 @@ extern "C" {
     GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
     GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
+    GGML_BACKEND_API void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref);
+
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *,       float *, int64_t);
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 0e8dd0ae05..88a9c9ec05 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -24,6 +24,9 @@ struct ggml_compute_params {
     void * wdata;
 
     struct ggml_threadpool * threadpool;
+
+    // use reference implementation
+    bool use_ref;
 };
 
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index b1de2ae871..3e5f01e3fb 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -5,7 +5,6 @@
 #include "ggml-backend.h"
 #include "traits.h"
 #include "ggml-cpu-impl.h"
-#include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "quants.h"
 #include "ggml-threading.h"
@@ -2867,12 +2866,20 @@ struct ggml_cplan ggml_graph_plan(
                     } break;
                 case GGML_OP_FLASH_ATTN_EXT:
                     {
+                        const int64_t neq2 = node->src[0]->ne[2]; // number of query heads
                         const int64_t DK = node->src[1]->ne[0];
                         const int64_t DV = node->src[2]->ne[0];
 
                         // Tiled flash attention scratch (tile sizes defined in common.h)
                         // Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
-                        cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
+                        size_t prefill  = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
+
+                        // Decode path: n_kv_chunks = n_tasks (one chunk per thread)
+                        // Per-thread: VKQ accumulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
+                        size_t n_chunks = n_tasks;
+                        size_t decode   = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV));
+
+                        cur += MAX(prefill, decode);
                     } break;
                 case GGML_OP_FLASH_ATTN_BACK:
                     {
@@ -2929,11 +2936,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
-        /*.ith       =*/ state->ith,
-        /*.nth       =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
-        /*.wsize     =*/ cplan->work_size,
-        /*.wdata     =*/ cplan->work_data,
-        /*.threadpool=*/ tp,
+        /*.ith        =*/ state->ith,
+        /*.nth        =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
+        /*.wsize      =*/ cplan->work_size,
+        /*.wdata      =*/ cplan->work_data,
+        /*.threadpool =*/ tp,
+        /*.use_ref    =*/ cplan->use_ref,
     };
 
     GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index f4713a4218..ddf1737a31 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -105,6 +105,8 @@ struct ggml_backend_cpu_context {
 
     ggml_abort_callback abort_callback;
     void *              abort_callback_data;
+
+    bool                use_ref;  // use reference implementation
 };
 
 static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
@@ -143,6 +145,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
 
     cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
     cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cplan.use_ref             = cpu_ctx->use_ref;
 
     return cpu_plan;
 }
@@ -182,6 +185,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
 
     cplan.abort_callback      = cpu_ctx->abort_callback;
     cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cplan.use_ref             = cpu_ctx->use_ref;
 
     return ggml_graph_compute(cgraph, &cplan);
 }
@@ -223,6 +227,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     ctx->work_size           = 0;
     ctx->abort_callback      = NULL;
     ctx->abort_callback_data = NULL;
+    ctx->use_ref             = false;
 
     ggml_backend_t cpu_backend = new ggml_backend {
         /* .guid    = */ ggml_backend_cpu_guid(),
@@ -270,6 +275,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
     ctx->abort_callback_data = abort_callback_data;
 }
 
+void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) { // set the "use reference implementation" flag on a CPU backend
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); // the context cast below is only valid for the CPU backend
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->use_ref = use_ref; // copied into the cplan when a graph plan is created or a graph is computed
+}
+
 // CPU backend - device
 
 struct ggml_backend_cpu_device_context {
@@ -646,6 +658,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
         return (void *)ggml_is_numa;
     }
+    if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) {
+        return (void *)ggml_backend_cpu_set_use_ref;
+    }
 
     // threadpool - TODO:  move to ggml-base
     if (strcmp(name, "ggml_threadpool_new") == 0) {
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 48c8964361..ce15b18ce0 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -8042,12 +8042,14 @@ void ggml_compute_forward_top_k(
     }
 }
 
-// ggml_compute_forward_flash_attn_ext
-
 static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         const ggml_compute_params * params,
         ggml_tensor * dst,
-        int ir0, int ir1) {
+        int ir0, int ir1,
+        int64_t ic_start, int64_t ic_end,
+        float * partials, int64_t partial_stride) {
+
+    const bool write_partials = (partials != nullptr);
     const ggml_tensor * q     = dst->src[0];
     const ggml_tensor * k     = dst->src[1];
     const ggml_tensor * v     = dst->src[2];
@@ -8124,7 +8126,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
 
     int ith = params->ith;
 
-    // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
         // q indices
         const int iq3 = ir/(neq2*neq1);
@@ -8165,7 +8166,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
 
-        for (int64_t ic = 0; ic < nek1; ++ic) {
+        for (int64_t ic = ic_start; ic < ic_end; ++ic) {
             const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
@@ -8238,8 +8239,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
             }
         }
 
-        // sinks
-        if (sinks) {
+        // sinks - apply only on the first kv-chunk
+        if (sinks && ic_start == 0) {
             const float s = ((float *)((char *) sinks->data))[h];
 
             float ms = 1.0f;
@@ -8247,6 +8248,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
 
             if (s > M) {
                 ms = expf(M - s);
+                M = s;
                 ggml_vec_scale_f32(DV, VKQ32, ms);
             } else {
                 vs = expf(s - M);
@@ -8255,20 +8257,26 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
             S = S*ms + vs;
         }
 
-        // V /= S
-        const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
-        ggml_vec_scale_f32(DV, VKQ32, S_inv);
+        if (write_partials) {
+            // Write M, S, VKQ to partials for later reduction
+            // partials layout: [M, S, VKQ[DV]] per query head
+            float * partial = partials + ir * partial_stride;
+            partial[0] = M;
+            partial[1] = S;
+            memcpy(partial + 2, VKQ32, DV * sizeof(float));
+        } else {
+            // V /= S
+            const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
+            ggml_vec_scale_f32(DV, VKQ32, S_inv);
 
-        // dst indices
-        const int i1 = iq1;
-        const int i2 = iq2;
-        const int i3 = iq3;
+            // dst indices
+            const int i1 = iq1;
+            const int i2 = iq2;
+            const int i3 = iq3;
 
-        // original
-        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
-
-        // permute(0, 2, 1, 3)
-        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
+            // permute(0, 2, 1, 3)
+            memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
+        }
     }
 }
 
@@ -8546,6 +8554,78 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
     }
 }
 
+// Reduction step: each thread merges the per-chunk (M, S, VKQ) partials of its query heads, normalizes by S and writes the row to dst
+// Partials layout in wdata: [n_q_heads][n_chunks][2 + DV], stored after nth per-thread scratch regions
+static void ggml_flash_attn_ext_reduce_partials(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const int64_t n_chunks, // number of partial entries stored per query head
+        const int64_t chunk_size) { // KV positions per chunk; chunk i starts at i*chunk_size
+
+    const ggml_tensor * q = dst->src[0];
+    const ggml_tensor * k = dst->src[1];
+    const ggml_tensor * v = dst->src[2];
+
+    const int64_t DK        = k->ne[0];
+    const int64_t DV        = v->ne[0];
+    const int64_t nek1      = k->ne[1]; // used to skip chunks that start past the end of K
+    const int64_t n_q_heads = q->ne[2];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t wdata_per_thread = DK + 2*DV + CACHE_LINE_SIZE_F32; // per-thread scratch stride, in floats
+    float *       thread_wdata     = (float *) params->wdata + ith * wdata_per_thread; // reused below as the VKQ accumulator
+
+    const int64_t partials_offset  = nth * (DK + 2*DV + CACHE_LINE_SIZE_F32); // NOTE(review): must agree with the offset used when the partials were written
+    const int64_t partial_size     = 2 + DV; // M, S, then DV accumulator values
+    const float * partials_base    = (const float *) params->wdata + partials_offset;
+
+    // Output layout (dst dims and row stride used to address the destination row)
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const size_t  nb1 = dst->nb[1];
+
+    // Each thread reduces a subset of query heads (heads ith, ith + nth, ith + 2*nth, ...)
+    for (int64_t q_head = ith; q_head < n_q_heads; q_head += nth) {
+        float   M_final   = -INFINITY; // running maximum over the chunks merged so far
+        float   S_final   = 0.0f;      // running sum, expressed relative to M_final
+        float * VKQ_final = thread_wdata;
+        memset(VKQ_final, 0, DV * sizeof(float));
+
+        // Combine partials from all chunks, rescaling both sides to the common maximum
+        for (int64_t chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
+            const int64_t ic_start = chunk_idx * chunk_size;
+            if (ic_start >= nek1) continue; // chunk covers no KV positions
+
+            const float * partial   = partials_base + (q_head * n_chunks + chunk_idx) * partial_size;
+            const float   M_chunk   = partial[0];
+            const float   S_chunk   = partial[1];
+            const float * VKQ_chunk = partial + 2;
+
+            if (S_chunk == 0.0f) continue; // empty chunk: contributes nothing to the sum
+
+            const float M_new     = fmaxf(M_final, M_chunk);
+            const float scale_old = expf(M_final - M_new); // factor for what has been accumulated so far
+            const float scale_new = expf(M_chunk - M_new); // factor for the incoming chunk
+
+            for (int64_t d = 0; d < DV; ++d) {
+                VKQ_final[d] = VKQ_final[d] * scale_old + VKQ_chunk[d] * scale_new;
+            }
+            S_final = S_final * scale_old + S_chunk * scale_new;
+            M_final = M_new;
+        }
+
+        // Normalize and write to output (skip the division when S_final is zero)
+        if (S_final != 0.0f) {
+            const float S_inv = 1.0f / S_final;
+            ggml_vec_scale_f32(DV, VKQ_final, S_inv);
+        }
+        // iq1=0, iq3=0 for decode
+        memcpy((char *) dst->data + (0*ne2*ne1 + q_head + 0*ne1)*nb1, VKQ_final, nb1);
+    }
+}
+
 static void ggml_compute_forward_flash_attn_ext_f16(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
@@ -8567,6 +8647,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int64_t DV = nev0;
     const int64_t N  = neq1;
 
+
     GGML_ASSERT(ne0 == DV);
     GGML_ASSERT(ne2 == N);
 
@@ -8587,60 +8668,92 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     GGML_ASSERT(nb1 <= nb2);
     GGML_ASSERT(nb2 <= nb3);
 
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int64_t nr = neq1*neq2*neq3;
-
-    // rows per thread
     const int ith = params->ith;
     const int nth = params->nth;
 
-    // disable for NUMA
-    const bool disable_chunking = ggml_is_numa();
+    // When use_ref is set, force the vec-only reference implementation (no tiling, no KV-chunking)
+    const bool use_ref = params->use_ref;
 
-    // 4x chunks per thread
-    int nth_scaled = nth * 4;
-    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
-    int64_t nchunk     = (nr + chunk_size - 1) / chunk_size;
-
-    if (nth == 1 || nchunk < nth || disable_chunking) {
-        nchunk = nth;
-    }
-
-    if (ith == 0) {
-        // Every thread starts at ith, so the first unprocessed chunk is nth.  This save a bit of coordination right at the start.
-        ggml_threadpool_chunk_set(params->threadpool, nth);
-    }
-
-    ggml_barrier(params->threadpool);
-
-    // The number of elements in each chunk
-    const int64_t dr = (nr + nchunk - 1) / nchunk;
-
-    static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
-    static constexpr int64_t Q_TILE_SZ  = ggml_fa_tile_config::Q;
     const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16);
-    const bool use_tiled = (q->type == GGML_TYPE_F32 &&
-                            kv_is_f32_or_f16 &&
-                            k->type == v->type &&
-                            nek1 % KV_TILE_SZ == 0 &&
-                            neq1 >= Q_TILE_SZ);  // Only use tiled for batch >= tile size
+    const bool use_split_kv_path = !use_ref && (neq1 == 1 && neq3 == 1) && kv_is_f32_or_f16 && (k->type == v->type) && q->type == GGML_TYPE_F32 && nek1 >= 512;
 
-    // The first chunk comes from our thread_id, the rest will get auto-assigned.
-    int current_chunk = ith;
+    if (use_split_kv_path) {
+        const int64_t chunk_size = (nek1 + nth - 1) / nth;
 
-    while (current_chunk < nchunk) {
-        const int64_t ir0 = dr * current_chunk;
-        const int64_t ir1 = MIN(ir0 + dr, nr);
+        // Partials buffer layout: [q_head][kv_chunk][M, S, VKQ]
+        const int64_t partial_size  = 2 + DV;
+        float *       partials_base = (float *) params->wdata + nth * (DK + 2*DV + CACHE_LINE_SIZE_F32);
 
-        if (use_tiled) {
-            ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
+        const int64_t ic_start = ith * chunk_size;
+        const int64_t ic_end   = std::min(ic_start + chunk_size, nek1);
+
+        const int64_t partial_stride = nth * partial_size;
+        float *       chunk_partials = partials_base + ith * partial_size;
+
+        if (ic_start < nek1) {
+            for (int64_t q_head = 0; q_head < neq2; q_head++) {
+                ggml_compute_forward_flash_attn_ext_f16_one_chunk(
+                    params, dst, q_head, q_head + 1, ic_start, ic_end,
+                    chunk_partials, partial_stride);
+            }
         } else {
-            ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
+            for (int64_t q_head = 0; q_head < neq2; q_head++) {
+                float * q_partials = chunk_partials + q_head * partial_stride;
+                q_partials[0] = -INFINITY;  // M
+                q_partials[1] = 0.0f;       // S
+            }
         }
 
-        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+        ggml_barrier(params->threadpool);
+        ggml_flash_attn_ext_reduce_partials(params, dst, nth, chunk_size);
+    } else {
+
+        // total rows in q
+        const int64_t nr = neq1*neq2*neq3;
+
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
+
+        // 4x chunks per thread
+        int nth_scaled = nth * 4;
+        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
+        int64_t nchunk     = (nr + chunk_size - 1) / chunk_size;
+
+        if (nth == 1 || nchunk < nth || disable_chunking) {
+            nchunk = nth;
+        }
+
+        if (ith == 0) {
+            ggml_threadpool_chunk_set(params->threadpool, nth);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        const int64_t dr = (nr + nchunk - 1) / nchunk;
+
+        static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
+        static constexpr int64_t Q_TILE_SZ  = ggml_fa_tile_config::Q;
+        const bool use_tiled = !use_ref &&
+                               (q->type == GGML_TYPE_F32 &&
+                                kv_is_f32_or_f16 &&
+                                k->type == v->type &&
+                                nek1 % KV_TILE_SZ == 0 &&
+                                neq1 >= Q_TILE_SZ);
+
+        int current_chunk = ith;
+
+        while (current_chunk < nchunk) {
+            const int64_t ir0 = dr * current_chunk;
+            const int64_t ir1 = MIN(ir0 + dr, nr);
+
+            if (use_tiled) {
+                ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
+            } else {
+                ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1, 0, nek1, nullptr, 0);
+            }
+
+            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
+        }
     }
 }
 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 411467e968..90cc0d7da2 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -8591,6 +8591,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             output_printer->print_operation(info);
             return false;
         }
+        // Use reference implementation on the CPU backend for comparison
+        using ggml_backend_cpu_set_use_ref_t = void (*)(ggml_backend_t, bool);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
+        auto * set_use_ref = (ggml_backend_cpu_set_use_ref_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");
+        if (set_use_ref) {
+            set_use_ref(backend_cpu, true);
+        }
 
         size_t n_ok = 0;
         size_t                   tests_run = 0;

From 07a7412a3b7518a55bf6fe191beb754a1dd2a561 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Mon, 2 Feb 2026 20:59:06 +0100
Subject: [PATCH 16/18] mtmd: add min/max pixels gguf metadata (#19273)

---
 gguf-py/gguf/constants.py   | 2 ++
 gguf-py/gguf/gguf_writer.py | 6 ++++++
 tools/mtmd/clip-impl.h      | 2 ++
 3 files changed, 10 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 31273b2b5a..6f56d36c59 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -284,6 +284,8 @@ class Keys:
     class ClipVision:
         PROJECTOR_TYPE      = "clip.vision.projector_type" # for mixed modality models
         IMAGE_SIZE          = "clip.vision.image_size"
+        IMAGE_MIN_PIXELS    = "clip.vision.image_min_pixels"
+        IMAGE_MAX_PIXELS    = "clip.vision.image_max_pixels"
         PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
         PATCH_SIZE          = "clip.vision.patch_size"
         EMBEDDING_LENGTH    = "clip.vision.embedding_length"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 7fbb78866b..0b9c650161 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1113,6 +1113,12 @@ class GGUFWriter:
     def add_vision_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
 
+    def add_vision_max_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)
+
+    def add_vision_min_pixels(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
+
     def add_vision_preproc_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index dd693623a2..ad232178bf 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -36,6 +36,8 @@
 // vision-specific
 #define KEY_VISION_PROJ_TYPE    "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_IMAGE_MIN_PIXELS    "clip.vision.image_min_pixels"
+#define KEY_IMAGE_MAX_PIXELS    "clip.vision.image_max_pixels"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"

From 0dfcd3b60755bf9cb3c9ec726a584b8a4f20239b Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Mon, 2 Feb 2026 12:00:55 -0800
Subject: [PATCH 17/18] jinja : add missing 'in' test to template engine
 (#19004) (#19239)

* jinja : add missing 'in' test to template engine (#19004)

The jinja template parser was missing the 'in' test from
global_builtins(), causing templates using reject("in", ...),
select("in", ...), or 'x is in(y)' to fail with
"selectattr: unknown test 'in'".

This broke tool-calling for Qwen3-Coder and any other model
whose chat template uses the 'in' test.

Added test_is_in supporting array, string, and object containment
checks, mirroring the existing 'in' operator logic in runtime.cpp.

Includes test cases for all three containment types plus
reject/select filter usage.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* reuse test_is_in in binary op

---------

Co-authored-by: Sid Mohan <sidmohan0@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 common/jinja/runtime.cpp | 34 +++++++++++++--------------
 common/jinja/value.cpp   | 27 ++++++++++++++++++++++
 tests/test-jinja.cpp     | 50 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 93 insertions(+), 18 deletions(-)

diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp
index f234d9284f..4453d86e6d 100644
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -144,6 +144,13 @@ value binary_expression::execute_impl(context & ctx) {
         return false;
     };
 
+    auto test_is_in = [&]() -> bool {
+        func_args args(ctx);
+        args.push_back(left_val);
+        args.push_back(right_val);
+        return global_builtins().at("test_is_in")(args)->as_bool();
+    };
+
     // Handle undefined and null values
     if (is_val<value_undefined>(left_val) || is_val<value_undefined>(right_val)) {
         if (is_val<value_undefined>(right_val) && (op.value == "in" || op.value == "not in")) {
@@ -223,19 +230,11 @@ value binary_expression::execute_impl(context & ctx) {
             return result;
         }
     } else if (is_val<value_array>(right_val)) {
-        auto & arr = right_val->as_array();
-        bool member = false;
-        for (const auto & item : arr) {
-            if (*left_val == *item) {
-                member = true;
-                break;
-            }
-        }
+        // case: 1 in [0, 1, 2]
+        bool member = test_is_in();
         if (op.value == "in") {
-            JJ_DEBUG("Checking membership: %s in Array is %d", left_val->type().c_str(), member);
             return mk_val<value_bool>(member);
         } else if (op.value == "not in") {
-            JJ_DEBUG("Checking non-membership: %s not in Array is %d", left_val->type().c_str(), !member);
             return mk_val<value_bool>(!member);
         }
     }
@@ -252,22 +251,23 @@ value binary_expression::execute_impl(context & ctx) {
 
     // String membership
     if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
-        auto left_str = left_val->as_string().str();
-        auto right_str = right_val->as_string().str();
+        // case: "a" in "abc"
+        bool member = test_is_in();
         if (op.value == "in") {
-            return mk_val<value_bool>(right_str.find(left_str) != std::string::npos);
+            return mk_val<value_bool>(member);
         } else if (op.value == "not in") {
-            return mk_val<value_bool>(right_str.find(left_str) == std::string::npos);
+            return mk_val<value_bool>(!member);
         }
     }
 
     // Value key in object
     if (is_val<value_object>(right_val)) {
-        bool has_key = right_val->has_key(left_val);
+        // case: key in {key: value}
+        bool member = test_is_in();
         if (op.value == "in") {
-            return mk_val<value_bool>(has_key);
+            return mk_val<value_bool>(member);
         } else if (op.value == "not in") {
-            return mk_val<value_bool>(!has_key);
+            return mk_val<value_bool>(!member);
         }
     }
 
diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp
index f254ae9251..2aa156b177 100644
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -393,6 +393,33 @@ const func_builtins & global_builtins() {
         {"test_is_lt", test_compare_fn<value_compare_op::lt>},
         {"test_is_lessthan", test_compare_fn<value_compare_op::lt>},
         {"test_is_ne", test_compare_fn<value_compare_op::ne>},
+        {"test_is_in", [](const func_args & args) -> value { // containment test: is args[0] in args[1]?
+            args.ensure_count(2);
+            auto needle   = args.get_pos(0); // args[0]: value being looked for
+            auto haystack = args.get_pos(1); // args[1]: container being searched
+            if (is_val<value_undefined>(haystack)) {
+                return mk_val<value_bool>(false); // an undefined haystack contains nothing
+            }
+            if (is_val<value_array>(haystack)) {
+                for (const auto & item : haystack->as_array()) { // array: element equality
+                    if (*needle == *item) {
+                        return mk_val<value_bool>(true);
+                    }
+                }
+                return mk_val<value_bool>(false);
+            }
+            if (is_val<value_string>(haystack)) { // string: substring search, needle must be a string too
+                if (!is_val<value_string>(needle)) {
+                    throw raised_exception("'in' test expects args[0] as string when args[1] is string, got args[0] as " + needle->type());
+                }
+                return mk_val<value_bool>(
+                    haystack->as_string().str().find(needle->as_string().str()) != std::string::npos);
+            }
+            if (is_val<value_object>(haystack)) {
+                return mk_val<value_bool>(haystack->has_key(needle)); // object: key lookup
+            }
+            throw raised_exception("'in' test expects iterable as first argument, got " + haystack->type());
+        }},
         {"test_is_test", [](const func_args & args) -> value {
             args.ensure_vals<value_string>();
             auto & builtins = global_builtins();
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index f6114f1e2f..1f25c6ae71 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -189,12 +189,24 @@ static void test_conditionals(testing & t) {
         "negated"
     );
 
-    test_template(t, "in operator",
+    test_template(t, "in operator (element in array)",
         "{% if 'x' in items %}found{% endif %}",
         {{"items", json::array({"x", "y"})}},
         "found"
     );
 
+    test_template(t, "in operator (substring)",
+        "{% if 'bc' in 'abcd' %}found{% endif %}",
+        json::object(),
+        "found"
+    );
+
+    test_template(t, "in operator (object key)",
+        "{% if 'key' in obj %}found{% endif %}",
+        {{"obj", {{"key", 1}, {"other", 2}}}},
+        "found"
+    );
+
     test_template(t, "is defined",
         "{% if x is defined %}yes{% else %}no{% endif %}",
         {{"x", 1}},
@@ -1036,6 +1048,42 @@ static void test_tests(testing & t) {
         json::object(),
         "yes"
     );
+
+    test_template(t, "is in (array, true)",
+        "{{ 'yes' if 2 is in([1, 2, 3]) }}",
+        json::object(),
+        "yes"
+    );
+
+    test_template(t, "is in (array, false)",
+        "{{ 'yes' if 5 is in([1, 2, 3]) else 'no' }}",
+        json::object(),
+        "no"
+    );
+
+    test_template(t, "is in (string)",
+        "{{ 'yes' if 'bc' is in('abcde') }}",
+        json::object(),
+        "yes"
+    );
+
+    test_template(t, "is in (object keys)",
+        "{{ 'yes' if 'a' is in(obj) }}",
+        {{"obj", {{"a", 1}, {"b", 2}}}},
+        "yes"
+    );
+
+    test_template(t, "reject with in test",
+        "{{ items | reject('in', skip) | join(', ') }}",
+        {{"items", json::array({"a", "b", "c", "d"})}, {"skip", json::array({"b", "d"})}},
+        "a, c"
+    );
+
+    test_template(t, "select with in test",
+        "{{ items | select('in', keep) | join(', ') }}",
+        {{"items", json::array({"a", "b", "c", "d"})}, {"keep", json::array({"b", "c"})}},
+        "b, c"
+    );
 }
 
 static void test_string_methods(testing & t) {

From 91ea44e89b30474831d5dc0ad57719a5819506db Mon Sep 17 00:00:00 2001
From: lhez <lih@qti.qualcomm.com>
Date: Mon, 2 Feb 2026 15:54:43 -0800
Subject: [PATCH 18/18] opencl: refactor some ops, concat, repeat, tanh and
 scale (#19226)

* opencl: refactor concat

* opencl: refactor repeat

* opencl: refactor tanh

* opencl: enable fp16 for tanh

* opencl: refactor scale

* opencl: fix unused variables
---
 ggml/src/ggml-opencl/ggml-opencl.cpp   | 484 +++++++++++--------------
 ggml/src/ggml-opencl/kernels/concat.cl | 140 +++----
 ggml/src/ggml-opencl/kernels/repeat.cl |  63 ++--
 ggml/src/ggml-opencl/kernels/scale.cl  |  18 +-
 ggml/src/ggml-opencl/kernels/tanh.cl   | 142 +++++---
 5 files changed, 400 insertions(+), 447 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 0f0eb3a9d8..508b2b8f03 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -453,7 +453,6 @@ struct ggml_backend_opencl_context {
     cl_program program_rms_norm;
     cl_program program_group_norm;
     cl_program program_rope;
-    cl_program program_scale;
     cl_program program_silu;
     cl_program program_sigmoid;
     cl_program program_softmax_f32;
@@ -462,11 +461,8 @@ struct ggml_backend_opencl_context {
     cl_program program_softmax_4_f16;
     cl_program program_argsort_f32_i32;
     cl_program program_sum_rows_f32;
-    cl_program program_repeat;
     cl_program program_pad;
-    cl_program program_tanh;
     cl_program program_upscale;
-    cl_program program_concat;
     cl_program program_conv_2d_f16;
     cl_program program_conv_2d_f32;
     cl_program program_conv_2d_f16_f32;
@@ -485,7 +481,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
     cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
     cl_kernel kernel_add_id;
-    cl_kernel kernel_scale;
+    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
     cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
     cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
     cl_kernel kernel_mean_f32;
@@ -544,18 +540,17 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_im2col_f32, kernel_im2col_f16;
     cl_kernel kernel_argsort_f32_i32;
     cl_kernel kernel_sum_rows_f32;
-    cl_kernel kernel_repeat;
+    cl_kernel kernel_repeat_f32;
     cl_kernel kernel_pad;
-    cl_kernel kernel_tanh_f32_nd;
-    cl_kernel kernel_tanh_f16_nd;
+    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
+    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
     cl_kernel kernel_expm1_f32_nd;
     cl_kernel kernel_expm1_f16_nd;
     cl_kernel kernel_softplus_f32_nd;
     cl_kernel kernel_softplus_f16_nd;
     cl_kernel kernel_upscale;
     cl_kernel kernel_upscale_bilinear;
-    cl_kernel kernel_concat_f32_contiguous;
-    cl_kernel kernel_concat_f32_non_contiguous;
+    cl_kernel kernel_concat_f32;
     cl_kernel kernel_conv_2d_f16;
     cl_kernel kernel_conv_2d_f32;
     cl_kernel kernel_conv_2d_f16_f32;
@@ -1483,10 +1478,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("scale.cl");
 #endif
-        backend_ctx->program_scale =
+        cl_program prog =
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 
-        CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
+        CL_CHECK((backend_ctx->kernel_scale_f32   = clCreateKernel(prog, "kernel_scale_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
         GGML_LOG_CONT(".");
     }
 
@@ -1814,16 +1811,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("repeat.cl");
 #endif
-        if (!kernel_src.empty()) {
-            backend_ctx->program_repeat =
-                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-            CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
-            backend_ctx->program_repeat = nullptr;
-            backend_ctx->kernel_repeat = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }
 
     // pad
@@ -1856,18 +1848,16 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
 #else
         const std::string kernel_src = read_file("tanh.cl");
 #endif
-        if (!kernel_src.empty()) {
-            backend_ctx->program_tanh =
-                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-            CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
-            CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
-            backend_ctx->program_tanh = nullptr;
-            backend_ctx->kernel_tanh_f32_nd = nullptr;
-            backend_ctx->kernel_tanh_f16_nd = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_tanh_f32    = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f32_4  = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16    = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16_4  = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
+        CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }
 
     // expm1
@@ -1959,22 +1949,13 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
             #include "concat.cl.h"
         };
 #else
-
         const std::string kernel_src = read_file("concat.cl");
 #endif
-        if (!kernel_src.empty()) {
-            backend_ctx->program_concat =
-                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
-
-            CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
-            CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
-            GGML_LOG_CONT(".");
-        } else {
-            GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
-            backend_ctx->program_concat = nullptr;
-            backend_ctx->kernel_concat_f32_contiguous = nullptr;
-            backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
-        }
+        cl_program prog =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
     }
 
     // timestep_embedding
@@ -3318,8 +3299,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 case GGML_UNARY_OP_SIGMOID:
                     return ggml_is_contiguous(op->src[0]);
                 case GGML_UNARY_OP_TANH:
-                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
-                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
                 case GGML_UNARY_OP_EXPM1:
                    return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
                           (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
@@ -7029,79 +7009,87 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
-    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
-    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb0  = dst->nb[0];
+    const cl_ulong nb1  = dst->nb[1];
+    const cl_ulong nb2  = dst->nb[2];
+    const cl_ulong nb3  = dst->nb[3];
 
     cl_kernel kernel;
-    if (dst->type == GGML_TYPE_F32) {
-        kernel = backend_ctx->kernel_tanh_f32_nd;
-    } else if (dst->type == GGML_TYPE_F16) {
-        kernel = backend_ctx->kernel_tanh_f16_nd;
-    } else {
-        GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
-    }
-    GGML_ASSERT(kernel != nullptr);
 
-    const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
-    const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
-
-    const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
-    const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
-
-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
-
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
-
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
-
-    size_t global_work_size[3];
-    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
-        return;
-    }
-    global_work_size[0] = (size_t)ne10;
-    global_work_size[1] = (size_t)ne11;
-    global_work_size[2] = (size_t)ne12;
-
-    size_t lws0 = 16, lws1 = 4, lws2 = 1;
-    if (ne10 < 16) lws0 = ne10;
-    if (ne11 < 4) lws1 = ne11;
-    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
-
-    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
-    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
-
-
-    size_t local_work_size[] = {lws0, lws1, lws2};
-
-    size_t* local_work_size_ptr = local_work_size;
-    if (!backend_ctx->non_uniform_workgroups) {
-        if (global_work_size[0] % local_work_size[0] != 0 ||
-            global_work_size[1] % local_work_size[1] != 0 ||
-            global_work_size[2] % local_work_size[2] != 0) {
-            local_work_size_ptr = NULL;
+    if (ggml_is_contiguous(src0)) {
+        // Handle contiguous input
+        int n = ggml_nelements(dst);
+        if (n % 4 == 0) {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_tanh_f32_4;
+            } else {
+                kernel = backend_ctx->kernel_tanh_f16_4;
+            }
+            n /= 4;
+        } else {
+            if (src0->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_tanh_f32;
+            } else {
+                kernel = backend_ctx->kernel_tanh_f16;
+            }
         }
-    }
-    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
 
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        size_t * local_work_size_ptr = local_work_size;
+        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+            local_work_size_ptr = nullptr;
+        }
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    } else {
+        // Handle non-contiguous input
+        if (src0->type == GGML_TYPE_F32) {
+            kernel = backend_ctx->kernel_tanh_f32_nc;
+        } else {
+            kernel = backend_ctx->kernel_tanh_f16_nc;
+        }
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
+
+        int nth = 64;
+
+        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
 }
 
 static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7319,53 +7307,58 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 
-    if (backend_ctx->kernel_repeat == nullptr) {
-        GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
-        return;
-    }
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad  = (ggml_tensor_extra_cl *)dst->extra;
 
-    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
-    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd  = extrad->offset + dst->view_offs;
 
-    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
-    cl_ulong off_dst  = extra_dst->offset  + dst->view_offs;
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
 
-    const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
-    const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
 
-    const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
-    const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
 
-    cl_kernel kernel = backend_ctx->kernel_repeat;
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
 
-    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),    &extra_dst->data_device));
-    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong),  &off_src0));
-    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst));
-    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &src0_ne0));
-    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),       &src0_ne1));
-    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &src0_ne2));
-    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),       &src0_ne3));
-    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong),  &src0_nb0));
-    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong),  &src0_nb1));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
-    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &dst_ne0));
-    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &dst_ne1));
-    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &dst_ne2));
-    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &dst_ne3));
-    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
-    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
-    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
-    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
+    cl_kernel kernel = backend_ctx->kernel_repeat_f32;
 
-    size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
-    size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
-    size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
 
-    size_t global_work_size[] = { gws0, gws1, gws2 };
+    int nth = 64;
 
-    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
+    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -7589,121 +7582,76 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
-    if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
-        GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
-        return;
-    }
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 
-    ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
-    ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
-    ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd  = extrad->offset + dst->view_offs;
 
-    cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
-    cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
-    cl_ulong off_dst  = extrad_cl->offset + dst->view_offs;
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
 
-    const int32_t dim = ((const int32_t *) dst->op_params)[0];
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    const cl_int dim = ((const int32_t *) dst->op_params)[0];
     GGML_ASSERT(dim >= 0 && dim <= 3);
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
-        if (dim == 3) {
+    int nth = MIN(64, ne0);
 
-            size_t nbytes_src0 = ggml_nbytes(src0);
-            size_t nbytes_src1 = ggml_nbytes(src1);
+    cl_kernel kernel = backend_ctx->kernel_concat_f32;
 
-            CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
-                                         off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
-            CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
-                                         off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
-        } else {
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int),   &dim));
 
-            cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
-            size_t global_work_size[3];
+    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
+    size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-            for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
-                cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
-                cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
-                cl_ulong current_off_dst  = off_dst  + (i3 * dst->nb[3]);
-
-                int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
-                int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
-                int d_ne0  = dst->ne[0];  int d_ne1  = dst->ne[1];  int d_ne2  = dst->ne[2];
-
-                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0_cl->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &current_off_src0));
-                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra1_cl->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &current_off_src1));
-                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),    &extrad_cl->data_device));
-                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &current_off_dst));
-                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &d_ne00));
-                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),       &d_ne01));
-                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &d_ne02));
-                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &d_ne10));
-                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &d_ne11));
-                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &d_ne12));
-                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &d_ne0));
-                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &d_ne1));
-                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &d_ne2));
-                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &dim));
-
-                global_work_size[0] = d_ne0;
-                global_work_size[1] = d_ne1;
-                global_work_size[2] = d_ne2;
-
-                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
-            }
-        }
-    } else {
-        cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
-
-        cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
-        cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
-
-        cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
-
-        cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
-        cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
-
-
-        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0));
-        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra1_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_src1));
-        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),    &extrad_cl->data_device));
-        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &off_dst));
-
-        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long),      &ne00));
-        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long),      &ne01));
-        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long),      &ne02));
-        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long),      &ne03));
-        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),    &nb00));
-        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),    &nb01));
-        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),    &nb02));
-        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong),    &nb03));
-
-        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong),    &nb10));
-        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong),    &nb11));
-        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),    &nb12));
-        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),    &nb13));
-
-        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long),     &d_ne0));
-        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long),     &d_ne1));
-        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long),     &d_ne2));
-        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long),     &d_ne3));
-        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong),    &d_nb0));
-        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong),    &d_nb1));
-        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong),    &d_nb2));
-        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong),    &d_nb3));
-        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &dim));
-
-        size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
-                                         d_ne2 > 0 ? (size_t)d_ne2 : 1,
-                                         d_ne3 > 0 ? (size_t)d_ne3 : 1 };
-
-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
-    }
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -8394,6 +8342,7 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
     CL_CHECK(clReleaseMemObject(D_sub_buffer));
     CL_CHECK(clReleaseMemObject(D_image1d));
 #else
+    GGML_UNUSED(backend);
     GGML_UNUSED(src0);
     GGML_UNUSED(src1);
     GGML_UNUSED(dst);
@@ -9913,7 +9862,16 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     cl_ulong offset0 = extra0->offset + src0->view_offs;
     cl_ulong offsetd = extrad->offset + dst->view_offs;
 
-    cl_kernel kernel = backend_ctx->kernel_scale;
+    cl_kernel kernel;
+
+    int n = ggml_nelements(dst);
+
+    if (n % 4 == 0) {
+        kernel = backend_ctx->kernel_scale_f32_4;
+        n /= 4;
+    } else {
+        kernel = backend_ctx->kernel_scale_f32;
+    }
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
@@ -9922,8 +9880,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &scale));
     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &bias));
 
-    int n = ggml_nelements(dst)/4;
-
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
diff --git a/ggml/src/ggml-opencl/kernels/concat.cl b/ggml/src/ggml-opencl/kernels/concat.cl
index 132758469c..0c1b3d785c 100644
--- a/ggml/src/ggml-opencl/kernels/concat.cl
+++ b/ggml/src/ggml-opencl/kernels/concat.cl
@@ -1,109 +1,51 @@
-kernel void kernel_concat_f32_contiguous(
-    global const char * p_src0, ulong off_src0,
-    global const char * p_src1, ulong off_src1,
-    global char * p_dst, ulong off_dst,
-    int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
-    int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
-    int d_ne0,  int d_ne1,  int d_ne2,  // dst->ne[0..2] for the slice
-    int dim
+kernel void kernel_concat_f32(
+    global  const char * src0,
+    ulong                offset0,
+    global  const char * src1,
+    ulong                offset1,
+    global        char * dst,
+    ulong                offsetd,
+    int             ne00,
+    int             ne01,
+    int             ne02,
+    int             ne03,
+    ulong           nb00,
+    ulong           nb01,
+    ulong           nb02,
+    ulong           nb03,
+    ulong           nb10,
+    ulong           nb11,
+    ulong           nb12,
+    ulong           nb13,
+    int             ne0,
+    ulong           nb0,
+    ulong           nb1,
+    ulong           nb2,
+    ulong           nb3,
+    int             dim
 ) {
-    global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
-    global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
-    global float * dst        = (global float*)((global char*)p_dst + off_dst);
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
 
-    int i0 = get_global_id(0); // Index along dst's 0th dimension
-    int i1 = get_global_id(1); // Index along dst's 1st dimension
-    int i2 = get_global_id(2); // Index along dst's 2nd dimension
+    const int i3 = get_group_id(2);
+    const int i2 = get_group_id(1);
+    const int i1 = get_group_id(0);
 
-    if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
-        return;
-    }
+    int o[4] = {0, 0, 0, 0};
+    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
 
-    ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
-    ulong src_idx;
+    global const float * x;
 
-    if (dim == 0) {
-        if (i0 < d_ne00) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
-            dst[dst_idx] = src1[src_idx];
-        }
-    } else if (dim == 1) {
-        if (i1 < d_ne01) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-            src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
-            dst[dst_idx] = src1[src_idx];
-        }
-    } else if (dim == 2) {
-        if (i2 < d_ne02) { // Data from src0
-            src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
-            dst[dst_idx] = src0[src_idx];
-        } else { // Data from src1
-
-            src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
-            dst[dst_idx] = src1[src_idx];
-        }
-    }
-}
-
-kernel void kernel_concat_f32_non_contiguous(
-    global const char * p_src0, ulong off_src0,
-    global const char * p_src1, ulong off_src1,
-    global char * p_dst, ulong off_dst,
-
-    long ne00, long ne01, long ne02, long ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
-
-    long d_ne0, long d_ne1, long d_ne2, long d_ne3,
-    ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
-    int dim
-) {
-    global const char * src0_base = p_src0 + off_src0;
-    global const char * src1_base = p_src1 + off_src1;
-    global char * dst_base        = p_dst + off_dst;
-
-    long current_i1 = get_global_id(0); // Index for dst_dim_1
-    long current_i2 = get_global_id(1); // Index for dst_dim_2
-    long current_i3 = get_global_id(2); // Index for dst_dim_3
-
-    if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
-        return;
-    }
-
-    global const float * x_val_ptr;
-    global float * y_val_ptr;
-
-    for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
-        bool use_src0;
-        long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
-
-        if (dim == 0) {
-            use_src0 = (current_i0 < ne00);
-            if (!use_src0) { s_i0 = current_i0 - ne00; }
-        } else if (dim == 1) {
-            use_src0 = (current_i1 < ne01);
-            if (!use_src0) { s_i1 = current_i1 - ne01; }
-        } else if (dim == 2) {
-            use_src0 = (current_i2 < ne02);
-            if (!use_src0) { s_i2 = current_i2 - ne02; }
-        } else { // dim == 3
-            use_src0 = (current_i3 < ne03);
-            if (!use_src0) { s_i3 = current_i3 - ne03; }
-        }
-
-        if (use_src0) {
-            x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+            x = (global const float *)(src0 + (i3       )*nb03 + (i2       )*nb02 + (i1       )*nb01 + (i0       )*nb00);
         } else {
-            x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
+            x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
         }
 
-        y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
-        *y_val_ptr = *x_val_ptr;
+        global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+        *y = *x;
     }
 }
diff --git a/ggml/src/ggml-opencl/kernels/repeat.cl b/ggml/src/ggml-opencl/kernels/repeat.cl
index 079498f5ab..53951a5543 100644
--- a/ggml/src/ggml-opencl/kernels/repeat.cl
+++ b/ggml/src/ggml-opencl/kernels/repeat.cl
@@ -1,39 +1,38 @@
-kernel void kernel_repeat(
-    global const char * src0_data_in,
-    global       char * dst_data_in,
-    ulong src0_offset,
-    ulong dst_offset,
-    int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
-    ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
-    int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
-    ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
+kernel void kernel_repeat_f32( // tiles src0 into dst: each dst index is wrapped (modulo) into src0's extents
+        global const char * src0,
+        ulong               offset0, // byte offset added to src0 before use
+        global       char * dst,
+        ulong               offsetd, // byte offset added to dst before use
+        int     ne00, // ne00..ne03: src0 extents, used as the modulus for the dst indices
+        int     ne01,
+        int     ne02,
+        int     ne03,
+        ulong   nb00, // nb00..nb03: src0 strides in bytes (added to a char pointer)
+        ulong   nb01,
+        ulong   nb02,
+        ulong   nb03,
+        int     ne0, // dst extent along dim 0; bounds the per-row loop
+        ulong   nb0, // nb0..nb3: dst strides in bytes
+        ulong   nb1,
+        ulong   nb2,
+        ulong   nb3
 ) {
-    global const char * src0_data = src0_data_in + src0_offset;
-    global       char * dst_data  = dst_data_in + dst_offset;
+    src0 = src0 + offset0;
+    dst  = dst  + offsetd;
 
-    const int d3 = get_global_id(2);
-    const int d2 = get_global_id(1);
-    const int d1 = get_global_id(0);
+    const int i3 = get_group_id(2); // dst indices for dims 3..1 are taken from the work-group id
+    const int i2 = get_group_id(1); // NOTE(review): no bounds check here - presumably the host launches one group per dst row; confirm
+    const int i1 = get_group_id(0);
 
-    if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
-        return;
-    }
+    const int i03 = i3%ne03; // wrap the dst indices into src0's extents
+    const int i02 = i2%ne02;
+    const int i01 = i1%ne01;
 
-    const int s3 = d3 % src0_ne3;
-    const int s2 = d2 % src0_ne2;
-    const int s1 = d1 % src0_ne1;
+    global const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; // base of the source row
+    global       char * dst_ptr  = dst  +  i3*nb3  +  i2*nb2  +  i1*nb1; // base of the destination row
 
-    const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
-    global char * p_dst_slice  = dst_data  + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;
-
-    for (int d0 = 0; d0 < dst_ne0; ++d0) {
-        // Determine source index for dimension 0 based on tiling/broadcasting.
-        const int s0 = d0 % src0_ne0;
-
-        const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
-        global char * restrict current_dst_el_ptr  = p_dst_slice  + (ulong)d0*dst_nb0;
-        for (int k = 0; k < src0_nb0; ++k) {
-            current_dst_el_ptr[k] = current_src_el_ptr[k];
-        }
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) { // work-items of the group step through dim 0 by the local size
+        const int i00 = i0%ne00; // wrap the dim-0 index as well
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i00*nb00)); // each element is copied as one float
     }
 }
diff --git a/ggml/src/ggml-opencl/kernels/scale.cl b/ggml/src/ggml-opencl/kernels/scale.cl
index aeca8a456e..17ed97f0d6 100644
--- a/ggml/src/ggml-opencl/kernels/scale.cl
+++ b/ggml/src/ggml-opencl/kernels/scale.cl
@@ -1,9 +1,19 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-//------------------------------------------------------------------------------
-// scale
-//------------------------------------------------------------------------------
-kernel void kernel_scale(
+kernel void kernel_scale_f32( // per work-item: dst[gid] = src0[gid] * scale + bias
+        global float * src0,
+        ulong offset0, // byte offset applied to src0
+        global float * dst,
+        ulong offsetd, // byte offset applied to dst
+        float scale,
+        float bias
+) {
+    src0 = (global float*)((global char*)src0 + offset0); // go through char* so the offset is counted in bytes
+    dst = (global float*)((global char*)dst + offsetd);
+    dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias; // NOTE(review): no bounds check - presumably global size equals the element count; confirm in host code
+}
+
+kernel void kernel_scale_f32_4(
         global float4 * src0,
         ulong offset0,
         global float4 * dst,
diff --git a/ggml/src/ggml-opencl/kernels/tanh.cl b/ggml/src/ggml-opencl/kernels/tanh.cl
index d9da86b148..2c4887ad3e 100644
--- a/ggml/src/ggml-opencl/kernels/tanh.cl
+++ b/ggml/src/ggml-opencl/kernels/tanh.cl
@@ -1,63 +1,109 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-kernel void kernel_tanh_f32_nd(
-    global void * p_src0_base, ulong off_src0_abs,
-    global void * p_dst_base,  ulong off_dst_abs,
-    int ne00, int ne01, int ne02, int ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-    int ne10, int ne11, int ne12, int ne13,
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13
+kernel void kernel_tanh_f32( // per work-item: dst[gid] = tanh(src0[gid]) on float elements
+        global const float * src0,
+        ulong                offset0, // byte offset applied to src0
+        global       float * dst,
+        ulong                offsetd // byte offset applied to dst
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = (global float*)((global char*)src0 + offset0); // go through char* so the offset is counted in bytes
+    dst  = (global float*)((global char*)dst + offsetd);
 
-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]); // indexed by the flat global id; no strides and no bounds check
+}
 
-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+kernel void kernel_tanh_f32_4( // per work-item: dst[gid] = tanh(src0[gid]) on float4 elements
+        global const float4 * src0,
+        ulong                 offset0, // byte offset applied to src0
+        global       float4 * dst,
+        ulong                 offsetd // byte offset applied to dst
+) {
+    src0 = (global float4*)((global char*)src0 + offset0); // go through char* so the offset is counted in bytes
+    dst  = (global float4*)((global char*)dst + offsetd);
 
-            *dst_val_ptr = tanh(*src_val_ptr);
-        }
+    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]); // gid counts float4 vectors; NOTE(review): presumably the host dispatches n/4 work-items - confirm
+}
+
+kernel void kernel_tanh_f16( // per work-item: dst[gid] = tanh(src0[gid]) on half elements
+        global const half * src0,
+        ulong               offset0, // byte offset applied to src0
+        global       half * dst,
+        ulong               offsetd // byte offset applied to dst
+) {
+    src0 = (global half*)((global char*)src0 + offset0); // go through char* so the offset is counted in bytes
+    dst  = (global half*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]); // half math; this file enables cl_khr_fp16 at the top
+}
+
+kernel void kernel_tanh_f16_4( // per work-item: dst[gid] = tanh(src0[gid]) on half4 elements
+        global const half4 * src0,
+        ulong                offset0, // byte offset applied to src0
+        global       half4 * dst,
+        ulong                offsetd // byte offset applied to dst
+) {
+    src0 = (global half4*)((global char*)src0 + offset0); // go through char* so the offset is counted in bytes
+    dst  = (global half4*)((global char*)dst + offsetd);
+
+    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]); // gid counts half4 vectors; this file enables cl_khr_fp16 at the top
+}
+
+kernel void kernel_tanh_f32_nc( // strided tanh on floats; "nc" presumably means non-contiguous - TODO confirm
+        global const char * src0,
+        ulong               offset0, // byte offset added to src0
+        global       char * dst,
+        ulong               offsetd, // byte offset added to dst
+        int   ne00, // extent along dim 0; bounds the per-row loop
+        ulong nb00, // nb00..nb03: src0 strides in bytes
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        ulong nb0, // nb0..nb3: dst strides in bytes
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;
+
+    const int i3 = get_group_id(2); // indices for dims 3..1 are taken from the work-group id
+    const int i2 = get_group_id(1); // NOTE(review): no bounds check - presumably one group is launched per row; confirm in host code
+    const int i1 = get_group_id(0);
+
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) { // work-items of the group step through dim 0 by the local size
+        global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); // source element via src0 byte strides
+        global       float * y = (global       float *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0); // destination element via dst byte strides
+
+        *y = tanh(*x);
     }
 }
 
-kernel void kernel_tanh_f16_nd(
-    global void * p_src0_base, ulong off_src0_abs,
-    global void * p_dst_base,  ulong off_dst_abs,
-    int ne00, int ne01, int ne02, int ne03,
-    ulong nb00, ulong nb01, ulong nb02, ulong nb03,
-    int ne10, int ne11, int ne12, int ne13,
-    ulong nb10, ulong nb11, ulong nb12, ulong nb13
+kernel void kernel_tanh_f16_nc( // strided tanh on halfs; "nc" presumably means non-contiguous - TODO confirm
+        global const char * src0,
+        ulong               offset0, // byte offset added to src0
+        global       char * dst,
+        ulong               offsetd, // byte offset added to dst
+        int   ne00, // extent along dim 0; bounds the per-row loop
+        ulong nb00, // nb00..nb03: src0 strides in bytes
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        ulong nb0, // nb0..nb3: dst strides in bytes
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
 ) {
-    int i0 = get_global_id(0);
-    int i1 = get_global_id(1);
-    int i2 = get_global_id(2);
+    src0 = src0 + offset0;
+    dst  = dst + offsetd;
 
-    if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
-        for (int i3 = 0; i3 < ne13; ++i3) {
-            ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
-            global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
+    const int i3 = get_group_id(2); // indices for dims 3..1 are taken from the work-group id
+    const int i2 = get_group_id(1); // NOTE(review): no bounds check - presumably one group is launched per row; confirm in host code
+    const int i1 = get_group_id(0);
 
-            ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
-            global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
+    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) { // work-items of the group step through dim 0 by the local size
+        global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); // source element via src0 byte strides
+        global       half * y = (global       half *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0); // destination element via dst byte strides
 
-            *dst_val_ptr = tanh(*src_val_ptr);
-        }
+        *y = tanh(*x); // half math; this file enables cl_khr_fp16 at the top
     }
 }