From 25f40ca65f1aa596f8b1702bbac4bc48a45b87d7 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 4 Feb 2026 05:43:28 +0100 Subject: [PATCH 01/20] completion : simplify batch (embd) processing (#19286) * completion : simplify batch (embd) processing This commit simplifies the processing of embd by removing the for loop that currently exists which uses params.n_batch as its increment. This commit also removes the clamping of n_eval as the size of embd is always at most the size of params.n_batch. The motivation is to clarify the code as it is currently a little confusing when looking at this for loop in isolation and thinking that it can process multiple batches. * add an assert to verify n_eval is not greater than n_batch --- tools/completion/completion.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index f368a2f4c6..977132756f 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -674,15 +674,12 @@ int main(int argc, char ** argv) { } } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { - int n_eval = (int) embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - + if (!embd.empty()) { + int n_eval = (int) embd.size(); LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + GGML_ASSERT(n_eval <= params.n_batch); + if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -743,7 +740,7 @@ int main(int argc, char ** argv) { common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; - if ((int) embd.size() >= params.n_batch) { + if ((int) embd.size() == params.n_batch) { break; } } From d838c22bb33d6b486a73c0ff285346deb575be3f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Feb 2026 10:39:53 +0200 Subject: [PATCH 02/20] spec : fix the check-rate logic of ngram-simple (#19261) * spec : fix the check-rate logic of ngram-simple * cont : refactor + fix checks --- common/ngram-map.cpp | 15 +++------------ common/ngram-map.h | 16 +--------------- common/speculative.cpp | 20 ++++++++++++++------ 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index cab231bad7..c5b8fc75ed 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -47,21 +47,15 @@ static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, * @return Vector of draft tokens, empty if no matching pattern is found */ llama_tokens common_ngram_simple_draft( - common_ngram_simple_state & state, + const common_ngram_simple_config & config, const llama_tokens & tokens, llama_token sampled) { // Simple implementation of self-speculative decoding without a draft model. // const size_t cur_len = tokens.size(); - // Only check every check_rate tokens to save compute - // i.e., perform check if (cur_len - idx_last_check) >= check_rate - if (state.idx_last_check + state.config.check_rate > cur_len) { - llama_tokens draft_tokens; - return draft_tokens; - } - size_t n_draft_min = state.config.size_ngram; // size of n-gram to lookup in token history - size_t n_draft_max = state.config.size_mgram; // the m-gram following the found n-gram is used for draft + const size_t n_draft_min = config.size_ngram; // size of n-gram to lookup in token history + const size_t n_draft_max = config.size_mgram; // the m-gram following the found n-gram is used for draft // vector for tokens we want to verify. // return empty vector if there is no match. @@ -80,9 +74,6 @@ llama_tokens common_ngram_simple_draft( } pattern.push_back(sampled); // add the last token to the pattern - // We do a search in the token history. - state.idx_last_check = cur_len; - size_t match_pos = 0; // we ignore position 0, position 0 == no match // search backwards, but skip the current match (we are currently there) for (size_t j = cur_len - n_draft_min - 1; j > 0; --j) { diff --git a/common/ngram-map.h b/common/ngram-map.h index c094d513d5..9668bd5a7c 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -27,23 +27,9 @@ struct common_ngram_simple_config { uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token }; -// current state (and config) of n-gram simple. -struct common_ngram_simple_state { - common_ngram_simple_config config; - - size_t idx_last_check = 0; // index of last check in context history (mutable) - - common_ngram_simple_state(const common_ngram_simple_config & config) - : config(config) {} -}; - // Searches for a n-gram in the history and checks whether a draft sequence should be generated. -// state: the ngram simple state to search in. -// inp: the tokens generated so far. -// sampled: the token that was just sampled. -// draft: vector to store the draft tokens, initially empty. llama_tokens common_ngram_simple_draft( - common_ngram_simple_state & state, + const common_ngram_simple_config & config, const llama_tokens & tokens, llama_token sampled); diff --git a/common/speculative.cpp b/common/speculative.cpp index 80cd31e35f..c99b19dbfd 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -463,12 +463,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state { // state of self-speculation (simple implementation, not ngram-map) struct common_speculative_state_ngram_simple : public common_speculative_state { - common_ngram_simple_state state; + common_ngram_simple_config config; + + uint16_t check_id = 0; // used to control the frequency of generating drafts common_speculative_state_ngram_simple( enum common_speculative_type type, - common_ngram_simple_state state) - : common_speculative_state(type), state(state) {} + common_ngram_simple_config config) + : common_speculative_state(type), config(config) {} void begin(const llama_tokens & prompt) override { GGML_UNUSED(prompt); @@ -479,7 +481,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_state { const llama_tokens & prompt_tgt, llama_token id_last, llama_tokens & result) override { - result = common_ngram_simple_draft(state, prompt_tgt, id_last); + ++check_id; + if (check_id < config.check_rate) { + return; + } + check_id = 0; + + result = common_ngram_simple_draft(config, prompt_tgt, id_last); GGML_UNUSED(params); } @@ -889,14 +897,14 @@ common_speculative * common_speculative_init( uint16_t mgram_size_value = ngram_map.size_value; uint16_t check_rate = ngram_map.check_rate; - auto config_simple = common_ngram_simple_config{ + auto config_simple = common_ngram_simple_config { /* .size_ngram = */ ngram_size_key, /* .size_mgram = */ mgram_size_value, /* .check_rate = */ check_rate }; auto state = std::make_unique( /* .type = */ config.type, - /* .state = */ common_ngram_simple_state(config_simple) + /* .state = */ config_simple ); impls.push_back(std::move(state)); break; From 6ab881b7c3abd7c1aac0d7f3a00bde68d9cfd484 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 4 Feb 2026 10:40:53 +0100 Subject: [PATCH 03/20] model-conversion : add tensor-info.py utility (#18954) This commit adds a new python script that can be used to print tensors information from a tensor in a safetensors model. The motivation for this is that during model conversion work it can sometimes be useful to verify the shape of tensors in the original model. While it is possible to print the tensors when loading the model this can be slow when working with larger models. With this script it is possible to quickly query tensor shapes. Example usage: ```console (venv) $ ./scripts/utils/tensor-info.py --help usage: tensor-info.py [-h] [-m MODEL_PATH] [-l] [tensor_name] Print tensor information from a safetensors model positional arguments: tensor_name Name of the tensor to inspect options: -h, --help show this help message and exit -m MODEL_PATH, --model-path MODEL_PATH Path to the model directory (default: MODEL_PATH environment variable) -l, --list List unique tensor patterns in the model (layer numbers replaced with #) ``` Listing tensor names: ```console (venv) $ ./scripts/utils/tensor-info.py -m ~/work/ai/models/google/embeddinggemma-300m -l embed_tokens.weight layers.#.input_layernorm.weight layers.#.mlp.down_proj.weight layers.#.mlp.gate_proj.weight layers.#.mlp.up_proj.weight layers.#.post_attention_layernorm.weight layers.#.post_feedforward_layernorm.weight layers.#.pre_feedforward_layernorm.weight layers.#.self_attn.k_norm.weight layers.#.self_attn.k_proj.weight layers.#.self_attn.o_proj.weight layers.#.self_attn.q_norm.weight layers.#.self_attn.q_proj.weight layers.#.self_attn.v_proj.weight norm.weight ``` Printing a specific tensor's information: ```console (venv) $ ./scripts/utils/tensor-info.py -m ~/work/ai/models/google/embeddinggemma-300m layers.0.input_layernorm.weight Tensor: layers.0.input_layernorm.weight File: model.safetensors Shape: [768] ``` --- .../scripts/utils/tensor-info.py | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100755 examples/model-conversion/scripts/utils/tensor-info.py diff --git a/examples/model-conversion/scripts/utils/tensor-info.py b/examples/model-conversion/scripts/utils/tensor-info.py new file mode 100755 index 0000000000..12a3430b49 --- /dev/null +++ b/examples/model-conversion/scripts/utils/tensor-info.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import re +import sys +from pathlib import Path +from typing import Optional +from safetensors import safe_open + + +MODEL_SAFETENSORS_FILE = "model.safetensors" +MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json" + + +def get_weight_map(model_path: Path) -> Optional[dict[str, str]]: + index_file = model_path / MODEL_SAFETENSORS_INDEX + + if index_file.exists(): + with open(index_file, 'r') as f: + index = json.load(f) + return index.get("weight_map", {}) + + return None + + +def get_all_tensor_names(model_path: Path) -> list[str]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + return list(weight_map.keys()) + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + try: + with safe_open(single_file, framework="pt", device="cpu") as f: + return list(f.keys()) + except Exception as e: + print(f"Error reading {single_file}: {e}") + sys.exit(1) + + print(f"Error: No safetensors files found in {model_path}") + sys.exit(1) + + +def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + return weight_map.get(tensor_name) + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + return single_file.name + + return None + + +def normalize_tensor_name(tensor_name: str) -> str: + normalized = re.sub(r'\.\d+\.', '.#.', tensor_name) + normalized = re.sub(r'\.\d+$', '.#', normalized) + return normalized + + +def list_all_tensors(model_path: Path, unique: bool = False): + tensor_names = get_all_tensor_names(model_path) + + if unique: + seen = set() + for tensor_name in sorted(tensor_names): + normalized = normalize_tensor_name(tensor_name) + if normalized not in seen: + seen.add(normalized) + print(normalized) + else: + for tensor_name in sorted(tensor_names): + print(tensor_name) + + +def print_tensor_info(model_path: Path, tensor_name: str): + tensor_file = find_tensor_file(model_path, tensor_name) + + if tensor_file is None: + print(f"Error: Could not find tensor '{tensor_name}' in model index") + print(f"Model path: {model_path}") + sys.exit(1) + + file_path = model_path / tensor_file + + try: + with safe_open(file_path, framework="pt", device="cpu") as f: + if tensor_name in f.keys(): + tensor_slice = f.get_slice(tensor_name) + shape = tensor_slice.get_shape() + print(f"Tensor: {tensor_name}") + print(f"File: {tensor_file}") + print(f"Shape: {shape}") + else: + print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}") + sys.exit(1) + + except FileNotFoundError: + print(f"Error: The file '{file_path}' was not found.") + sys.exit(1) + except Exception as e: + print(f"An error occurred: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Print tensor information from a safetensors model" + ) + parser.add_argument( + "tensor_name", + nargs="?", # optional (if --list is used for example) + help="Name of the tensor to inspect" + ) + parser.add_argument( + "-m", "--model-path", + type=Path, + help="Path to the model directory (default: MODEL_PATH environment variable)" + ) + parser.add_argument( + "-l", "--list", + action="store_true", + help="List unique tensor patterns in the model (layer numbers replaced with #)" + ) + + args = parser.parse_args() + + model_path = args.model_path + if model_path is None: + model_path_str = os.environ.get("MODEL_PATH") + if model_path_str is None: + print("Error: --model-path not provided and MODEL_PATH environment variable not set") + sys.exit(1) + model_path = Path(model_path_str) + + if not model_path.exists(): + print(f"Error: Model path does not exist: {model_path}") + sys.exit(1) + + if not model_path.is_dir(): + print(f"Error: Model path is not a directory: {model_path}") + sys.exit(1) + + if args.list: + list_all_tensors(model_path, unique=True) + else: + if args.tensor_name is None: + print("Error: tensor_name is required when not using --list") + sys.exit(1) + print_tensor_info(model_path, args.tensor_name) + + +if __name__ == "__main__": + main() From eaba92c3dcc980ebe753348855d4a5d75c069997 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Feb 2026 12:45:21 +0200 Subject: [PATCH 04/20] tests : add non-cont, inplace rope tests (#19296) * tests : add non-cont, inplace rope tests * cont : exercise dim 3 Co-authored-by: Jeff Bolz * cont : more dim3 exercises --------- Co-authored-by: Jeff Bolz --- tests/test-backend-ops.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 90cc0d7da2..cecdf47038 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8032,6 +8032,8 @@ static std::vector> make_test_cases_eval() { for (int mode : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_IMROPE, GGML_ROPE_TYPE_VISION}) { for (bool ff : {false, true}) { test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 0, true, true)); + test_cases.emplace_back(new test_rope(type, {128, 32, 2, 1}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true)); + test_cases.emplace_back(new test_rope(type, {128, 32, 2, 3}, 128, mode, 512, 1.4245f, 0.7465f, 1.4245f, ff, 1, true, true)); } } } From 8abcc70a747eff568198ae64aa1a60b7625a3c36 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 4 Feb 2026 13:09:58 +0100 Subject: [PATCH 05/20] model: (qwen3next) correct vectorized key_gdiff calculation (#19324) * model: (qwen3next) correct vectorized key_gdiff calculation * move transpose to outside of loop --- src/models/qwen3next.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 57b6659baf..06d946c5fa 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -265,9 +265,15 @@ std::pair llm_build_qwen3next::build_delta_net_chu cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); - ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp); + ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, + 1, chunk_size, n_chunks, g_diff_exp->ne[3]); + + ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) + ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); + cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) + // state to be updated per chunk ggml_tensor * new_state = state; // ggml_dup(ctx0, state); @@ -322,9 +328,9 @@ std::pair llm_build_qwen3next::build_delta_net_chu : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2); // kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new - ggml_tensor * k_gdiff = ggml_cont(ctx0, get_slice_2d(ctx0, key_gdiff, chunk)); + ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk); //ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why? - ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, k_gdiff))); + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t); // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk)); From 423bee462b46323433e5f1e322e3076cedbeec83 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Feb 2026 15:12:03 +0200 Subject: [PATCH 06/20] ci : fix sanitize workflow to enable ggml sanitizers too (#19323) --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fd251ac4c2..8ce679bd9a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -293,6 +293,7 @@ jobs: cmake -B build \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build build --config ${{ matrix.build_type }} -j $(nproc) @@ -303,6 +304,7 @@ jobs: cmake -B build \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \ + -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DGGML_OPENMP=OFF cmake --build build --config ${{ matrix.build_type }} -j $(nproc) From e0c93af2a03f5c53d052dfaefd86c06ed3784646 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 4 Feb 2026 17:55:31 +0100 Subject: [PATCH 07/20] debug: make common_debug_print_tensor readable (#19331) * debug: make common_debug_print_tensor readable * editorconfig --- common/debug.cpp | 34 ++++++++++++++++++---------------- src/models/qwen3next.cpp | 4 ++-- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/common/debug.cpp b/common/debug.cpp index fdaddb1443..0df409a79d 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -45,6 +45,8 @@ static float common_ggml_get_float_value(const uint8_t * data, return v; } +#define INDENT " " + template void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { GGML_ASSERT(n > 0); @@ -60,41 +62,41 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n } } for (int64_t i3 = 0; i3 < ne[3]; i3++) { - LOG_ERR(" [\n"); + LOG(INDENT "[\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2 * n) { - LOG_ERR(" ..., \n"); + LOG(INDENT INDENT "..., \n"); i2 = ne[2] - n; } - LOG_ERR(" [\n"); + LOG(INDENT INDENT "[\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2 * n) { - LOG_ERR(" ..., \n"); + LOG(INDENT INDENT INDENT "..., \n"); i1 = ne[1] - n; } - LOG_ERR(" ["); + LOG(INDENT INDENT INDENT "["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2 * n) { - LOG_ERR("..., "); + LOG(" ..., "); i0 = ne[0] - n; } const float v = common_ggml_get_float_value(data, type, nb, i0, i1, i2, i3); - LOG_ERR("%12.4f", v); + LOG("%12.4f", v); if (i0 < ne[0] - 1) { - LOG_ERR(", "); + LOG(", "); } } - LOG_ERR("],\n"); + LOG(" ],\n"); } - LOG_ERR(" ],\n"); + LOG(INDENT INDENT "],\n"); } - LOG_ERR(" ]\n"); - LOG_ERR(" sum = %f\n", sum); + LOG(INDENT "]\n"); + LOG(INDENT "sum = %f\n", sum); } if constexpr (abort) { if (std::isnan(sum)) { - LOG_ERR("encountered NaN - aborting\n"); + LOG("encountered NaN - aborting\n"); exit(0); } } @@ -137,9 +139,9 @@ template bool common_debug_cb_eval(struct ggml_tensor * t, b } if (matches_filter) { - LOG_ERR("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type), - ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "", - common_ggml_ne_string(t).c_str()); + LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, t->name, ggml_type_name(t->type), + ggml_op_desc(t), src0->name, common_ggml_ne_string(src0).c_str(), src1 ? src1_str : "", + common_ggml_ne_string(t).c_str()); } const bool is_host = ggml_backend_buffer_is_host(t->buffer); diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 06d946c5fa..99b1a76a48 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -265,13 +265,13 @@ std::pair llm_build_qwen3next::build_delta_net_chu cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs) ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff); - ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, + ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp, 1, chunk_size, n_chunks, g_diff_exp->ne[3]); ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t); cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs) - ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); + ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff)); cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs) From b536eb023368701fe3564210440e2df6151c3e65 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 4 Feb 2026 20:20:40 +0100 Subject: [PATCH 08/20] codeowners : add danbev for examples/debug (#19332) * codeowners : add danbev for examples/debug * Add @pwilkin to CODEOWNERS for debug --------- Co-authored-by: Piotr Wilkin (ilintar) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index e573a3d2e6..9d252c9b8d 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -27,6 +27,7 @@ /examples/batched.swift/ @ggerganov /examples/batched/ @ggerganov /examples/convert-llama2c-to-ggml/ @ggerganov +/examples/debug/ @danbev @pwilkin /examples/deprecation-warning/ @ggerganov /examples/diffusion/ @am17an /examples/embedding/ @ggerganov From e6e934c5ea1540522171ec91eee05d37909286d1 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 5 Feb 2026 05:15:03 +0800 Subject: [PATCH 09/20] vendor: update cpp-httplib version (#19313) Signed-off-by: Aaron Teo --- scripts/sync_vendor.py | 4 +- vendor/cpp-httplib/httplib.cpp | 238 ++++++++++++++++++++++++++------- vendor/cpp-httplib/httplib.h | 102 ++++++++++++-- 3 files changed, 285 insertions(+), 59 deletions(-) diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 0771942d49..1ff6a9a40f 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -12,8 +12,8 @@ vendor = { # "https://github.com/mackron/miniaudio/raw/refs/tags/0.11.23/miniaudio.h": "vendor/miniaudio/miniaudio.h", "https://github.com/mackron/miniaudio/raw/669ed3e844524fcd883231b13095baee9f6de304/miniaudio.h": "vendor/miniaudio/miniaudio.h", - "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/httplib.h": "vendor/cpp-httplib/httplib.h", - "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.1/LICENSE": "vendor/cpp-httplib/LICENSE", + "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/httplib.h": "vendor/cpp-httplib/httplib.h", + "https://raw.githubusercontent.com/yhirose/cpp-httplib/refs/tags/v0.30.2/LICENSE": "vendor/cpp-httplib/LICENSE", "https://raw.githubusercontent.com/sheredom/subprocess.h/b49c56e9fe214488493021017bf3954b91c7c1f5/subprocess.h": "vendor/sheredom/subprocess.h", } diff --git a/vendor/cpp-httplib/httplib.cpp b/vendor/cpp-httplib/httplib.cpp index d707e65fd3..ba5f9c8ff9 100644 --- a/vendor/cpp-httplib/httplib.cpp +++ b/vendor/cpp-httplib/httplib.cpp @@ -117,6 +117,8 @@ time_t parse_http_date(const std::string &date_str) { #ifdef _WIN32 return _mkgmtime(&tm_buf); +#elif defined _AIX + return mktime(&tm_buf); #else return timegm(&tm_buf); #endif @@ -1376,7 +1378,7 @@ int getaddrinfo_with_timeout(const char *node, const char *service, // Allocate on the heap, so the resolver thread can keep using the data. auto state = std::make_shared(); - state->node = node; + if (node) { state->node = node; } state->service = service; state->hints = *hints; @@ -2896,10 +2898,20 @@ bool parse_range_header(const std::string &s, Ranges &ranges) try { return; } - const auto first = - static_cast(lhs.empty() ? -1 : std::stoll(lhs)); - const auto last = - static_cast(rhs.empty() ? -1 : std::stoll(rhs)); + ssize_t first = -1; + if (!lhs.empty()) { + ssize_t v; + auto res = detail::from_chars(lhs.data(), lhs.data() + lhs.size(), v); + if (res.ec == std::errc{}) { first = v; } + } + + ssize_t last = -1; + if (!rhs.empty()) { + ssize_t v; + auto res = detail::from_chars(rhs.data(), rhs.data() + rhs.size(), v); + if (res.ec == std::errc{}) { last = v; } + } + if ((first == -1 && last == -1) || (first != -1 && last != -1 && first > last)) { all_valid_ranges = false; @@ -2974,25 +2986,17 @@ bool parse_accept_header(const std::string &s, return; } -#ifdef CPPHTTPLIB_NO_EXCEPTIONS { - std::istringstream iss(quality_str); - iss >> accept_entry.quality; - - // Check if conversion was successful and entire string was consumed - if (iss.fail() || !iss.eof()) { + double v = 0.0; + auto res = detail::from_chars( + quality_str.data(), quality_str.data() + quality_str.size(), v); + if (res.ec == std::errc{}) { + accept_entry.quality = v; + } else { has_invalid_entry = true; return; } } -#else - try { - accept_entry.quality = std::stod(quality_str); - } catch (...) { - has_invalid_entry = true; - return; - } -#endif // Check if quality is in valid range [0.0, 1.0] if (accept_entry.quality < 0.0 || accept_entry.quality > 1.0) { has_invalid_entry = true; @@ -5570,13 +5574,26 @@ bool Server::read_content(Stream &strm, Request &req, Response &res) { strm, req, res, // Regular [&](const char *buf, size_t n) { + // Prevent arithmetic overflow when checking sizes. + // Avoid computing (req.body.size() + n) directly because + // adding two unsigned `size_t` values can wrap around and + // produce a small result instead of indicating overflow. + // Instead, check using subtraction: ensure `n` does not + // exceed the remaining capacity `max_size() - size()`. + if (req.body.size() >= req.body.max_size() || + n > req.body.max_size() - req.body.size()) { + return false; + } + // Limit decompressed body size to payload_max_length_ to protect // against "zip bomb" attacks where a small compressed payload // decompresses to a massive size. - if (req.body.size() + n > payload_max_length_ || - req.body.size() + n > req.body.max_size()) { + if (payload_max_length_ > 0 && + (req.body.size() >= payload_max_length_ || + n > payload_max_length_ - req.body.size())) { return false; } + req.body.append(buf, n); return true; }, @@ -5666,22 +5683,29 @@ bool Server::read_content_core( // oversized request and fail early (causing connection close). For SSL // builds we cannot reliably peek the decrypted application bytes, so keep // the original behaviour. -#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT) && !defined(_WIN32) +#if !defined(CPPHTTPLIB_OPENSSL_SUPPORT) if (!req.has_header("Content-Length") && !detail::is_chunked_transfer_encoding(req.headers)) { - socket_t s = strm.socket(); - if (s != INVALID_SOCKET) { - // Peek up to payload_max_length_ + 1 bytes. If more than - // payload_max_length_ bytes are pending, reject the request. - size_t to_peek = - (payload_max_length_ > 0) - ? (std::min)(payload_max_length_ + 1, static_cast(4096)) - : 1; - std::vector peekbuf(to_peek); - ssize_t n = ::recv(s, peekbuf.data(), to_peek, MSG_PEEK); - if (n > 0 && static_cast(n) > payload_max_length_) { - // Indicate failure so connection will be closed. - return false; + // Only peek if payload_max_length is set to a finite value + if (payload_max_length_ > 0 && + payload_max_length_ < (std::numeric_limits::max)()) { + socket_t s = strm.socket(); + if (s != INVALID_SOCKET) { + // Peek to check if there is any pending data + char peekbuf[1]; + ssize_t n = ::recv(s, peekbuf, 1, MSG_PEEK); + if (n > 0) { + // There is data, so read it with payload limit enforcement + auto result = detail::read_content_without_length( + strm, payload_max_length_, out); + if (result == detail::ReadContentResult::PayloadTooLarge) { + res.status = StatusCode::PayloadTooLarge_413; + return false; + } else if (result != detail::ReadContentResult::Success) { + return false; + } + return true; + } } } return true; @@ -6656,7 +6680,8 @@ void ClientImpl::close_socket(Socket &socket) { } bool ClientImpl::read_response_line(Stream &strm, const Request &req, - Response &res) const { + Response &res, + bool skip_100_continue) const { std::array buf{}; detail::stream_line_reader line_reader(strm, buf.data(), buf.size()); @@ -6677,8 +6702,8 @@ bool ClientImpl::read_response_line(Stream &strm, const Request &req, res.status = std::stoi(std::string(m[2])); res.reason = std::string(m[3]); - // Ignore '100 Continue' - while (res.status == StatusCode::Continue_100) { + // Ignore '100 Continue' (only when not using Expect: 100-continue explicitly) + while (skip_100_continue && res.status == StatusCode::Continue_100) { if (!line_reader.getline()) { return false; } // CRLF if (!line_reader.getline()) { return false; } // next response line @@ -7463,7 +7488,8 @@ bool ClientImpl::write_content_with_provider(Stream &strm, } bool ClientImpl::write_request(Stream &strm, Request &req, - bool close_connection, Error &error) { + bool close_connection, Error &error, + bool skip_body) { // Prepare additional headers if (close_connection) { if (!req.has_header("Connection")) { @@ -7582,7 +7608,59 @@ bool ClientImpl::write_request(Stream &strm, Request &req, } } + // After sending request line and headers, wait briefly for an early server + // response (e.g. 4xx) and avoid sending a potentially large request body + // unnecessarily. This workaround is only enabled on Windows because Unix + // platforms surface write errors (EPIPE) earlier; on Windows kernel send + // buffering can accept large writes even when the peer already responded. + // Check the stream first (which covers SSL via `is_readable()`), then + // fall back to select on the socket. Only perform the wait for very large + // request bodies to avoid interfering with normal small requests and + // reduce side-effects. Poll briefly (up to 50ms as default) for an early + // response. Skip this check when using Expect: 100-continue, as the protocol + // handles early responses properly. +#if defined(_WIN32) + if (!skip_body && + req.body.size() > CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD && + req.path.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) { + auto start = std::chrono::high_resolution_clock::now(); + + for (;;) { + // Prefer socket-level readiness to avoid SSL_pending() false-positives + // from SSL internals. If the underlying socket is readable, assume an + // early response may be present. + auto sock = strm.socket(); + if (sock != INVALID_SOCKET && detail::select_read(sock, 0, 0) > 0) { + return false; + } + + // Fallback to stream-level check for non-socket streams or when the + // socket isn't reporting readable. Avoid using `is_readable()` for + // SSL, since `SSL_pending()` may report buffered records that do not + // indicate a complete application-level response yet. + if (!is_ssl() && strm.is_readable()) { return false; } + + auto now = std::chrono::high_resolution_clock::now(); + auto elapsed = + std::chrono::duration_cast(now - start) + .count(); + if (elapsed >= CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND) { + break; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + } +#endif + // Body + if (skip_body) { return true; } + + return write_request_body(strm, req, error); +} + +bool ClientImpl::write_request_body(Stream &strm, Request &req, + Error &error) { if (req.body.empty()) { return write_content_with_provider(strm, req, error); } @@ -7758,8 +7836,20 @@ void ClientImpl::output_error_log(const Error &err, bool ClientImpl::process_request(Stream &strm, Request &req, Response &res, bool close_connection, Error &error) { - // Send request - if (!write_request(strm, req, close_connection, error)) { return false; } + // Auto-add Expect: 100-continue for large bodies + if (CPPHTTPLIB_EXPECT_100_THRESHOLD > 0 && !req.has_header("Expect")) { + auto body_size = req.body.empty() ? req.content_length_ : req.body.size(); + if (body_size >= CPPHTTPLIB_EXPECT_100_THRESHOLD) { + req.set_header("Expect", "100-continue"); + } + } + + // Check for Expect: 100-continue + auto expect_100_continue = req.get_header_value("Expect") == "100-continue"; + + // Send request (skip body if using Expect: 100-continue) + auto write_request_success = + write_request(strm, req, close_connection, error, expect_100_continue); #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (is_ssl()) { @@ -7774,14 +7864,48 @@ bool ClientImpl::process_request(Stream &strm, Request &req, } #endif + // Handle Expect: 100-continue with timeout + if (expect_100_continue && CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND > 0) { + time_t sec = CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND / 1000; + time_t usec = (CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND % 1000) * 1000; + auto ret = detail::select_read(strm.socket(), sec, usec); + if (ret <= 0) { + // Timeout or error: send body anyway (server didn't respond in time) + if (!write_request_body(strm, req, error)) { return false; } + expect_100_continue = false; // Switch to normal response handling + } + } + // Receive response and headers - if (!read_response_line(strm, req, res) || + // When using Expect: 100-continue, don't auto-skip `100 Continue` response + if (!read_response_line(strm, req, res, !expect_100_continue) || !detail::read_headers(strm, res.headers)) { - error = Error::Read; + if (write_request_success) { error = Error::Read; } output_error_log(error, &req); return false; } + if (!write_request_success) { return false; } + + // Handle Expect: 100-continue response + if (expect_100_continue) { + if (res.status == StatusCode::Continue_100) { + // Server accepted, send the body + if (!write_request_body(strm, req, error)) { return false; } + + // Read the actual response + res.headers.clear(); + res.body.clear(); + if (!read_response_line(strm, req, res) || + !detail::read_headers(strm, res.headers)) { + error = Error::Read; + output_error_log(error, &req); + return false; + } + } + // If not 100 Continue, server returned an error; proceed with that response + } + // Body if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && req.method != "CONNECT") { @@ -9543,7 +9667,7 @@ bool SSLClient::load_certs() { last_openssl_error_ = ERR_get_error(); ret = false; } - } else { + } else if (!ca_cert_store_) { auto loaded = false; #ifdef _WIN32 loaded = @@ -9790,7 +9914,11 @@ bool SSLClient::verify_host_with_common_name(X509 *server_cert) const { bool SSLClient::check_host_name(const char *pattern, size_t pattern_len) const { - if (host_.size() == pattern_len && host_ == pattern) { return true; } + // Exact match (case-insensitive) + if (host_.size() == pattern_len && + detail::case_ignore::equal(host_, std::string(pattern, pattern_len))) { + return true; + } // Wildcard match // https://bugs.launchpad.net/ubuntu/+source/firefox-3.0/+bug/376484 @@ -9805,9 +9933,23 @@ bool SSLClient::check_host_name(const char *pattern, auto itr = pattern_components.begin(); for (const auto &h : host_components_) { auto &p = *itr; - if (p != h && p != "*") { - auto partial_match = (p.size() > 0 && p[p.size() - 1] == '*' && - !p.compare(0, p.size() - 1, h)); + if (!httplib::detail::case_ignore::equal(p, h) && p != "*") { + bool partial_match = false; + if (!p.empty() && p[p.size() - 1] == '*') { + const auto prefix_length = p.size() - 1; + if (prefix_length == 0) { + partial_match = true; + } else if (h.size() >= prefix_length) { + partial_match = + std::equal(p.begin(), + p.begin() + static_cast( + prefix_length), + h.begin(), [](const char ca, const char cb) { + return httplib::detail::case_ignore::to_lower(ca) == + httplib::detail::case_ignore::to_lower(cb); + }); + } + } if (!partial_match) { return false; } } ++itr; diff --git a/vendor/cpp-httplib/httplib.h b/vendor/cpp-httplib/httplib.h index 613020d12c..7c7790f41f 100644 --- a/vendor/cpp-httplib/httplib.h +++ b/vendor/cpp-httplib/httplib.h @@ -8,8 +8,8 @@ #ifndef CPPHTTPLIB_HTTPLIB_H #define CPPHTTPLIB_HTTPLIB_H -#define CPPHTTPLIB_VERSION "0.30.1" -#define CPPHTTPLIB_VERSION_NUM "0x001E01" +#define CPPHTTPLIB_VERSION "0.30.2" +#define CPPHTTPLIB_VERSION_NUM "0x001E02" /* * Platform compatibility check @@ -98,6 +98,22 @@ #define CPPHTTPLIB_CLIENT_MAX_TIMEOUT_MSECOND 0 #endif +#ifndef CPPHTTPLIB_EXPECT_100_THRESHOLD +#define CPPHTTPLIB_EXPECT_100_THRESHOLD 1024 +#endif + +#ifndef CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND +#define CPPHTTPLIB_EXPECT_100_TIMEOUT_MSECOND 1000 +#endif + +#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD +#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_THRESHOLD (1024 * 1024) +#endif + +#ifndef CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND +#define CPPHTTPLIB_WAIT_EARLY_SERVER_RESPONSE_TIMEOUT_MSECOND 50 +#endif + #ifndef CPPHTTPLIB_IDLE_INTERVAL_SECOND #define CPPHTTPLIB_IDLE_INTERVAL_SECOND 0 #endif @@ -286,8 +302,10 @@ using socket_t = int; #include #include #include +#include #include #include +#include #include #include #include @@ -305,6 +323,7 @@ using socket_t = int; #include #include #include +#include #include #include #include @@ -494,6 +513,69 @@ private: bool execute_on_destruction; }; +// Simple from_chars implementation for integer and double types (C++17 +// substitute) +template struct from_chars_result { + const char *ptr; + std::errc ec; +}; + +template +inline from_chars_result from_chars(const char *first, const char *last, + T &value, int base = 10) { + value = 0; + const char *p = first; + bool negative = false; + + if (p != last && *p == '-') { + negative = true; + ++p; + } + if (p == last) { return {first, std::errc::invalid_argument}; } + + T result = 0; + for (; p != last; ++p) { + char c = *p; + int digit = -1; + if ('0' <= c && c <= '9') { + digit = c - '0'; + } else if ('a' <= c && c <= 'z') { + digit = c - 'a' + 10; + } else if ('A' <= c && c <= 'Z') { + digit = c - 'A' + 10; + } else { + break; + } + + if (digit < 0 || digit >= base) { break; } + if (result > ((std::numeric_limits::max)() - digit) / base) { + return {p, std::errc::result_out_of_range}; + } + result = result * base + digit; + } + + if (p == first || (negative && p == first + 1)) { + return {first, std::errc::invalid_argument}; + } + + value = negative ? -result : result; + return {p, std::errc{}}; +} + +// from_chars for double (simple wrapper for strtod) +inline from_chars_result from_chars(const char *first, const char *last, + double &value) { + std::string s(first, last); + char *endptr = nullptr; + errno = 0; + value = std::strtod(s.c_str(), &endptr); + if (endptr == s.c_str()) { return {first, std::errc::invalid_argument}; } + if (errno == ERANGE) { + return {first + (endptr - s.c_str()), std::errc::result_out_of_range}; + } + return {first + (endptr - s.c_str()), std::errc{}}; +} + } // namespace detail enum SSLVerifierResponse { @@ -1848,10 +1930,11 @@ private: Result send_(Request &&req); socket_t create_client_socket(Error &error) const; - bool read_response_line(Stream &strm, const Request &req, - Response &res) const; + bool read_response_line(Stream &strm, const Request &req, Response &res, + bool skip_100_continue = true) const; bool write_request(Stream &strm, Request &req, bool close_connection, - Error &error); + Error &error, bool skip_body = false); + bool write_request_body(Stream &strm, Request &req, Error &error); void prepare_default_headers(Request &r, bool for_stream, const std::string &ct); bool redirect(Request &req, Response &res, Error &error); @@ -3243,10 +3326,11 @@ private: msg.id = value; } else if (field == "retry") { // Parse retry interval in milliseconds - try { - retry_ms = std::stoi(value); - } catch (...) { - // Invalid retry value, ignore + { + int v = 0; + auto res = + detail::from_chars(value.data(), value.data() + value.size(), v); + if (res.ec == std::errc{}) { retry_ms = v; } } } // Unknown fields are ignored per SSE spec From 11fb327bf3846f390e3af1cbe929da9287c618da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 5 Feb 2026 02:27:38 +0100 Subject: [PATCH 10/20] vendor : add missing llama_add_compile_flags (#19322) * add missing llama_add_compile_flags * disable all warnings for ssl, crypto and fipsmodule --- vendor/cpp-httplib/CMakeLists.txt | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 3d938d9f36..18974d64ca 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -3,9 +3,14 @@ license_add_file("cpp-httplib" "LICENSE") find_package(Threads REQUIRED) +llama_add_compile_flags() + add_library(${TARGET} STATIC httplib.cpp httplib.h) -if (NOT MSVC) - # disable warnings in 3rd party code + +# disable warnings in 3rd party code +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options(${TARGET} PRIVATE /w) +else() target_compile_options(${TARGET} PRIVATE -w) endif() @@ -146,6 +151,23 @@ elseif (LLAMA_OPENSSL) endif() endif() +# disable warnings in 3rd party code +if(LLAMA_BUILD_BORINGSSL OR LLAMA_BUILD_LIBRESSL) + if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + target_compile_options(ssl PRIVATE /w) + target_compile_options(crypto PRIVATE /w) + if(LLAMA_BUILD_BORINGSSL) + target_compile_options(fipsmodule PRIVATE /w) + endif() + else() + target_compile_options(ssl PRIVATE -w) + target_compile_options(crypto PRIVATE -w) + if(LLAMA_BUILD_BORINGSSL) + target_compile_options(fipsmodule PRIVATE -w) + endif() + endif() +endif() + if (CPPHTTPLIB_OPENSSL_SUPPORT) target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT) # used in server.cpp if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") From af252d0758b0d3ed67c57bad2a735abf53d21c55 Mon Sep 17 00:00:00 2001 From: will-lms Date: Thu, 5 Feb 2026 01:05:09 -0500 Subject: [PATCH 11/20] metal : add missing includes (#19348) --- ggml/src/ggml-metal/ggml-metal.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp index a616dcdb46..1c705362fb 100644 --- a/ggml/src/ggml-metal/ggml-metal.cpp +++ b/ggml/src/ggml-metal/ggml-metal.cpp @@ -7,6 +7,9 @@ #include "ggml-metal-context.h" #include "ggml-metal-ops.h" +#include +#include + #define GGML_METAL_NAME "MTL" #define GGML_METAL_MAX_DEVICES 16 From c342c3b93de358a4571941b41c35dc5ba2081145 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Feb 2026 01:38:59 -0600 Subject: [PATCH 12/20] vulkan: fix non-contig rope (#19299) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 32 ++++-- .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 5 +- .../vulkan-shaders/rope_funcs.glsl | 99 +++++++------------ .../vulkan-shaders/rope_multi.comp | 11 ++- .../ggml-vulkan/vulkan-shaders/rope_neox.comp | 11 ++- .../ggml-vulkan/vulkan-shaders/rope_norm.comp | 11 ++- .../vulkan-shaders/rope_params.glsl | 15 ++- .../vulkan-shaders/rope_vision.comp | 11 ++- 8 files changed, 100 insertions(+), 95 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index cb7fa2c9cb..af57685a37 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1263,25 +1263,30 @@ struct vk_op_diag_mask_push_constants { struct vk_op_rope_push_constants { uint32_t rope_mode; - uint32_t ncols; uint32_t nrows; uint32_t n_dims; float freq_scale; - uint32_t p_delta_rows; float freq_base; float ext_factor; float attn_factor; float corr_dims[2]; float theta_scale; uint32_t has_ff; - uint32_t ne02; - uint32_t s1; - uint32_t s2; int32_t sections[4]; uint32_t is_imrope; uint32_t is_back; uint32_t set_rows_stride; + uint32_t ne00; + uint32_t ne01; + uint32_t ne02; + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; }; +static_assert(sizeof(vk_op_rope_push_constants) <= 128, "sizeof(vk_op_rope_push_constants) must be <= 128"); // For fused rms_norm+mul+rope(+view+set_rows) struct vk_op_rms_norm_mul_rope_push_constants { @@ -10405,12 +10410,22 @@ static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor * uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type); uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type); + uint32_t nb03 = src0->nb[3] / ggml_type_size(src0->type); + + uint32_t nb11 = dst->nb[1] / ggml_type_size(dst->type); + uint32_t nb12 = dst->nb[2] / ggml_type_size(dst->type); + uint32_t nb13 = dst->nb[3] / ggml_type_size(dst->type); vk_op_rope_push_constants rope { - (uint32_t)mode, (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], - freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, - has_ff, (uint32_t)src0->ne[2], nb01, nb02, + (uint32_t)mode, (uint32_t)ggml_nrows(src0), (uint32_t)n_dims, freq_scale, + freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, has_ff, { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, + + (uint32_t)src0->ne[0], + (uint32_t)src0->ne[1], + (uint32_t)src0->ne[2], + nb01, nb02, nb03, + nb11, nb12, nb13, }; return rope; @@ -14798,6 +14813,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_REPEAT_BACK: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_ROPE: + return ggml_is_contiguous_rows(op) && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_ROPE_BACK: case GGML_OP_NONE: case GGML_OP_RESHAPE: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index 9d6d366542..55b89f19a7 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -112,12 +112,11 @@ void rms_norm(uint num_iters) { #if RMS_NORM_ROPE_FUSION barrier(); rope_params rp = p.rope; - uint rope_row = (samp*nchannels + channel)*nrows + row; for (uint t = 2*tid; t < ncols; t += 2*BLOCK_SIZE) { if (rp.rope_mode == GGML_ROPE_TYPE_NEOX) { - rope_neox(t, rope_row, rp); + rope_neox(t, row, channel, samp, rp); } else if (rp.rope_mode == GGML_ROPE_TYPE_NORMAL) { - rope_norm(t, rope_row, rp); + rope_norm(t, row, channel, samp, rp); } } #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl index aacec98469..2e53459909 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl @@ -4,12 +4,12 @@ float rope_yarn_ramp(const float low, const float high, const uint i0) { return 1.0f - min(1.0f, max(0.0f, y)); } -uint rope_a_coord(const uint i0, const uint i01, const uint i02, rope_params p) { +uint rope_a_coord(const uint i0, const uint i01, const uint i02, const uint i03, rope_params p) { #if RMS_NORM_ROPE_FUSION // Per-row offset in shared memory const uint ix = i0; #else - const uint ix = i02*p.nb02 + i01*p.nb01 + i0; + const uint ix = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i0; #endif return ix; } @@ -34,26 +34,19 @@ void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out sin_theta = sin(theta) * mscale; } -void rope_norm(const uint i0, const uint i1, rope_params p) { - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - - if (i0 >= ne0) { +void rope_norm(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) { + if (i0 >= p.ne00) { return; } - // i1 is actually i2*nb2+i1, but the rows are contiguous - const uint i01 = i1 % ne1; - const uint i02 = i1 / ne1; - - uint idst = i1*ne0 + i0; - const uint ix = rope_a_coord(i0, i01, i02, p); + uint idst = i0 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; + const uint ix = rope_a_coord(i0, i1, i2, i3, p); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i. if (p.set_rows_stride != 0) { - idst = i01*ne0 + i0; - idst += rope_data_i[i02].x * p.set_rows_stride; + idst = i1*p.nb11 + i0; + idst += rope_data_i[i2].x * p.set_rows_stride; } if (i0 >= p.n_dims) { @@ -63,7 +56,7 @@ void rope_norm(const uint i0, const uint i1, rope_params p) { return; } - const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f); + const float theta_base = rope_data_pos[i2] * pow(p.theta_scale, i0/2.0f); const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; @@ -77,25 +70,19 @@ void rope_norm(const uint i0, const uint i1, rope_params p) { rope_data_d[idst + 1] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); } -void rope_neox(const uint i0, const uint i1, rope_params p) { - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - - if (i0 >= ne0) { +void rope_neox(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) { + if (i0 >= p.ne00) { return; } - const uint i01 = i1 % ne1; - const uint i02 = i1 / ne1; - - uint idst = i1*ne0 + i0/2; - const uint ix = rope_a_coord(i0/2, i01, i02, p); + uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; + const uint ix = rope_a_coord(i0/2, i1, i2, i3, p); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i. if (p.set_rows_stride != 0) { - idst = i01*ne0 + i0/2; - idst += rope_data_i[i02].x * p.set_rows_stride; + idst = i1*p.nb11 + i0/2; + idst += rope_data_i[i2].x * p.set_rows_stride; } if (i0 >= p.n_dims) { @@ -105,7 +92,7 @@ void rope_neox(const uint i0, const uint i1, rope_params p) { return; } - const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f); + const float theta_base = rope_data_pos[i2] * pow(p.theta_scale, i0/2.0f); const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; @@ -120,26 +107,19 @@ void rope_neox(const uint i0, const uint i1, rope_params p) { } -void rope_multi(const uint i0, const uint i1, rope_params p) { - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - uint ne2 = p.ne02; - - if (i0 >= ne0) { +void rope_multi(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) { + if (i0 >= p.ne00) { return; } - const uint i01 = i1 % ne1; - const uint i02 = i1 / ne1; - - uint idst = i1*ne0 + i0/2; - const uint ix = rope_a_coord(i0/2, i01, i02, p); + uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; + const uint ix = rope_a_coord(i0/2, i1, i2, i3, p); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i. if (p.set_rows_stride != 0) { - idst = i01*ne0 + i0/2; - idst += rope_data_i[i02].x * p.set_rows_stride; + idst = i1*p.nb11 + i0/2; + idst += rope_data_i[i2].x * p.set_rows_stride; } if (i0 >= p.n_dims) { @@ -156,26 +136,26 @@ void rope_multi(const uint i0, const uint i1, rope_params p) { float theta_base = 0.0; if (p.is_imrope != 0) { if (sector % 3 == 1 && sector < 3 * p.sections[1]) { - theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 1]*pow(p.theta_scale, i0/2.0f); } else if (sector % 3 == 2 && sector < 3 * p.sections[2]) { - theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 2]*pow(p.theta_scale, i0/2.0f); } else if (sector % 3 == 0 && sector < 3 * p.sections[0]) { - theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2]*pow(p.theta_scale, i0/2.0f); } else { - theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 3]*pow(p.theta_scale, i0/2.0f); } } else { if (sector < p.sections[0]) { - theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2]*pow(p.theta_scale, i0/2.0f); } else if (sector >= p.sections[0] && sector < sec_w) { - theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 1]*pow(p.theta_scale, i0/2.0f); } else if (sector >= sec_w && sector < sec_w + p.sections[2]) { - theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 2]*pow(p.theta_scale, i0/2.0f); } else if (sector >= sec_w + p.sections[2]) { - theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f); + theta_base = rope_data_pos[i2 + p.ne02 * 3]*pow(p.theta_scale, i0/2.0f); } } @@ -191,20 +171,13 @@ void rope_multi(const uint i0, const uint i1, rope_params p) { rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); } -void rope_vision(const uint i0, const uint i1, rope_params p) { - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - uint ne2 = p.ne02; - - if (i0 >= ne0) { +void rope_vision(const uint i0, const uint i1, const uint i2, const uint i3, rope_params p) { + if (i0 >= p.ne00) { return; } - const uint i01 = i1 % ne1; - const uint i02 = i1 / ne1; - - const uint idst = i1*ne0 + i0/2; - const uint ix = rope_a_coord(i0/2, i01, i02, p); + const uint idst = i0/2 + i1 * p.nb11 + i2 * p.nb12 + i3 * p.nb13; + const uint ix = rope_a_coord(i0/2, i1, i2, i3, p); const int sect_dims = p.sections[0] + p.sections[1]; const int sec_w = p.sections[1] + p.sections[0]; @@ -213,11 +186,11 @@ void rope_vision(const uint i0, const uint i1, rope_params p) { float theta_base = 0.0; if (sector < p.sections[0]) { const uint p0 = sector; - theta_base = rope_data_pos[i02]*pow(p.theta_scale, p0); + theta_base = rope_data_pos[i2]*pow(p.theta_scale, p0); } else if (sector >= p.sections[0] && sector < sec_w) { const uint p0 = sector - p.sections[0]; - theta_base = rope_data_pos[i02 + ne2]*pow(p.theta_scale, p0); + theta_base = rope_data_pos[i2 + p.ne02]*pow(p.theta_scale, p0); } const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index f7587468a8..1528fbeeae 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -5,10 +5,13 @@ void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - // i1 is actually i2*nb2+i1, but the rows are contiguous - const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; - if (i1 >= pc.nrows) { + const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; + if (row >= pc.nrows) { return; } - rope_multi(i0, i1, pc); + const uint i3 = row / (pc.ne01*pc.ne02); + const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01; + const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01); + + rope_multi(i0, i1, i2, i3, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp index acb8ed7815..ad0896095d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp @@ -5,10 +5,13 @@ void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - // i1 is actually i2*nb2+i1, but the rows are contiguous - const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; - if (i1 >= pc.nrows) { + const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; + if (row >= pc.nrows) { return; } - rope_neox(i0, i1, pc); + const uint i3 = row / (pc.ne01*pc.ne02); + const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01; + const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01); + + rope_neox(i0, i1, i2, i3, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp index 0033cdb224..11220817df 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp @@ -5,10 +5,13 @@ void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - // i1 is actually i2*nb2+i1, but the rows are contiguous - const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; - if (i1 >= pc.nrows) { + const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; + if (row >= pc.nrows) { return; } - rope_norm(i0, i1, pc); + const uint i3 = row / (pc.ne01*pc.ne02); + const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01; + const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01); + + rope_norm(i0, i1, i2, i3, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl index 939cf3c51c..ec6ceaca9b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl @@ -5,24 +5,29 @@ struct rope_params { uint rope_mode; - uint ncols; uint nrows; uint n_dims; float freq_scale; - uint p_delta_rows; float freq_base; float ext_factor; float attn_factor; float corr_dims[2]; float theta_scale; uint has_ff; - uint ne02; - uint nb01; - uint nb02; int sections[4]; uint is_imrope; uint is_back; uint set_rows_stride; + + uint ne00; + uint ne01; + uint ne02; + uint nb01; + uint nb02; + uint nb03; + uint nb11; + uint nb12; + uint nb13; }; #endif // !defined(GGML_ROPE_PARAMS) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp index d93800b5e7..ca71efb2f5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp @@ -5,10 +5,13 @@ void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - // i1 is actually i2*nb2+i1, but the rows are contiguous - const uint i1 = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; - if (i1 >= pc.nrows) { + const uint row = gl_GlobalInvocationID.x + 32768 * gl_GlobalInvocationID.z; + if (row >= pc.nrows) { return; } - rope_vision(i0, i1, pc); + const uint i3 = row / (pc.ne01*pc.ne02); + const uint i2 = (row - i3 * pc.ne01*pc.ne02) / pc.ne01; + const uint i1 = (row - i3 * pc.ne01*pc.ne02 - i2 * pc.ne01); + + rope_vision(i0, i1, i2, i3, pc); } From 3409ab842d988c3e0dc0d3110dd793a5e187d37e Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Feb 2026 01:48:33 -0600 Subject: [PATCH 13/20] vulkan: Set k_load_shmem to false when K is too large (#19301) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index af57685a37..2f6570181a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -3204,9 +3204,10 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t D_lsb = D ^ (D & (D-1)); uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4); - // Nvidia prefers shared memory use to load large tiles of K + // Nvidia prefers shared memory use to load large tiles of K. + // Switch to loading from global memory when it would use too much shared memory. // AMD prefers loading K directly from global memory - const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA ? 1 : 0; + const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0; return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem}; }; @@ -8412,7 +8413,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br; const uint32_t sfsh = Bc * sfshstride * acctype; - const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA; + const bool k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256; const uint32_t kshstride = (k_load_shmem ? hsk_pad : MatBr) / 4 + 2; const uint32_t vsh_stride = MatBc / 4 * row_split; const uint32_t ksh = ((kshstride >= vsh_stride) ? (Bc * kshstride) : (Bc * vsh_stride)) * f16vec4; From a498c75ad1d7019bc2cc1e5d83e11bb8a062861f Mon Sep 17 00:00:00 2001 From: Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com> Date: Thu, 5 Feb 2026 03:06:59 -0500 Subject: [PATCH 14/20] vulkan: fix GPU deduplication logic. (#19222) * vulkan: fix GPU deduplication logic. As reported in https://github.com/ggml-org/llama.cpp/issues/19221, the (same uuid, same driver) logic is problematic for windows+intel igpu. Let's just avoid filtering for MoltenVK which is apple-specific, and keep the logic the same as before 88d23ad5 - just dedup based on UUID. Verified that MacOS + 4xVega still reports 4 GPUs with this version. * vulkan: only skip dedup when both drivers are moltenVk --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2f6570181a..ff9cb7355c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5561,9 +5561,9 @@ static void ggml_vk_instance_init() { // Check if there are two physical devices corresponding to the same GPU // This handles the case where the same GPU appears with different drivers (e.g., RADV + AMDVLK on Linux), // see https://github.com/ggml-org/llama.cpp/pull/7582 for original deduplication. - // However, for MoltenVK on macOS, multiple GPUs on the same card may report the same UUID, - // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Until this is fixed, we'll only deduplicate - // when drivers differ (same driver + same UUID = likely different GPUs) + // MoltenVK on macOS may report the same UUID for distinct GPUs on multi-GPU cards, + // see https://github.com/KhronosGroup/MoltenVK/issues/2683. Skip when both old/new + // driver is MoltenVK auto old_device = std::find_if( vk_instance.device_indices.begin(), vk_instance.device_indices.end(), @@ -5580,11 +5580,9 @@ static void ggml_vk_instance_init() { old_id.deviceLUIDValid && new_id.deviceLUIDValid && std::equal(std::begin(old_id.deviceLUID), std::end(old_id.deviceLUID), std::begin(new_id.deviceLUID)) ); + bool both_molten_vk = (new_driver.driverID == vk::DriverId::eMoltenvk && old_driver.driverID == vk::DriverId::eMoltenvk); - // Only deduplicate if same UUID AND different drivers - // (same driver + same UUID on MoltenVK = likely different GPUs on multi-GPU card) - bool different_driver = (old_driver.driverID != new_driver.driverID); - return same_uuid && different_driver; + return same_uuid && !both_molten_vk; } ); if (old_device == vk_instance.device_indices.end()) { From 7a4f97d19694ca4ffa643567c6c353dea976455b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Feb 2026 10:08:45 +0200 Subject: [PATCH 15/20] metal : add diag (#19330) --- ggml/src/ggml-metal/ggml-metal-device.cpp | 20 ++++++++++ ggml/src/ggml-metal/ggml-metal-device.h | 1 + ggml/src/ggml-metal/ggml-metal-device.m | 4 +- ggml/src/ggml-metal/ggml-metal-impl.h | 19 ++++++++++ ggml/src/ggml-metal/ggml-metal-ops.cpp | 46 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal-ops.h | 1 + ggml/src/ggml-metal/ggml-metal.metal | 20 ++++++++++ 7 files changed, 110 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 4cd3d93d81..6af0dd88d5 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -176,6 +176,26 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows(ggml_me return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag(ggml_metal_library_t lib, const ggml_tensor * op) { + char base[256]; + char name[256]; + + const int n = op->src[0]->ne[0]; + + snprintf(base, 256, "kernel_diag_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s_n=%d", base, n); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + res.nsg = 1; + res.smem = 0; + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_metal_library_t lib, ggml_type tsrc) { char base[256]; char name[256]; diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index d898432712..84dcec3083 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -108,6 +108,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_1d struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 8a0b85c6e4..c8e737d418 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1152,8 +1152,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return has_simdgroup_reduction; case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: - case GGML_OP_SOLVE_TRI: return true; + case GGML_OP_SOLVE_TRI: case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: return has_simdgroup_reduction; @@ -1235,6 +1235,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return false; }; } + case GGML_OP_DIAG: + return true; case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: return has_simdgroup_reduction; diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 640ade8f88..7f73cb97bb 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -792,6 +792,25 @@ typedef struct { uint64_t nb3; } ggml_metal_kargs_set_rows; +typedef struct { + int32_t ne00; + int32_t ne01; + int32_t ne02; + int32_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + int32_t ne0; + int32_t ne1; + int32_t ne2; + int32_t ne3; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; +} ggml_metal_kargs_diag; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 753fcec317..e0ed6c7805 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -361,6 +361,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_set_rows(ctx, idx); } break; + case GGML_OP_DIAG: + { + n_fuse = ggml_metal_op_diag(ctx, idx); + } break; case GGML_OP_L2_NORM: { n_fuse = ggml_metal_op_l2_norm(ctx, idx); @@ -1259,6 +1263,48 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_diag(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(int32_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + ggml_metal_kargs_diag args = { + /*.ne00 =*/ne00, + /*.ne01 =*/ne01, + /*.ne02 =*/ne02, + /*.ne03 =*/ne03, + /*.nb00 =*/nb00, + /*.nb01 =*/nb01, + /*.nb02 =*/nb02, + /*.nb03 =*/nb03, + /*.ne0 =*/ne0, + /*.ne1 =*/ne1, + /*.ne2 =*/ne2, + /*.ne3 =*/ne3, + /*.nb0 =*/nb0, + /*.nb1 =*/nb1, + /*.nb2 =*/nb2, + /*.nb3 =*/nb3, + }; + + auto pipeline = ggml_metal_library_get_pipeline_diag(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 2); + + ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, 32, 1, 1); + + return 1; +} + int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 2e4c7d3fa1..3c64e4f600 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -56,6 +56,7 @@ int ggml_metal_op_sum_rows (ggml_metal_op_t ctx, int idx); int ggml_metal_op_cumsum (ggml_metal_op_t ctx, int idx); int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx); int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx); int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index c09a54e661..e54cdab39d 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -8815,6 +8815,26 @@ kernel void kernel_set_rows_f( } } +kernel void kernel_diag_f32( + constant ggml_metal_kargs_diag & args, + device const char * src0, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiitg[[thread_index_in_threadgroup]]) { + constexpr short NW = N_SIMDWIDTH; + + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t i1 = tgpig.x; + + device const float * src0_ptr = (device const float *)(src0 + i2*args.nb02 + i3*args.nb03); + device float * dst_ptr = (device float *)(dst + i1*args.nb01 + i2*args.nb2 + i3*args.nb3); + + for (int i0 = tiitg; i0 < args.ne0; i0 += NW) { + dst_ptr[i0] = i0 == i1 ? src0_ptr[i0] : 0.0f; + } +} + constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]]; constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]]; From a4ea7a188f3f777da665d73fe297fb7bb716e526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 5 Feb 2026 09:53:35 +0100 Subject: [PATCH 16/20] vendor : update BoringSSL to 0.20260204.0 (#19333) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- vendor/cpp-httplib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 18974d64ca..a8a59e02f4 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -39,7 +39,7 @@ if (LLAMA_BUILD_BORINGSSL) set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)") set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository") - set(BORINGSSL_VERSION "0.20251002.0" CACHE STRING "BoringSSL version") + set(BORINGSSL_VERSION "0.20260204.0" CACHE STRING "BoringSSL version") message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}") From b828e18c75137e29fbfd3f3daa38281172d6a636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 5 Feb 2026 11:10:39 +0100 Subject: [PATCH 17/20] docker : fix vulkan build (#19352) --- .devops/vulkan.Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index 9797c5e0f3..5d6c87ed6b 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -54,6 +54,7 @@ RUN apt-get update \ build-essential \ git \ python3 \ + python3-dev \ python3-pip \ python3-wheel \ && pip install --break-system-packages --upgrade setuptools \ From 3795cc1e89e16fbc145f8a5457ea30abd86e0d1d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Feb 2026 14:34:07 +0200 Subject: [PATCH 18/20] benches : update models + numbers (#19359) * bench : update script * benches : update numbers --- benches/dgx-spark/dgx-spark.md | 371 +++++++++++++++------------ benches/mac-m2-ultra/mac-m2-ultra.md | 298 +++++++++++++++++++++ scripts/bench-models.sh | 60 +++-- 3 files changed, 541 insertions(+), 188 deletions(-) create mode 100644 benches/mac-m2-ultra/mac-m2-ultra.md mode change 100644 => 100755 scripts/bench-models.sh diff --git a/benches/dgx-spark/dgx-spark.md b/benches/dgx-spark/dgx-spark.md index ec6c20d8a0..fd5c4e2c78 100644 --- a/benches/dgx-spark/dgx-spark.md +++ b/benches/dgx-spark/dgx-spark.md @@ -8,7 +8,7 @@ g++ --version g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 nvidia-smi -Sun Nov 2 10:43:25 2025 +Thu Feb 5 13:49:40 2026 +-----------------------------------------------------------------------------------------+ | NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 | +-----------------------------------------+------------------------+----------------------+ @@ -17,7 +17,7 @@ Sun Nov 2 10:43:25 2025 | | | MIG M. | |=========================================+========================+======================| | 0 NVIDIA GB10 On | 0000000F:01:00.0 Off | N/A | -| N/A 35C P8 4W / N/A | Not Supported | 0% Default | +| N/A 47C P0 13W / N/A | Not Supported | 0% Default | | | | N/A | +-----------------------------------------+------------------------+----------------------+ ``` @@ -29,46 +29,46 @@ Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF - `llama-batched-bench` -main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.374 | 1369.01 | 0.383 | 83.64 | 0.757 | 719.01 | -| 512 | 32 | 2 | 1088 | 0.274 | 3741.35 | 0.659 | 97.14 | 0.933 | 1166.66 | -| 512 | 32 | 4 | 2176 | 0.526 | 3896.47 | 0.817 | 156.73 | 1.342 | 1621.08 | -| 512 | 32 | 8 | 4352 | 1.044 | 3925.10 | 0.987 | 259.44 | 2.030 | 2143.56 | -| 512 | 32 | 16 | 8704 | 2.076 | 3945.84 | 1.248 | 410.32 | 3.324 | 2618.60 | -| 512 | 32 | 32 | 17408 | 4.170 | 3929.28 | 1.630 | 628.40 | 5.799 | 3001.76 | -| 4096 | 32 | 1 | 4128 | 1.083 | 3782.66 | 0.394 | 81.21 | 1.477 | 2795.13 | -| 4096 | 32 | 2 | 8256 | 2.166 | 3782.72 | 0.725 | 88.28 | 2.891 | 2856.14 | -| 4096 | 32 | 4 | 16512 | 4.333 | 3780.88 | 0.896 | 142.82 | 5.230 | 3157.38 | -| 4096 | 32 | 8 | 33024 | 8.618 | 3802.14 | 1.155 | 221.69 | 9.773 | 3379.08 | -| 4096 | 32 | 16 | 66048 | 17.330 | 3781.73 | 1.598 | 320.34 | 18.928 | 3489.45 | -| 4096 | 32 | 32 | 132096 | 34.671 | 3780.48 | 2.336 | 438.35 | 37.007 | 3569.51 | -| 8192 | 32 | 1 | 8224 | 2.233 | 3668.56 | 0.438 | 72.98 | 2.671 | 3078.44 | -| 8192 | 32 | 2 | 16448 | 4.425 | 3702.95 | 0.756 | 84.66 | 5.181 | 3174.95 | -| 8192 | 32 | 4 | 32896 | 8.859 | 3698.64 | 0.967 | 132.38 | 9.826 | 3347.72 | -| 8192 | 32 | 8 | 65792 | 17.714 | 3699.57 | 1.277 | 200.52 | 18.991 | 3464.35 | -| 8192 | 32 | 16 | 131584 | 35.494 | 3692.84 | 1.841 | 278.12 | 37.335 | 3524.46 | -| 8192 | 32 | 32 | 263168 | 70.949 | 3694.82 | 2.798 | 365.99 | 73.747 | 3568.53 | +| 512 | 32 | 1 | 544 | 0.270 | 1895.57 | 0.399 | 80.13 | 0.669 | 812.60 | +| 512 | 32 | 2 | 1088 | 0.230 | 4451.23 | 0.583 | 109.71 | 0.813 | 1337.56 | +| 512 | 32 | 4 | 2176 | 0.437 | 4688.87 | 0.820 | 156.03 | 1.257 | 1730.91 | +| 512 | 32 | 8 | 4352 | 0.863 | 4744.23 | 0.942 | 271.79 | 1.805 | 2410.73 | +| 512 | 32 | 16 | 8704 | 1.725 | 4748.19 | 1.173 | 436.38 | 2.899 | 3002.85 | +| 512 | 32 | 32 | 17408 | 3.437 | 4767.38 | 1.503 | 681.49 | 4.939 | 3524.40 | +| 4096 | 32 | 1 | 4128 | 0.907 | 4513.91 | 0.407 | 78.54 | 1.315 | 3139.56 | +| 4096 | 32 | 2 | 8256 | 1.796 | 4560.42 | 0.625 | 102.37 | 2.422 | 3409.45 | +| 4096 | 32 | 4 | 16512 | 3.596 | 4555.66 | 0.888 | 144.11 | 4.485 | 3681.93 | +| 4096 | 32 | 8 | 33024 | 7.184 | 4561.44 | 1.098 | 233.11 | 8.282 | 3987.51 | +| 4096 | 32 | 16 | 66048 | 14.369 | 4560.82 | 1.503 | 340.74 | 15.872 | 4161.30 | +| 4096 | 32 | 32 | 132096 | 28.760 | 4557.52 | 2.162 | 473.59 | 30.922 | 4271.95 | +| 8192 | 32 | 1 | 8224 | 1.859 | 4405.59 | 0.430 | 74.36 | 2.290 | 3591.61 | +| 8192 | 32 | 2 | 16448 | 3.698 | 4430.02 | 0.656 | 97.59 | 4.354 | 3777.47 | +| 8192 | 32 | 4 | 32896 | 7.403 | 4426.10 | 0.957 | 133.82 | 8.360 | 3934.97 | +| 8192 | 32 | 8 | 65792 | 14.802 | 4427.63 | 1.222 | 209.44 | 16.024 | 4105.87 | +| 8192 | 32 | 16 | 131584 | 29.596 | 4428.67 | 1.741 | 294.13 | 31.337 | 4199.00 | +| 8192 | 32 | 32 | 263168 | 59.169 | 4430.42 | 2.619 | 390.92 | 61.789 | 4259.17 | - `llama-bench` -| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 3714.25 ± 20.36 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 86.58 ± 0.43 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 3445.17 ± 17.85 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 81.72 ± 0.53 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 3218.78 ± 11.34 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.86 ± 0.64 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 2732.83 ± 7.17 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 71.57 ± 0.51 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 2119.75 ± 12.81 | -| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 62.33 ± 0.24 | +| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 4505.82 ± 12.90 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 83.43 ± 0.59 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 4158.34 ± 18.84 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 79.22 ± 0.60 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 3993.81 ± 17.55 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 75.22 ± 1.05 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 3449.98 ± 12.13 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.36 ± 0.37 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 2689.42 ± 18.89 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 61.65 ± 0.30 | -build: eeee367de (6989) +build: 11fb327bf (7941) ## ggml-org/gpt-oss-120b-GGUF @@ -77,46 +77,46 @@ Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF - `llama-batched-bench` -main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.571 | 897.18 | 0.543 | 58.96 | 1.113 | 488.60 | -| 512 | 32 | 2 | 1088 | 0.593 | 1725.37 | 1.041 | 61.45 | 1.635 | 665.48 | -| 512 | 32 | 4 | 2176 | 1.043 | 1963.15 | 1.334 | 95.95 | 2.377 | 915.36 | -| 512 | 32 | 8 | 4352 | 2.099 | 1951.63 | 1.717 | 149.07 | 3.816 | 1140.45 | -| 512 | 32 | 16 | 8704 | 4.207 | 1947.12 | 2.311 | 221.56 | 6.518 | 1335.35 | -| 512 | 32 | 32 | 17408 | 8.422 | 1945.36 | 3.298 | 310.46 | 11.720 | 1485.27 | -| 4096 | 32 | 1 | 4128 | 2.138 | 1915.88 | 0.571 | 56.09 | 2.708 | 1524.12 | -| 4096 | 32 | 2 | 8256 | 4.266 | 1920.25 | 1.137 | 56.27 | 5.404 | 1527.90 | -| 4096 | 32 | 4 | 16512 | 8.564 | 1913.02 | 1.471 | 86.99 | 10.036 | 1645.29 | -| 4096 | 32 | 8 | 33024 | 17.092 | 1917.19 | 1.979 | 129.33 | 19.071 | 1731.63 | -| 4096 | 32 | 16 | 66048 | 34.211 | 1915.65 | 2.850 | 179.66 | 37.061 | 1782.15 | -| 4096 | 32 | 32 | 132096 | 68.394 | 1916.44 | 4.381 | 233.72 | 72.775 | 1815.13 | -| 8192 | 32 | 1 | 8224 | 4.349 | 1883.45 | 0.620 | 51.65 | 4.969 | 1655.04 | -| 8192 | 32 | 2 | 16448 | 8.674 | 1888.83 | 1.178 | 54.33 | 9.852 | 1669.48 | -| 8192 | 32 | 4 | 32896 | 17.351 | 1888.55 | 1.580 | 81.01 | 18.931 | 1737.68 | -| 8192 | 32 | 8 | 65792 | 34.743 | 1886.31 | 2.173 | 117.80 | 36.916 | 1782.20 | -| 8192 | 32 | 16 | 131584 | 69.413 | 1888.29 | 3.297 | 155.28 | 72.710 | 1809.70 | -| 8192 | 32 | 32 | 263168 | 138.903 | 1887.24 | 5.004 | 204.63 | 143.907 | 1828.73 | +| 512 | 32 | 1 | 544 | 0.445 | 1151.80 | 0.560 | 57.14 | 1.005 | 541.53 | +| 512 | 32 | 2 | 1088 | 0.472 | 2169.85 | 0.874 | 73.27 | 1.345 | 808.65 | +| 512 | 32 | 4 | 2176 | 0.826 | 2480.33 | 1.299 | 98.51 | 2.125 | 1023.94 | +| 512 | 32 | 8 | 4352 | 1.644 | 2491.67 | 1.608 | 159.18 | 3.252 | 1338.20 | +| 512 | 32 | 16 | 8704 | 3.292 | 2488.35 | 2.117 | 241.85 | 5.409 | 1609.13 | +| 512 | 32 | 32 | 17408 | 6.604 | 2481.07 | 2.898 | 353.31 | 9.502 | 1832.04 | +| 4096 | 32 | 1 | 4128 | 1.698 | 2412.65 | 0.580 | 55.21 | 2.277 | 1812.66 | +| 4096 | 32 | 2 | 8256 | 3.399 | 2409.88 | 0.934 | 68.53 | 4.333 | 1905.27 | +| 4096 | 32 | 4 | 16512 | 6.823 | 2401.21 | 1.411 | 90.72 | 8.234 | 2005.30 | +| 4096 | 32 | 8 | 33024 | 13.574 | 2413.97 | 1.841 | 139.07 | 15.415 | 2142.31 | +| 4096 | 32 | 16 | 66048 | 27.176 | 2411.52 | 2.609 | 196.26 | 29.785 | 2217.49 | +| 4096 | 32 | 32 | 132096 | 54.359 | 2411.23 | 3.905 | 262.20 | 58.264 | 2267.19 | +| 8192 | 32 | 1 | 8224 | 3.491 | 2346.81 | 0.613 | 52.23 | 4.103 | 2004.21 | +| 8192 | 32 | 2 | 16448 | 6.939 | 2361.03 | 0.981 | 65.21 | 7.921 | 2076.56 | +| 8192 | 32 | 4 | 32896 | 13.888 | 2359.40 | 1.511 | 84.71 | 15.399 | 2136.21 | +| 8192 | 32 | 8 | 65792 | 27.756 | 2361.18 | 2.034 | 125.86 | 29.790 | 2208.56 | +| 8192 | 32 | 16 | 131584 | 55.554 | 2359.34 | 3.021 | 169.49 | 58.575 | 2246.41 | +| 8192 | 32 | 32 | 263168 | 111.036 | 2360.89 | 4.537 | 225.72 | 115.573 | 2277.08 | - `llama-bench` -| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 1919.36 ± 5.01 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 60.40 ± 0.30 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 1825.30 ± 6.37 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 56.94 ± 0.29 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1739.19 ± 6.00 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 52.51 ± 0.42 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1536.75 ± 4.27 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 49.33 ± 0.27 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1255.85 ± 3.26 | -| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 42.99 ± 0.18 | +| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2443.91 ± 7.47 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 58.72 ± 0.20 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2309.84 ± 3.63 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 55.67 ± 0.35 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2216.68 ± 10.16 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 52.87 ± 0.43 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1956.31 ± 6.39 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 49.45 ± 0.20 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1567.08 ± 11.79 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 42.76 ± 0.14 | -build: eeee367de (6989) +build: 11fb327bf (7941) ## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF @@ -125,46 +125,46 @@ Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF - `llama-batched-bench` -main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.398 | 1285.90 | 0.530 | 60.41 | 0.928 | 586.27 | -| 512 | 32 | 2 | 1088 | 0.386 | 2651.65 | 0.948 | 67.50 | 1.334 | 815.38 | -| 512 | 32 | 4 | 2176 | 0.666 | 3076.37 | 1.209 | 105.87 | 1.875 | 1160.71 | -| 512 | 32 | 8 | 4352 | 1.325 | 3091.39 | 1.610 | 158.98 | 2.935 | 1482.65 | -| 512 | 32 | 16 | 8704 | 2.664 | 3075.58 | 2.150 | 238.19 | 4.813 | 1808.39 | -| 512 | 32 | 32 | 17408 | 5.336 | 3070.31 | 2.904 | 352.59 | 8.240 | 2112.50 | -| 4096 | 32 | 1 | 4128 | 1.444 | 2836.81 | 0.581 | 55.09 | 2.025 | 2038.81 | -| 4096 | 32 | 2 | 8256 | 2.872 | 2852.14 | 1.084 | 59.06 | 3.956 | 2086.99 | -| 4096 | 32 | 4 | 16512 | 5.744 | 2852.32 | 1.440 | 88.90 | 7.184 | 2298.47 | -| 4096 | 32 | 8 | 33024 | 11.463 | 2858.68 | 2.068 | 123.78 | 13.531 | 2440.65 | -| 4096 | 32 | 16 | 66048 | 22.915 | 2859.95 | 3.018 | 169.67 | 25.933 | 2546.90 | -| 4096 | 32 | 32 | 132096 | 45.956 | 2852.10 | 4.609 | 222.18 | 50.565 | 2612.39 | -| 8192 | 32 | 1 | 8224 | 3.063 | 2674.72 | 0.693 | 46.20 | 3.755 | 2189.92 | -| 8192 | 32 | 2 | 16448 | 6.109 | 2681.87 | 1.214 | 52.71 | 7.323 | 2245.98 | -| 8192 | 32 | 4 | 32896 | 12.197 | 2686.63 | 1.682 | 76.11 | 13.878 | 2370.30 | -| 8192 | 32 | 8 | 65792 | 24.409 | 2684.94 | 2.556 | 100.17 | 26.965 | 2439.95 | -| 8192 | 32 | 16 | 131584 | 48.753 | 2688.50 | 3.994 | 128.20 | 52.747 | 2494.64 | -| 8192 | 32 | 32 | 263168 | 97.508 | 2688.42 | 6.528 | 156.86 | 104.037 | 2529.57 | +| 512 | 32 | 1 | 544 | 0.393 | 1303.73 | 0.548 | 58.36 | 0.941 | 578.10 | +| 512 | 32 | 2 | 1088 | 0.387 | 2648.68 | 0.910 | 70.35 | 1.296 | 839.27 | +| 512 | 32 | 4 | 2176 | 0.659 | 3107.63 | 1.302 | 98.33 | 1.961 | 1109.77 | +| 512 | 32 | 8 | 4352 | 1.322 | 3099.35 | 1.669 | 153.42 | 2.990 | 1455.43 | +| 512 | 32 | 16 | 8704 | 2.639 | 3104.63 | 2.212 | 231.44 | 4.851 | 1794.32 | +| 512 | 32 | 32 | 17408 | 5.284 | 3100.80 | 2.955 | 346.53 | 8.239 | 2112.93 | +| 4096 | 32 | 1 | 4128 | 1.417 | 2890.36 | 0.598 | 53.51 | 2.015 | 2048.45 | +| 4096 | 32 | 2 | 8256 | 2.829 | 2895.62 | 1.019 | 62.82 | 3.848 | 2145.60 | +| 4096 | 32 | 4 | 16512 | 5.656 | 2896.96 | 1.528 | 83.79 | 7.183 | 2298.71 | +| 4096 | 32 | 8 | 33024 | 11.338 | 2890.02 | 2.127 | 120.36 | 13.465 | 2452.53 | +| 4096 | 32 | 16 | 66048 | 22.709 | 2885.96 | 3.104 | 164.97 | 25.812 | 2558.79 | +| 4096 | 32 | 32 | 132096 | 45.301 | 2893.35 | 4.723 | 216.80 | 50.024 | 2640.63 | +| 8192 | 32 | 1 | 8224 | 3.022 | 2711.09 | 0.678 | 47.20 | 3.700 | 2222.89 | +| 8192 | 32 | 2 | 16448 | 6.039 | 2713.01 | 1.149 | 55.70 | 7.188 | 2288.21 | +| 8192 | 32 | 4 | 32896 | 12.050 | 2719.35 | 1.785 | 71.69 | 13.835 | 2377.67 | +| 8192 | 32 | 8 | 65792 | 24.113 | 2717.90 | 2.629 | 97.39 | 26.741 | 2460.31 | +| 8192 | 32 | 16 | 131584 | 48.178 | 2720.58 | 4.099 | 124.91 | 52.277 | 2517.06 | +| 8192 | 32 | 32 | 263168 | 96.401 | 2719.31 | 6.696 | 152.93 | 103.097 | 2552.63 | - `llama-bench` -| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2925.55 ± 4.25 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 62.80 ± 0.27 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2531.01 ± 6.79 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 55.86 ± 0.33 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 2244.39 ± 5.33 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 45.95 ± 0.33 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1783.17 ± 3.68 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 39.07 ± 0.10 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1241.90 ± 3.13 | -| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 29.92 ± 0.06 | +| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2986.97 ± 18.87 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 61.06 ± 0.23 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2633.45 ± 6.26 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 54.77 ± 0.28 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2354.14 ± 3.84 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 48.02 ± 0.40 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1908.86 ± 4.25 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 40.23 ± 0.10 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1348.17 ± 2.00 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 30.21 ± 0.04 | -build: eeee367de (6989) +build: 11fb327bf (7941) ## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF @@ -173,46 +173,46 @@ Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF - `llama-batched-bench` -main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.211 | 2421.57 | 1.055 | 30.33 | 1.266 | 429.57 | -| 512 | 32 | 2 | 1088 | 0.419 | 2441.34 | 1.130 | 56.65 | 1.549 | 702.32 | -| 512 | 32 | 4 | 2176 | 0.873 | 2345.54 | 1.174 | 108.99 | 2.048 | 1062.74 | -| 512 | 32 | 8 | 4352 | 1.727 | 2371.85 | 1.254 | 204.22 | 2.980 | 1460.19 | -| 512 | 32 | 16 | 8704 | 3.452 | 2373.22 | 1.492 | 343.16 | 4.944 | 1760.56 | -| 512 | 32 | 32 | 17408 | 6.916 | 2368.93 | 1.675 | 611.51 | 8.591 | 2026.36 | -| 4096 | 32 | 1 | 4128 | 1.799 | 2277.26 | 1.084 | 29.51 | 2.883 | 1431.91 | -| 4096 | 32 | 2 | 8256 | 3.577 | 2290.01 | 1.196 | 53.50 | 4.774 | 1729.51 | -| 4096 | 32 | 4 | 16512 | 7.172 | 2284.36 | 1.313 | 97.50 | 8.485 | 1946.00 | -| 4096 | 32 | 8 | 33024 | 14.341 | 2284.96 | 1.520 | 168.46 | 15.860 | 2082.18 | -| 4096 | 32 | 16 | 66048 | 28.675 | 2285.44 | 1.983 | 258.21 | 30.658 | 2154.33 | -| 4096 | 32 | 32 | 132096 | 57.354 | 2285.32 | 2.640 | 387.87 | 59.994 | 2201.82 | -| 8192 | 32 | 1 | 8224 | 3.701 | 2213.75 | 1.119 | 28.59 | 4.820 | 1706.34 | -| 8192 | 32 | 2 | 16448 | 7.410 | 2211.19 | 1.272 | 50.31 | 8.682 | 1894.56 | -| 8192 | 32 | 4 | 32896 | 14.802 | 2213.83 | 1.460 | 87.68 | 16.261 | 2022.96 | -| 8192 | 32 | 8 | 65792 | 29.609 | 2213.35 | 1.781 | 143.74 | 31.390 | 2095.93 | -| 8192 | 32 | 16 | 131584 | 59.229 | 2212.96 | 2.495 | 205.17 | 61.725 | 2131.79 | -| 8192 | 32 | 32 | 263168 | 118.449 | 2213.15 | 3.714 | 275.75 | 122.162 | 2154.25 | +| 512 | 32 | 1 | 544 | 0.212 | 2420.12 | 1.100 | 29.10 | 1.311 | 414.85 | +| 512 | 32 | 2 | 1088 | 0.428 | 2393.89 | 1.185 | 54.00 | 1.613 | 674.56 | +| 512 | 32 | 4 | 2176 | 0.894 | 2290.41 | 1.229 | 104.17 | 2.123 | 1025.02 | +| 512 | 32 | 8 | 4352 | 1.758 | 2330.36 | 1.319 | 194.15 | 3.076 | 1414.70 | +| 512 | 32 | 16 | 8704 | 3.508 | 2335.21 | 1.543 | 331.90 | 5.051 | 1723.33 | +| 512 | 32 | 32 | 17408 | 7.035 | 2328.93 | 1.738 | 589.21 | 8.773 | 1984.29 | +| 4096 | 32 | 1 | 4128 | 1.831 | 2237.25 | 1.125 | 28.44 | 2.956 | 1396.42 | +| 4096 | 32 | 2 | 8256 | 3.642 | 2249.48 | 1.253 | 51.07 | 4.895 | 1686.64 | +| 4096 | 32 | 4 | 16512 | 7.274 | 2252.26 | 1.380 | 92.72 | 8.655 | 1907.81 | +| 4096 | 32 | 8 | 33024 | 14.576 | 2248.09 | 1.617 | 158.29 | 16.193 | 2039.37 | +| 4096 | 32 | 16 | 66048 | 29.138 | 2249.17 | 2.081 | 246.01 | 31.219 | 2115.63 | +| 4096 | 32 | 32 | 132096 | 58.275 | 2249.19 | 2.814 | 363.87 | 61.089 | 2162.34 | +| 8192 | 32 | 1 | 8224 | 3.757 | 2180.26 | 1.184 | 27.03 | 4.941 | 1664.37 | +| 8192 | 32 | 2 | 16448 | 7.522 | 2178.05 | 1.341 | 47.73 | 8.863 | 1855.77 | +| 8192 | 32 | 4 | 32896 | 15.043 | 2178.25 | 1.548 | 82.69 | 16.591 | 1982.74 | +| 8192 | 32 | 8 | 65792 | 30.111 | 2176.49 | 1.937 | 132.13 | 32.048 | 2052.90 | +| 8192 | 32 | 16 | 131584 | 60.405 | 2169.90 | 2.706 | 189.21 | 63.111 | 2084.97 | +| 8192 | 32 | 32 | 263168 | 120.439 | 2176.58 | 3.993 | 256.46 | 124.432 | 2114.96 | - `llama-bench` -| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 2272.74 ± 4.68 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 30.66 ± 0.02 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 2107.80 ± 9.55 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 29.71 ± 0.05 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 1937.80 ± 6.75 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 28.86 ± 0.04 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 1641.12 ± 1.78 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 27.24 ± 0.04 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 1296.02 ± 2.67 | -| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 23.78 ± 0.03 | +| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 2250.28 ± 6.41 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 29.43 ± 0.02 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 2100.19 ± 8.96 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 28.61 ± 0.02 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 2007.56 ± 4.16 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 27.38 ± 0.09 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 1779.11 ± 6.42 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 25.72 ± 0.03 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 1471.23 ± 1.71 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 22.51 ± 0.02 | -build: eeee367de (6989) +build: 11fb327bf (7941) ## ggml-org/gemma-3-4b-it-qat-GGUF @@ -221,44 +221,91 @@ Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF - `llama-batched-bench` -main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 | PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | |-------|--------|------|--------|----------|----------|----------|----------|----------|----------| -| 512 | 32 | 1 | 544 | 0.094 | 5434.73 | 0.394 | 81.21 | 0.488 | 1114.15 | -| 512 | 32 | 2 | 1088 | 0.168 | 6091.68 | 0.498 | 128.52 | 0.666 | 1633.41 | -| 512 | 32 | 4 | 2176 | 0.341 | 6010.68 | 0.542 | 236.37 | 0.882 | 2466.43 | -| 512 | 32 | 8 | 4352 | 0.665 | 6161.46 | 0.678 | 377.74 | 1.342 | 3241.72 | -| 512 | 32 | 16 | 8704 | 1.323 | 6193.19 | 0.902 | 567.41 | 2.225 | 3911.74 | -| 512 | 32 | 32 | 17408 | 2.642 | 6202.03 | 1.231 | 832.03 | 3.872 | 4495.36 | -| 4096 | 32 | 1 | 4128 | 0.701 | 5840.49 | 0.439 | 72.95 | 1.140 | 3621.23 | -| 4096 | 32 | 2 | 8256 | 1.387 | 5906.82 | 0.574 | 111.48 | 1.961 | 4210.12 | -| 4096 | 32 | 4 | 16512 | 2.758 | 5940.33 | 0.651 | 196.58 | 3.409 | 4843.33 | -| 4096 | 32 | 8 | 33024 | 5.491 | 5967.56 | 0.876 | 292.40 | 6.367 | 5187.12 | -| 4096 | 32 | 16 | 66048 | 10.978 | 5969.58 | 1.275 | 401.69 | 12.253 | 5390.38 | -| 4096 | 32 | 32 | 132096 | 21.944 | 5972.93 | 1.992 | 514.16 | 23.936 | 5518.73 | -| 8192 | 32 | 1 | 8224 | 1.402 | 5841.91 | 0.452 | 70.73 | 1.855 | 4434.12 | -| 8192 | 32 | 2 | 16448 | 2.793 | 5865.34 | 0.637 | 100.55 | 3.430 | 4795.51 | -| 8192 | 32 | 4 | 32896 | 5.564 | 5889.64 | 0.770 | 166.26 | 6.334 | 5193.95 | -| 8192 | 32 | 8 | 65792 | 11.114 | 5896.44 | 1.122 | 228.07 | 12.237 | 5376.51 | -| 8192 | 32 | 16 | 131584 | 22.210 | 5901.38 | 1.789 | 286.15 | 24.000 | 5482.74 | -| 8192 | 32 | 32 | 263168 | 44.382 | 5906.56 | 3.044 | 336.38 | 47.426 | 5549.02 | +| 512 | 32 | 1 | 544 | 0.092 | 5566.97 | 0.412 | 77.63 | 0.504 | 1078.95 | +| 512 | 32 | 2 | 1088 | 0.161 | 6345.67 | 0.522 | 122.70 | 0.683 | 1593.06 | +| 512 | 32 | 4 | 2176 | 0.325 | 6309.87 | 0.562 | 227.68 | 0.887 | 2453.87 | +| 512 | 32 | 8 | 4352 | 0.643 | 6374.42 | 0.685 | 373.67 | 1.328 | 3277.94 | +| 512 | 32 | 16 | 8704 | 1.277 | 6413.64 | 0.915 | 559.47 | 2.192 | 3970.01 | +| 512 | 32 | 32 | 17408 | 2.518 | 6506.57 | 1.249 | 819.61 | 3.767 | 4620.64 | +| 4096 | 32 | 1 | 4128 | 0.674 | 6079.68 | 0.453 | 70.60 | 1.127 | 3662.88 | +| 4096 | 32 | 2 | 8256 | 1.335 | 6137.82 | 0.627 | 102.03 | 1.962 | 4208.11 | +| 4096 | 32 | 4 | 16512 | 2.657 | 6167.35 | 0.749 | 170.92 | 3.405 | 4848.71 | +| 4096 | 32 | 8 | 33024 | 5.307 | 6173.91 | 0.974 | 262.89 | 6.281 | 5257.53 | +| 4096 | 32 | 16 | 66048 | 10.610 | 6176.96 | 1.379 | 371.42 | 11.988 | 5509.40 | +| 4096 | 32 | 32 | 132096 | 21.213 | 6178.89 | 2.122 | 482.50 | 23.335 | 5660.82 | +| 8192 | 32 | 1 | 8224 | 1.359 | 6027.34 | 0.467 | 68.52 | 1.826 | 4503.48 | +| 8192 | 32 | 2 | 16448 | 2.699 | 6069.68 | 0.653 | 98.03 | 3.352 | 4906.68 | +| 8192 | 32 | 4 | 32896 | 5.366 | 6106.74 | 0.818 | 156.55 | 6.184 | 5319.96 | +| 8192 | 32 | 8 | 65792 | 10.755 | 6093.50 | 1.174 | 218.04 | 11.929 | 5515.22 | +| 8192 | 32 | 16 | 131584 | 21.484 | 6100.82 | 1.829 | 279.90 | 23.314 | 5644.11 | +| 8192 | 32 | 32 | 263168 | 42.950 | 6103.40 | 3.058 | 334.91 | 46.008 | 5720.05 | - `llama-bench` -| model | size | params | backend | ngl | n_ubatch | fa | mmap | test | t/s | -| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 | 5810.04 ± 21.71 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 | 84.54 ± 0.18 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d4096 | 5288.04 ± 3.54 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d4096 | 78.82 ± 1.37 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d8192 | 4960.43 ± 16.64 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d8192 | 74.13 ± 0.30 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d16384 | 4495.92 ± 31.11 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d16384 | 72.37 ± 0.29 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | pp2048 @ d32768 | 3746.90 ± 40.01 | -| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | tg32 @ d32768 | 63.02 ± 0.20 | +| model | size | params | backend | ngl | n_ubatch | fa | mmap | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --: | --------------: | -------------------: | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 | 5948.74 ± 10.61 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 | 81.05 ± 0.20 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d4096 | 5652.69 ± 34.29 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d4096 | 76.37 ± 0.58 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d8192 | 5509.57 ± 40.69 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d8192 | 71.61 ± 0.80 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d16384 | 5340.86 ± 36.92 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d16384 | 70.89 ± 0.34 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | pp2048 @ d32768 | 5023.30 ± 13.52 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | CUDA | 99 | 2048 | 1 | 0 | 1 | tg32 @ d32768 | 62.28 ± 0.30 | -build: eeee367de (6989) +build: 11fb327bf (7941) +## ggml-org/GLM-4.7-Flash-GGUF + +Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.433 | 1181.83 | 0.693 | 46.16 | 1.126 | 482.94 | +| 512 | 32 | 2 | 1088 | 0.439 | 2334.46 | 1.034 | 61.89 | 1.473 | 738.75 | +| 512 | 32 | 4 | 2176 | 0.772 | 2654.46 | 1.459 | 87.76 | 2.230 | 975.77 | +| 512 | 32 | 8 | 4352 | 1.541 | 2658.78 | 2.043 | 125.31 | 3.583 | 1214.47 | +| 512 | 32 | 16 | 8704 | 3.083 | 2656.91 | 2.675 | 191.42 | 5.758 | 1511.62 | +| 512 | 32 | 32 | 17408 | 6.159 | 2660.12 | 3.615 | 283.24 | 9.774 | 1780.98 | +| 4096 | 32 | 1 | 4128 | 1.915 | 2139.30 | 0.725 | 44.14 | 2.640 | 1563.83 | +| 4096 | 32 | 2 | 8256 | 3.834 | 2136.40 | 1.119 | 57.21 | 4.953 | 1666.81 | +| 4096 | 32 | 4 | 16512 | 7.636 | 2145.72 | 1.631 | 78.49 | 9.266 | 1781.93 | +| 4096 | 32 | 8 | 33024 | 15.295 | 2142.40 | 2.344 | 109.21 | 17.639 | 1872.20 | +| 4096 | 32 | 16 | 66048 | 30.573 | 2143.62 | 3.773 | 135.70 | 34.346 | 1923.04 | +| 4096 | 32 | 32 | 132096 | 61.282 | 2138.82 | 5.795 | 176.71 | 67.077 | 1969.31 | +| 8192 | 32 | 1 | 8224 | 4.510 | 1816.24 | 0.760 | 42.11 | 5.270 | 1560.44 | +| 8192 | 32 | 2 | 16448 | 9.036 | 1813.19 | 1.206 | 53.06 | 10.242 | 1605.91 | +| 8192 | 32 | 4 | 32896 | 18.070 | 1813.43 | 1.783 | 71.80 | 19.852 | 1657.03 | +| 8192 | 32 | 8 | 65792 | 36.125 | 1814.15 | 2.635 | 97.14 | 38.760 | 1697.41 | +| 8192 | 32 | 16 | 131584 | 72.367 | 1811.20 | 4.954 | 103.34 | 77.322 | 1701.77 | +| 8192 | 32 | 32 | 263168 | 144.501 | 1814.13 | 8.103 | 126.37 | 152.604 | 1724.51 | + + +- `llama-bench` + +| model | size | params | backend | ngl | n_ubatch | fa | dio | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | --: | --------------: | -------------------: | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 | 2364.18 ± 11.43 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 | 48.68 ± 0.12 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d4096 | 1684.13 ± 1.24 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d4096 | 44.62 ± 0.22 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d8192 | 1314.68 ± 1.41 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d8192 | 42.59 ± 0.11 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d16384 | 914.05 ± 3.32 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d16384 | 38.72 ± 0.13 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | pp2048 @ d32768 | 567.20 ± 0.90 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | CUDA | 99 | 2048 | 1 | 1 | tg32 @ d32768 | 32.65 ± 0.09 | + +build: 11fb327bf (7941) diff --git a/benches/mac-m2-ultra/mac-m2-ultra.md b/benches/mac-m2-ultra/mac-m2-ultra.md new file mode 100644 index 0000000000..cf8a953388 --- /dev/null +++ b/benches/mac-m2-ultra/mac-m2-ultra.md @@ -0,0 +1,298 @@ +## System info + +```bash +uname -a +Darwin gg-studio 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:07:05 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6020 arm64 + +g++ --version +Apple clang version 17.0.0 (clang-1700.3.19.1) +Target: arm64-apple-darwin25.2.0 +``` + +## ggml-org/gpt-oss-20b-GGUF + +Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.215 | 2381.35 | 0.245 | 130.45 | 0.460 | 1181.81 | +| 512 | 32 | 2 | 1088 | 0.379 | 2701.43 | 0.382 | 167.56 | 0.761 | 1429.67 | +| 512 | 32 | 4 | 2176 | 0.721 | 2839.27 | 0.604 | 211.76 | 1.326 | 1641.32 | +| 512 | 32 | 8 | 4352 | 1.433 | 2858.30 | 1.033 | 247.75 | 2.466 | 1764.57 | +| 512 | 32 | 16 | 8704 | 2.853 | 2871.12 | 1.570 | 326.11 | 4.423 | 1967.77 | +| 512 | 32 | 32 | 17408 | 5.699 | 2874.95 | 1.910 | 536.15 | 7.609 | 2287.88 | +| 4096 | 32 | 1 | 4128 | 1.552 | 2638.56 | 0.334 | 95.72 | 1.887 | 2188.00 | +| 4096 | 32 | 2 | 8256 | 3.084 | 2655.88 | 0.404 | 158.54 | 3.488 | 2366.86 | +| 4096 | 32 | 4 | 16512 | 6.151 | 2663.78 | 0.652 | 196.39 | 6.802 | 2427.37 | +| 4096 | 32 | 8 | 33024 | 12.288 | 2666.77 | 1.135 | 225.47 | 13.423 | 2460.27 | +| 4096 | 32 | 16 | 66048 | 24.563 | 2668.12 | 1.762 | 290.55 | 26.325 | 2508.97 | +| 4096 | 32 | 32 | 132096 | 49.114 | 2668.73 | 2.398 | 426.94 | 51.512 | 2564.35 | +| 8192 | 32 | 1 | 8224 | 3.345 | 2448.78 | 0.275 | 116.46 | 3.620 | 2271.76 | +| 8192 | 32 | 2 | 16448 | 6.665 | 2458.11 | 0.425 | 150.71 | 7.090 | 2319.91 | +| 8192 | 32 | 4 | 32896 | 13.315 | 2460.92 | 0.691 | 185.21 | 14.006 | 2348.63 | +| 8192 | 32 | 8 | 65792 | 26.611 | 2462.73 | 1.212 | 211.16 | 27.823 | 2364.62 | +| 8192 | 32 | 16 | 131584 | 53.232 | 2462.27 | 1.919 | 266.83 | 55.151 | 2385.88 | +| 8192 | 32 | 32 | 263168 | 110.455 | 2373.30 | 2.752 | 372.03 | 113.208 | 2324.64 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2713.40 ± 3.56 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 129.97 ± 3.90 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2324.59 ± 3.01 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 123.38 ± 0.17 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1989.82 ± 30.11 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 117.39 ± 0.33 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1556.54 ± 6.22 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 109.75 ± 0.42 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 1122.63 ± 1.45 | +| gpt-oss 20B MXFP4 MoE | 11.27 GiB | 20.91 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 98.25 ± 0.08 | + +build: b828e18c7 (7948) + +## ggml-org/gpt-oss-120b-GGUF + +Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.426 | 1200.92 | 0.361 | 88.56 | 0.788 | 690.64 | +| 512 | 32 | 2 | 1088 | 0.683 | 1500.14 | 0.545 | 117.35 | 1.228 | 886.02 | +| 512 | 32 | 4 | 2176 | 1.204 | 1701.56 | 0.847 | 151.19 | 2.050 | 1061.34 | +| 512 | 32 | 8 | 4352 | 2.402 | 1705.20 | 1.455 | 176.00 | 3.857 | 1128.45 | +| 512 | 32 | 16 | 8704 | 4.802 | 1705.90 | 2.349 | 217.93 | 7.152 | 1217.08 | +| 512 | 32 | 32 | 17408 | 9.593 | 1707.85 | 3.665 | 279.42 | 13.258 | 1313.01 | +| 4096 | 32 | 1 | 4128 | 2.581 | 1587.08 | 0.390 | 82.12 | 2.970 | 1389.67 | +| 4096 | 32 | 2 | 8256 | 5.124 | 1598.79 | 0.589 | 108.62 | 5.713 | 1445.10 | +| 4096 | 32 | 4 | 16512 | 10.231 | 1601.47 | 0.928 | 137.98 | 11.158 | 1479.80 | +| 4096 | 32 | 8 | 33024 | 20.468 | 1600.94 | 1.606 | 159.38 | 22.074 | 1496.04 | +| 4096 | 32 | 16 | 66048 | 40.924 | 1601.42 | 2.639 | 193.99 | 43.563 | 1516.15 | +| 4096 | 32 | 32 | 132096 | 81.819 | 1601.98 | 4.466 | 229.29 | 86.284 | 1530.94 | +| 8192 | 32 | 1 | 8224 | 5.517 | 1484.74 | 0.409 | 78.16 | 5.927 | 1387.58 | +| 8192 | 32 | 2 | 16448 | 11.008 | 1488.43 | 0.622 | 102.92 | 11.629 | 1414.34 | +| 8192 | 32 | 4 | 32896 | 22.002 | 1489.29 | 0.987 | 129.66 | 22.990 | 1430.90 | +| 8192 | 32 | 8 | 65792 | 46.051 | 1423.11 | 1.858 | 137.79 | 47.909 | 1373.27 | +| 8192 | 32 | 16 | 131584 | 97.680 | 1341.85 | 2.872 | 178.28 | 100.552 | 1308.62 | +| 8192 | 32 | 32 | 263168 | 176.407 | 1486.02 | 5.048 | 202.85 | 181.455 | 1450.32 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1648.69 ± 1.80 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 85.60 ± 0.52 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1429.86 ± 1.01 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 82.03 ± 0.12 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1257.90 ± 1.81 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 78.23 ± 0.33 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 1013.49 ± 0.70 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 73.20 ± 0.28 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 721.11 ± 0.58 | +| gpt-oss 120B MXFP4 MoE | 59.02 GiB | 116.83 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 65.52 ± 0.10 | + +build: b828e18c7 (7948) + +## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF + +Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.243 | 2109.23 | 0.419 | 76.34 | 0.662 | 821.84 | +| 512 | 32 | 2 | 1088 | 0.406 | 2521.40 | 0.575 | 111.36 | 0.981 | 1109.27 | +| 512 | 32 | 4 | 2176 | 0.744 | 2751.65 | 0.841 | 152.22 | 1.585 | 1372.71 | +| 512 | 32 | 8 | 4352 | 1.479 | 2770.20 | 1.330 | 192.48 | 2.809 | 1549.53 | +| 512 | 32 | 16 | 8704 | 2.951 | 2776.20 | 2.572 | 199.05 | 5.523 | 1575.93 | +| 512 | 32 | 32 | 17408 | 5.899 | 2777.64 | 2.603 | 393.34 | 8.502 | 2047.54 | +| 4096 | 32 | 1 | 4128 | 1.901 | 2154.15 | 0.474 | 67.58 | 2.375 | 1738.14 | +| 4096 | 32 | 2 | 8256 | 3.788 | 2162.89 | 0.652 | 98.17 | 4.439 | 1859.69 | +| 4096 | 32 | 4 | 16512 | 7.564 | 2166.18 | 0.990 | 129.24 | 8.554 | 1930.34 | +| 4096 | 32 | 8 | 33024 | 15.121 | 2166.98 | 1.632 | 156.82 | 16.754 | 1971.12 | +| 4096 | 32 | 16 | 66048 | 30.241 | 2167.09 | 3.166 | 161.72 | 33.407 | 1977.04 | +| 4096 | 32 | 32 | 132096 | 60.474 | 2167.42 | 3.780 | 270.93 | 64.254 | 2055.86 | +| 8192 | 32 | 1 | 8224 | 4.733 | 1730.92 | 0.483 | 66.29 | 5.215 | 1576.85 | +| 8192 | 32 | 2 | 16448 | 9.459 | 1732.09 | 0.722 | 88.58 | 10.182 | 1615.46 | +| 8192 | 32 | 4 | 32896 | 18.912 | 1732.65 | 1.120 | 114.26 | 20.032 | 1642.14 | +| 8192 | 32 | 8 | 65792 | 37.797 | 1733.91 | 1.873 | 136.67 | 39.670 | 1658.49 | +| 8192 | 32 | 16 | 131584 | 84.133 | 1557.92 | 3.718 | 137.72 | 87.850 | 1497.82 | +| 8192 | 32 | 32 | 263168 | 157.550 | 1663.88 | 4.854 | 210.98 | 162.403 | 1620.46 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2453.11 ± 1.70 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 78.97 ± 0.46 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1569.46 ± 1.97 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 71.18 ± 0.37 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1145.51 ± 1.16 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 65.11 ± 0.36 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 741.04 ± 0.74 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 56.87 ± 0.14 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 431.31 ± 0.31 | +| qwen3moe 30B.A3B Q8_0 | 30.25 GiB | 30.53 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 45.26 ± 0.11 | + +build: b828e18c7 (7948) + +## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF + +Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.339 | 1509.22 | 0.409 | 78.17 | 0.749 | 726.67 | +| 512 | 32 | 2 | 1088 | 0.646 | 1584.93 | 0.483 | 132.45 | 1.129 | 963.45 | +| 512 | 32 | 4 | 2176 | 1.258 | 1627.50 | 0.585 | 218.67 | 1.844 | 1180.21 | +| 512 | 32 | 8 | 4352 | 2.506 | 1634.41 | 1.005 | 254.83 | 3.511 | 1239.64 | +| 512 | 32 | 16 | 8704 | 5.007 | 1635.99 | 1.595 | 321.07 | 6.602 | 1318.38 | +| 512 | 32 | 32 | 17408 | 10.007 | 1637.19 | 1.676 | 611.12 | 11.683 | 1490.03 | +| 4096 | 32 | 1 | 4128 | 2.730 | 1500.46 | 0.431 | 74.31 | 3.160 | 1306.12 | +| 4096 | 32 | 2 | 8256 | 5.446 | 1504.33 | 0.524 | 122.04 | 5.970 | 1382.91 | +| 4096 | 32 | 4 | 16512 | 10.875 | 1506.59 | 0.662 | 193.45 | 11.537 | 1431.28 | +| 4096 | 32 | 8 | 33024 | 21.749 | 1506.61 | 1.158 | 221.11 | 22.907 | 1441.64 | +| 4096 | 32 | 16 | 66048 | 43.477 | 1507.36 | 1.901 | 269.32 | 45.378 | 1455.49 | +| 4096 | 32 | 32 | 132096 | 86.954 | 1507.37 | 2.325 | 440.42 | 89.279 | 1479.59 | +| 8192 | 32 | 1 | 8224 | 5.940 | 1379.21 | 0.449 | 71.20 | 6.389 | 1287.20 | +| 8192 | 32 | 2 | 16448 | 11.865 | 1380.84 | 0.559 | 114.59 | 12.424 | 1323.92 | +| 8192 | 32 | 4 | 32896 | 23.723 | 1381.25 | 0.728 | 175.80 | 24.452 | 1345.35 | +| 8192 | 32 | 8 | 65792 | 47.434 | 1381.63 | 1.279 | 200.09 | 48.713 | 1350.60 | +| 8192 | 32 | 16 | 131584 | 94.864 | 1381.69 | 2.198 | 232.97 | 97.061 | 1355.68 | +| 8192 | 32 | 32 | 263168 | 189.743 | 1381.57 | 3.052 | 335.50 | 192.795 | 1365.01 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1565.91 ± 0.86 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 79.68 ± 0.39 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 1317.41 ± 1.02 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 74.70 ± 0.04 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 1134.65 ± 0.76 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 71.31 ± 0.12 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 886.46 ± 0.78 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 65.93 ± 0.06 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 612.21 ± 0.30 | +| qwen2 7B Q8_0 | 7.54 GiB | 7.62 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 56.83 ± 0.02 | + +build: b828e18c7 (7948) + +## ggml-org/gemma-3-4b-it-qat-GGUF + +Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.186 | 2748.06 | 0.235 | 136.28 | 0.421 | 1291.78 | +| 512 | 32 | 2 | 1088 | 0.342 | 2990.95 | 0.312 | 204.99 | 0.655 | 1662.15 | +| 512 | 32 | 4 | 2176 | 0.662 | 3092.69 | 0.404 | 316.97 | 1.066 | 2041.21 | +| 512 | 32 | 8 | 4352 | 1.317 | 3110.41 | 0.579 | 441.80 | 1.896 | 2294.97 | +| 512 | 32 | 16 | 8704 | 2.625 | 3120.23 | 1.207 | 424.08 | 3.833 | 2270.93 | +| 512 | 32 | 32 | 17408 | 5.242 | 3125.34 | 1.299 | 788.23 | 6.541 | 2661.19 | +| 4096 | 32 | 1 | 4128 | 1.408 | 2909.90 | 0.296 | 108.07 | 1.704 | 2422.95 | +| 4096 | 32 | 2 | 8256 | 2.793 | 2933.40 | 0.325 | 197.00 | 3.118 | 2648.25 | +| 4096 | 32 | 4 | 16512 | 5.567 | 2943.22 | 0.440 | 291.07 | 6.006 | 2749.05 | +| 4096 | 32 | 8 | 33024 | 11.114 | 2948.23 | 0.640 | 400.26 | 11.754 | 2809.59 | +| 4096 | 32 | 16 | 66048 | 22.217 | 2949.76 | 1.327 | 385.83 | 23.544 | 2805.26 | +| 4096 | 32 | 32 | 132096 | 44.420 | 2950.77 | 1.553 | 659.30 | 45.973 | 2873.36 | +| 8192 | 32 | 1 | 8224 | 2.860 | 2864.58 | 0.250 | 127.90 | 3.110 | 2644.42 | +| 8192 | 32 | 2 | 16448 | 5.702 | 2873.63 | 0.335 | 191.07 | 6.036 | 2724.77 | +| 8192 | 32 | 4 | 32896 | 11.383 | 2878.69 | 0.456 | 280.72 | 11.839 | 2778.63 | +| 8192 | 32 | 8 | 65792 | 22.750 | 2880.75 | 0.671 | 381.48 | 23.421 | 2809.14 | +| 8192 | 32 | 16 | 131584 | 45.484 | 2881.74 | 1.406 | 364.04 | 46.890 | 2806.22 | +| 8192 | 32 | 32 | 263168 | 90.956 | 2882.10 | 1.793 | 570.98 | 92.749 | 2837.41 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 2923.59 ± 3.10 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 134.28 ± 1.29 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 2748.21 ± 3.05 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 133.11 ± 0.08 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 2641.45 ± 2.31 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 125.85 ± 0.35 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 2446.20 ± 2.94 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 125.00 ± 0.12 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 2129.18 ± 7.43 | +| gemma3 4B Q4_0 | 2.35 GiB | 3.88 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 113.14 ± 0.10 | + +build: b828e18c7 (7948) + +## ggml-org/GLM-4.7-Flash-GGUF + +Model: https://huggingface.co/ggml-org/GLM-4.7-Flash-GGUF + +- `llama-batched-bench` + + +main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = -1, n_threads = 16, n_threads_batch = 16 + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 512 | 32 | 1 | 544 | 0.326 | 1568.69 | 0.522 | 61.28 | 0.849 | 641.09 | +| 512 | 32 | 2 | 1088 | 0.528 | 1939.42 | 0.744 | 86.07 | 1.272 | 855.63 | +| 512 | 32 | 4 | 2176 | 0.968 | 2114.85 | 1.105 | 115.85 | 2.073 | 1049.56 | +| 512 | 32 | 8 | 4352 | 1.928 | 2124.62 | 1.684 | 151.99 | 3.612 | 1204.82 | +| 512 | 32 | 16 | 8704 | 3.844 | 2131.34 | 3.141 | 162.99 | 6.985 | 1246.11 | +| 512 | 32 | 32 | 17408 | 7.683 | 2132.38 | 3.924 | 260.95 | 11.608 | 1499.71 | +| 4096 | 32 | 1 | 4128 | 3.280 | 1248.75 | 0.723 | 44.29 | 4.003 | 1031.33 | +| 4096 | 32 | 2 | 8256 | 6.545 | 1251.63 | 0.930 | 68.85 | 7.475 | 1104.53 | +| 4096 | 32 | 4 | 16512 | 13.080 | 1252.64 | 1.454 | 88.03 | 14.534 | 1136.12 | +| 4096 | 32 | 8 | 33024 | 26.154 | 1252.90 | 2.388 | 107.20 | 28.542 | 1157.04 | +| 4096 | 32 | 16 | 66048 | 52.297 | 1253.14 | 4.724 | 108.37 | 57.022 | 1158.30 | +| 4096 | 32 | 32 | 132096 | 104.578 | 1253.34 | 7.266 | 140.93 | 111.844 | 1181.08 | +| 8192 | 32 | 1 | 8224 | 9.623 | 851.31 | 0.767 | 41.72 | 10.390 | 791.54 | +| 8192 | 32 | 2 | 16448 | 20.916 | 783.32 | 1.148 | 55.74 | 22.064 | 745.45 | +| 8192 | 32 | 4 | 32896 | 43.509 | 753.14 | 1.833 | 69.82 | 45.342 | 725.51 | +| 8192 | 32 | 8 | 65792 | 79.621 | 823.10 | 3.180 | 80.50 | 82.801 | 794.58 | +| 8192 | 32 | 16 | 131584 | 153.770 | 852.39 | 6.502 | 78.74 | 160.272 | 821.00 | +| 8192 | 32 | 32 | 263168 | 307.539 | 852.39 | 10.839 | 94.48 | 318.378 | 826.59 | + + +- `llama-bench` + +| model | size | params | backend | threads | n_ubatch | fa | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | ------: | -------: | -: | --------------: | -------------------: | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 | 1629.33 ± 0.27 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 | 59.58 ± 0.13 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d4096 | 732.67 ± 0.42 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d4096 | 47.44 ± 0.15 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d8192 | 474.33 ± 0.33 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d8192 | 40.20 ± 0.20 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d16384 | 277.46 ± 0.09 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d16384 | 31.50 ± 0.93 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | pp2048 @ d32768 | 151.44 ± 0.05 | +| deepseek2 30B.A3B Q8_0 | 29.65 GiB | 29.94 B | MTL,BLAS | 16 | 2048 | 1 | tg32 @ d32768 | 21.81 ± 0.01 | + +build: b828e18c7 (7948) diff --git a/scripts/bench-models.sh b/scripts/bench-models.sh old mode 100644 new mode 100755 index 744b0de359..c241013040 --- a/scripts/bench-models.sh +++ b/scripts/bench-models.sh @@ -7,47 +7,54 @@ ARGS_BB="-c 270336 -npp 512,4096,8192 -npl 1,2,4,8,16,32 -ntg 32" ARGS_B="-d 0,4096,8192,16384,32768 -p 2048 -n 32" QUICK=0 +DIO=0 while (( "$#" )); do - case "$1" in - --quick) QUICK=1; shift ;; - *) shift ;; - esac + case "$1" in + --quick) QUICK=1; shift ;; + --dio) DIO=1; shift ;; + *) shift ;; + esac done if (( QUICK )); then - ARGS_BB="-c 20480 -npp 512,4096 -npl 1,2,4 -ntg 32" - ARGS_B="-d 0 -p 2048 -n 32" + ARGS_BB="-c 20480 -npp 512,4096 -npl 1,2,4 -ntg 32" + ARGS_B="-d 0 -p 2048 -n 32" +fi + +if (( DIO )); then + ARGS_BB="${ARGS_BB} --no-mmap --direct-io" + ARGS_B="${ARGS_B} -mmp 0 -dio 1" fi run_model() { - local HFR=$1 - local HFF=$2 + local HFR=$1 + local HFF=$2 - printf "## ${HFR}\n" | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" - printf "Model: https://huggingface.co/${HFR}\n" | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" + printf "## ${HFR}\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" + printf "Model: https://huggingface.co/${HFR}\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" - printf -- "- \`llama-batched-bench\`\n" | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" + printf -- "- \`llama-batched-bench\`\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" - ./bin/llama-batched-bench \ - -hfr "${HFR}" -hff "${HFF}" \ - -m "${HFF}" -fa 1 -ub 2048 --no-mmap \ - ${ARGS_BB} | tee -a "$RESULTS" + ./bin/llama-batched-bench \ + -hfr "${HFR}" -hff "${HFF}" \ + -m "${HFF}" -fa 1 -ub 2048 \ + ${ARGS_BB} | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" - printf -- "- \`llama-bench\`\n" | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" + printf -- "- \`llama-bench\`\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" - ./bin/llama-bench \ - -m "${HFF}" -fa 1 -ub 2048 -mmp 0 \ - ${ARGS_B} | tee -a "$RESULTS" + ./bin/llama-bench \ + -m "${HFF}" -fa 1 -ub 2048 \ + ${ARGS_B} | tee -a "$RESULTS" - printf "\n" | tee -a "$RESULTS" + printf "\n" | tee -a "$RESULTS" - printf "\n" + printf "\n" } run_model "ggml-org/gpt-oss-20b-GGUF" "gpt-oss-20b-mxfp4.gguf" @@ -55,6 +62,7 @@ run_model "ggml-org/gpt-oss-120b-GGUF" "gpt-oss-120b-mxfp4- run_model "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF" "qwen3-coder-30b-a3b-instruct-q8_0.gguf" run_model "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF" "qwen2.5-coder-7b-q8_0.gguf" run_model "ggml-org/gemma-3-4b-it-qat-GGUF" "gemma-3-4b-it-qat-Q4_0.gguf" +run_model "ggml-org/GLM-4.7-Flash-GGUF" "GLM-4.7-Flash-Q8_0.gguf" if [[ -f models-extra.txt ]]; then while read -r HFR HFF; do From 449ec2ab0751fc713fe338da2ced153125b5c674 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Feb 2026 09:26:38 -0600 Subject: [PATCH 19/20] vulkan: Preprocess FA mask to detect all-neg-inf and all-zero. (#19281) Write out a 2-bit code per block and avoid loading the mask when it matches these two common cases. Apply this optimization when the mask is relatively large (i.e. prompt processing). --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 109 +++++++++++--- .../vulkan-shaders/flash_attn.comp | 39 ++--- .../vulkan-shaders/flash_attn_base.glsl | 6 + .../vulkan-shaders/flash_attn_cm1.comp | 112 +++++++------- .../vulkan-shaders/flash_attn_cm2.comp | 63 ++++---- .../vulkan-shaders/flash_attn_mask_opt.comp | 142 ++++++++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 + tests/test-backend-ops.cpp | 10 +- 8 files changed, 360 insertions(+), 123 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ff9cb7355c..4357da24d4 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -402,18 +402,19 @@ enum FaCodePath { }; struct vk_fa_pipeline_state { - vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc) - : HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc) {} + vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc, bool use_mask_opt) + : HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc), use_mask_opt(use_mask_opt) {} uint32_t HSK, HSV; bool small_rows, small_cache; FaCodePath path; bool aligned; bool f32acc; + bool use_mask_opt; bool operator<(const vk_fa_pipeline_state &b) const { - return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc) < - std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc); + return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt) < + std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc, b.use_mask_opt); } }; @@ -820,6 +821,8 @@ struct vk_device_struct { std::map pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT]; + std::map, vk_pipeline> pipeline_fa_mask_opt; + vk_pipeline pipeline_flash_attn_split_k_reduce; vk_pipeline pipeline_count_experts; @@ -1549,6 +1552,18 @@ struct vk_op_flash_attn_split_k_reduce_push_constants { uint32_t sinks; }; +struct vk_op_flash_attn_mask_opt_push_constants { + uint32_t nem0; + uint32_t nem1; + uint32_t nem2; + uint32_t nbm1; + uint32_t nbm2; + uint32_t nbm3; + uint32_t nbd1; + uint32_t nbd2; + uint32_t nbd3; +}; + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -1757,6 +1772,7 @@ class vk_perf_logger { " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " << " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " << " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")"; + *n_flops = 2ull * q->ne[1] * q->ne[2] * (k->ne[0] + v->ne[0]) * k->ne[1] * q->ne[3]; return name.str(); } if (node->op == GGML_OP_TOP_K) { @@ -3177,7 +3193,7 @@ static void ggml_vk_load_shaders(vk_device& device) { return {fa_rows_cols(path, hsk, hsv, clamp, type, small_rows, small_cache)[0], 1, 1}; }; - auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache) -> std::vector { + auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache, bool use_mask_opt) -> std::vector { // For large number of rows, 128 invocations seems to work best. // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we // can't use 256 for D==80. @@ -3209,7 +3225,7 @@ static void ggml_vk_load_shaders(vk_device& device) { // AMD prefers loading K directly from global memory const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0; - return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem}; + return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem, use_mask_opt}; }; #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ @@ -3221,18 +3237,19 @@ static void ggml_vk_load_shaders(vk_device& device) { FaCodePath path = fa.first.path; \ bool aligned = fa.first.aligned; \ bool f32acc = fa.first.f32acc; \ + bool use_mask_opt = fa.first.use_mask_opt; \ if (path == FAPATH) { \ if (aligned) { \ if (f32acc) { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } else { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } \ } else { \ if (f32acc) { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } else { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } \ } \ } \ @@ -4028,6 +4045,11 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); + for (auto &it : device->pipeline_fa_mask_opt) { + auto BrBc = it.first; + ggml_vk_create_pipeline(device, it.second, "fa_mask_opt", fa_mask_opt_len, fa_mask_opt_data, "main", 2, sizeof(vk_op_flash_attn_mask_opt_push_constants), {1, 1, 1}, {128, 128 / device->subgroup_size, BrBc.first, BrBc.second}, 1, true, true, device->subgroup_size); + } + if (device->subgroup_clustered && device->subgroup_require_full_support) { ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, sizeof(vk_quantize_q8_1_push_constants), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true); } else { @@ -8400,8 +8422,6 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t acctype = f32acc ? 4 : 2; const uint32_t f16vec4 = 8; - const uint32_t tmpsh = (Bc / MatBc) * sizeof(float); - const uint32_t qstride = hsk_pad / 4 + 2; const uint32_t Qf = Br * qstride * f16vec4; @@ -8418,7 +8438,7 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t slope = Br * acctype; - const uint32_t total_size = tmpsh + Qf + Psh + sfsh + ksh + slope; + const uint32_t total_size = Qf + Psh + sfsh + ksh + slope; const bool supported = total_size <= device->properties.limits.maxComputeSharedMemorySize; VK_LOG_DEBUG("ggml_vk_flash_attn_coopmat_shmem_support(HSK=" << hsk << ", HSV=" << hsv << ", f32acc=" << f32acc << ", kv_type=" << kv_type << ", total_size=" << total_size << ", supported=" << supported); @@ -8445,6 +8465,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + const uint32_t nem0 = mask ? mask->ne[0] : 0; const uint32_t nem1 = mask ? mask->ne[1] : 0; const uint32_t nem2 = mask ? mask->ne[2] : 0; const uint32_t nem3 = mask ? mask->ne[3] : 0; @@ -8574,7 +8595,10 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32; - vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc); + // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. + bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768; + + vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt); vk_pipeline pipeline = nullptr; @@ -8625,10 +8649,32 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_preallocate_buffers(ctx, subctx); } - { - // Request descriptor sets - if (split_k > 1) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + auto rows_cols = fa_rows_cols(path, HSK, HSV, !aligned, k->type, small_rows, small_cache); + const uint32_t Br = rows_cols[0]; + const uint32_t Bc = rows_cols[1]; + + const uint32_t mask_opt_num_dwords = CEIL_DIV(nem0, 16 * Bc); + const uint64_t mask_opt_size = sizeof(uint32_t) * mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2 * nem3; + + vk_pipeline pipeline_fa_mask_opt = nullptr; + if (use_mask_opt) { + std::lock_guard guard(ctx->device->mutex); + auto &pipelines = ctx->device->pipeline_fa_mask_opt; + auto it = pipelines.find({Br, Bc}); + if (it != pipelines.end()) { + pipeline_fa_mask_opt = it->second; + } else { + pipelines[{Br, Bc}] = pipeline_fa_mask_opt = std::make_shared(); + } + assert(pipeline_fa_mask_opt); + ggml_pipeline_request_descriptor_sets(ctx, pipeline_fa_mask_opt, 1); + + if (ctx->prealloc_size_y < mask_opt_size) { + ctx->prealloc_size_y = mask_opt_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); } } @@ -8655,9 +8701,30 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf; vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; + vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf; uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2; + if (use_mask_opt) + { + const vk_op_flash_attn_mask_opt_push_constants opt_pc = { + nem0, + nem1, + nem2, + (uint32_t)(mask->nb[1] / sizeof(ggml_fp16_t)), + (uint32_t)(mask->nb[2] / sizeof(ggml_fp16_t)), + (uint32_t)(mask->nb[3] / sizeof(ggml_fp16_t)), + mask_opt_num_dwords, + mask_opt_num_dwords * CEIL_DIV(nem1, Br), + mask_opt_num_dwords * CEIL_DIV(nem1, Br) * nem2, + }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline_fa_mask_opt, + { mask_buf, mask_opt_buf }, opt_pc, + { mask_opt_num_dwords, CEIL_DIV(nem1, Br), nem2 * nem3 }); + ggml_vk_sync_buffers(ctx, subctx); + } + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, @@ -8672,13 +8739,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx gqa_ratio, split_kv, split_k }; if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + if (ctx->prealloc_split_k_need_sync) { ggml_vk_sync_buffers(ctx, subctx); } workgroups_x *= pipeline->wg_denoms[0]; vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf}, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf, mask_opt_buf}, // We only use split_k when group query attention is enabled, which means // there's no more than one tile of rows (i.e. workgroups_x would have been // one). We reuse workgroups_x to mean the number of splits, so we need to @@ -8697,7 +8766,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx workgroups_x *= pipeline->wg_denoms[0]; } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf}, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf, mask_opt_buf}, pc, { workgroups_x, workgroups_y, workgroups_z }); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 3ce8d07be8..49a3c530cb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -94,6 +94,10 @@ void main() { } } + const uint32_t mo_stride = CEIL_DIV(KV, 16 * Bc); + // mo_offset will point to the tile starting at row i*Br and col 0 + uint32_t mo_offset = mo_stride * i; + #if BLOCK_SIZE > 1 uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE; uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE; @@ -104,15 +108,28 @@ void main() { uint32_t m_offset = gqa_iq1*KV; if (p.nem2 != 1 || p.nem3 != 1) { m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; + mo_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * CEIL_DIV(p.nem1, Br) * mo_stride; } + uint32_t mask_opt = 0; + uint32_t mask_opt_idx = ~0; + [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if (USE_MASK_OPT && mask_opt_idx != j / 16) { + mask_opt_idx = j / 16; + mask_opt = data_mask_opt[mo_offset + mask_opt_idx]; + } + uint32_t mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3; + if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) { + // skip this block + continue; + } + // Only load if the block is not all zeros + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) { bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; - float max_mask = NEG_FLT_MAX_OVER_2; [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) % Bc; uint32_t r = (idx + tid) / Bc; @@ -120,25 +137,12 @@ void main() { if ((!KV_bounds_check || j * Bc + c < KV) && (!nem1_bounds_check || i * Br + r < p.nem1)) { float m = float(data_m[m_offset + (i * Br + r) * m_stride + (j * Bc + c)]); masksh[c][r] = m; - max_mask = max(max_mask, m); } else { masksh[c][r] = float(0); } } } - // skip the block if the mask is entirely -inf - bool all_less = subgroupAll(max_mask <= NEG_FLT_MAX_OVER_2); barrier(); - if (gl_SubgroupInvocationID == 0) { - tmpsh[gl_SubgroupID] = all_less ? NEG_FLT_MAX_OVER_2 : 0.0f; - } - barrier(); - [[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) { - max_mask = max(max_mask, tmpsh[s]); - } - if (max_mask <= NEG_FLT_MAX_OVER_2) { - continue; - } } float Sf[Br][cols_per_thread]; @@ -185,7 +189,7 @@ void main() { } } - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) { [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { float mvf = masksh[c * cols_per_iter + col_tid][r]; @@ -256,9 +260,6 @@ void main() { barrier(); } - // prevent race on tmpsh - barrier(); - // reduce across threads [[unroll]] for (uint32_t r = 0; r < Br; ++r) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index 23a4d2c005..252451101a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -10,6 +10,7 @@ layout (constant_id = 5) const uint32_t Clamp = 0; layout (constant_id = 6) const uint32_t D_split = 16; layout (constant_id = 7) const uint32_t SubGroupSize = 32; layout (constant_id = 8) const uint32_t K_LOAD_SHMEM = 0; +layout (constant_id = 9) const bool USE_MASK_OPT = false; // Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths const uint32_t HSK_pad = (HSK + 15) & ~15; @@ -66,6 +67,11 @@ layout (binding = 4) readonly buffer S {float data_s[];}; layout (binding = 5) writeonly buffer O {D_TYPE data_o[];}; +layout (binding = 6) readonly buffer MO {uint32_t data_mask_opt[];}; + +#define MASK_OPT_ALL_NEG_INF 1 +#define MASK_OPT_ALL_ZERO 2 + #define BINDING_IDX_K 0 #define BINDING_IDX_V 1 #if defined(DATA_A_F32) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 83d52d19d6..89af3697e1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -42,8 +42,6 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } -shared float tmpsh[row_split]; - const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4 shared f16vec4 Qf[Br * qstride]; @@ -134,6 +132,10 @@ void main() { } } + const uint32_t mo_stride = CEIL_DIV(KV, 16 * Bc); + // mo_offset will point to the tile starting at row i*Br and col 0 + uint32_t mo_offset = mo_stride * i; + #if BLOCK_SIZE > 1 uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE; uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE; @@ -144,66 +146,74 @@ void main() { uint32_t m_offset = gqa_iq1*KV; if (p.nem2 != 1 || p.nem3 != 1) { m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV; + mo_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * CEIL_DIV(p.nem1, Br) * mo_stride; } + uint32_t mask_opt = 0; + uint32_t mask_opt_idx = ~0; + [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { f16vec4 mask_cache[Bc * Br / 4 / WorkGroupSize]; - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { - bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; + [[unroll]] for (uint32_t idx = 0; idx < mask_cache.length(); ++idx) { + mask_cache[idx] = f16vec4(0); + } - float max_mask = NEG_FLT_MAX_OVER_2; - [[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) { - uint32_t c = (idx + tid) / (Br / 4); - uint32_t r = (idx + tid) % (Br / 4); - if (idx + tid < Bc * Br / 4 || idx + gl_WorkGroupSize.x <= Bc * Br / 4) { - if ((!KV_bounds_check || j * Bc + c < KV)) { - f16vec4 m; - if (!nem1_bounds_check || i * Br + r * 4 + 3 < p.nem1) { - m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 3) * m_stride + (j * Bc + c)]); - max_mask = max(max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2])), float(m[3])); - } else if (i * Br + r * 4 + 2 < p.nem1) { - m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)], - 0.0); - max_mask = max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2])); - } else if (i * Br + r * 4 + 1 < p.nem1) { - m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], - data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], - 0.0, - 0.0); - max_mask = max(max(max_mask, float(m[0])), float(m[1])); - } else if (i * Br + r * 4 < p.nem1) { - m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], - 0.0, - 0.0, - 0.0); - max_mask = max(max_mask, float(m[0])); - } else { - m = f16vec4(0.0); + if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + + if (USE_MASK_OPT && mask_opt_idx != j / 16) { + mask_opt_idx = j / 16; + mask_opt = data_mask_opt[mo_offset + mask_opt_idx]; + } + uint32_t mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3; + if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) { + // skip this block + continue; + } + // Only load if the block is not all zeros + if (mask_opt_bits != MASK_OPT_ALL_ZERO) { + bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; + + float max_mask = NEG_FLT_MAX_OVER_2; + [[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) { + uint32_t c = (idx + tid) / (Br / 4); + uint32_t r = (idx + tid) % (Br / 4); + if (idx + tid < Bc * Br / 4 || idx + gl_WorkGroupSize.x <= Bc * Br / 4) { + if ((!KV_bounds_check || j * Bc + c < KV)) { + f16vec4 m; + if (!nem1_bounds_check || i * Br + r * 4 + 3 < p.nem1) { + m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 3) * m_stride + (j * Bc + c)]); + max_mask = max(max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2])), float(m[3])); + } else if (i * Br + r * 4 + 2 < p.nem1) { + m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 2) * m_stride + (j * Bc + c)], + 0.0); + max_mask = max(max(max(max_mask, float(m[0])), float(m[1])), float(m[2])); + } else if (i * Br + r * 4 + 1 < p.nem1) { + m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], + data_m[m_offset + (i * Br + r * 4 + 1) * m_stride + (j * Bc + c)], + 0.0, + 0.0); + max_mask = max(max(max_mask, float(m[0])), float(m[1])); + } else if (i * Br + r * 4 < p.nem1) { + m = f16vec4(data_m[m_offset + (i * Br + r * 4 ) * m_stride + (j * Bc + c)], + 0.0, + 0.0, + 0.0); + max_mask = max(max_mask, float(m[0])); + } else { + m = f16vec4(0.0); + } + mask_cache[idx / WorkGroupSize] = m; } - mask_cache[idx / WorkGroupSize] = m; } } } - // skip the block if the mask is entirely -inf - bool all_less = subgroupAll(max_mask <= NEG_FLT_MAX_OVER_2); - barrier(); - if (gl_SubgroupInvocationID == 0) { - tmpsh[gl_SubgroupID] = all_less ? NEG_FLT_MAX_OVER_2 : 0.0f; - } - barrier(); - [[unroll]] for (uint s = 0; s < gl_NumSubgroups; ++s) { - max_mask = max(max_mask, tmpsh[s]); - } - if (max_mask <= NEG_FLT_MAX_OVER_2) { - continue; - } } if (K_LOAD_SHMEM != 0) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 54f1b0b622..47b110621b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -138,48 +138,53 @@ void main() { coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2); } + const uint32_t mo_stride = CEIL_DIV(KV, 16 * Bc); + // mo_offset will point to the tile starting at row i*Br and col 0 + uint32_t mo_offset = mo_stride * i; + uint32_t m_offset = gqa_iq1*KV * 2 /*sizeof(float16_t)*/; if (p.nem2 != 1 || p.nem3 != 1) { m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV * 2 /*sizeof(float16_t)*/; + mo_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * CEIL_DIV(p.nem1, Br) * mo_stride; } + uint32_t mask_opt = 0; + uint32_t mask_opt_idx = ~0; + [[dont_unroll]] for (uint32_t j = start_j; j < end_j; ++j) { - coopmat mv; + coopmat mv = coopmat(0); if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { - bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; - if (nem1_bounds_check) { - tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); - tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); - tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); - tensorLayoutM = setTensorLayoutClampValueNV(tensorLayoutM, 0xfc00); // -inf in float16_t + if (USE_MASK_OPT && mask_opt_idx != j / 16) { + mask_opt_idx = j / 16; + mask_opt = data_mask_opt[mo_offset + mask_opt_idx]; + } + uint32_t mask_opt_bits = (mask_opt >> ((j % 16) * 2)) & 0x3; + if (mask_opt_bits == MASK_OPT_ALL_NEG_INF) { + // skip this block + continue; + } + // Only load if the block is not all zeros + if (mask_opt_bits != MASK_OPT_ALL_ZERO) { + bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; - coopmat mvmax; + if (nem1_bounds_check) { + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); + tensorLayoutM = setTensorLayoutClampValueNV(tensorLayoutM, 0xfc00); // -inf in float16_t - coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + } else { + tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp); + // Don't clamp against nem1 when GQA is enabled + uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1; + tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV); + tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); - // skip the block if the mask is entirely -inf - coopMatReduceNV(mvmax, mv, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduceFp16); - if (mvmax[0] <= NEG_FLT_MAX_OVER_2) { - continue; - } - } else { - tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp); - // Don't clamp against nem1 when GQA is enabled - uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1; - tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV); - tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1); - - coopmat mvmax; - - coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); - - // skip the block if the mask is entirely -inf - coopMatReduceNV(mvmax, mv, gl_CooperativeMatrixReduceRowAndColumnNV, maxReduceFp16); - if (mvmax[0] <= NEG_FLT_MAX_OVER_2) { - continue; + coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp new file mode 100644 index 0000000000..8c92c1adcd --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp @@ -0,0 +1,142 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable + +layout (constant_id = 0) const uint BLOCK_SIZE = 128; +layout (constant_id = 1) const uint NUM_SUBGROUPS = 4; +layout (constant_id = 2) const uint Br = 32; +layout (constant_id = 3) const uint Bc = 32; + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {float16_t data_a[];}; +layout (binding = 0) readonly buffer Av4 {f16vec4 data_av4[];}; +layout (binding = 1) writeonly buffer D {uint data_d[];}; + +layout (push_constant) uniform parameter { + uint nem0; + uint nem1; + uint nem2; + uint nbm1; + uint nbm2; + uint nbm3; + uint nbd1; + uint nbd2; + uint nbd3; +}; + +#define MASK_OPT_ALL_NEG_INF 1 +#define MASK_OPT_ALL_ZERO 2 + +shared float minsh[NUM_SUBGROUPS]; +shared float maxsh[NUM_SUBGROUPS]; + +// For each Br x Bc block of the mask (input) buffer, read all values and check +// if it's all -inf or all zero. Write out a two-bit code indicating which it is +// (or zero for neither). Each workgroup processes 16 tiles and writes out a +// 32-bit result mask. +// +// TODO: This is a lot of work per workgroup, might make sense to split this into +// more workgroups in the future. +void main() { + // Each workgroup handles a row + const uint tid = gl_LocalInvocationIndex; + const uint i0 = gl_WorkGroupID.x; + const uint i1 = gl_WorkGroupID.y; + const uint i2 = gl_WorkGroupID.z % nem2; + const uint i3 = gl_WorkGroupID.z / nem2; + + float FLT_MAX_OVER_2 = uintBitsToFloat(0x7EFFFFFF); + + uint result = 0; + + // Fast path for fully in-bounds blocks where we can do f16vec4 loads + if ((nem0 % Bc) == 0 && (nem1 % Br) == 0 && + ((Br * Bc) % (BLOCK_SIZE * 4)) == 0) { + [[unroll]] for (uint block_x = 0; block_x < 16; ++block_x) { + float min_v = FLT_MAX_OVER_2; + float max_v = -FLT_MAX_OVER_2; + [[unroll]] for (uint i = 0; i < Br * Bc / 4; i += BLOCK_SIZE) { + uint j0 = (i + tid) % (Bc / 4); + uint j1 = (i + tid) / (Bc / 4); + + j0 *= 4; + j0 += (i0 * 16 + block_x) * Bc; + j1 += i1 * Br; + + vec4 f = vec4(data_av4[(j0 + j1 * nbm1 + i2 * nbm2 + i3 * nbm3) / 4]); + [[unroll]] for (int c = 0; c < 4; ++c) { + min_v = min(min_v, f[c]); + max_v = max(max_v, f[c]); + } + } + min_v = subgroupMin(min_v); + max_v = subgroupMax(max_v); + if (gl_SubgroupInvocationID == 0) { + minsh[gl_SubgroupID] = min_v; + maxsh[gl_SubgroupID] = max_v; + } + barrier(); + if (tid == 0) { + [[unroll]] for (uint i = 0; i < NUM_SUBGROUPS; ++i) { + min_v = min(min_v, minsh[i]); + max_v = max(max_v, maxsh[i]); + } + if (max_v <= -FLT_MAX_OVER_2) { + result |= 1 << (2*block_x); + } + if (min_v == 0.0f && max_v == 0.0f) { + result |= 2 << (2*block_x); + } + } + barrier(); + } + } else { + [[unroll]] for (uint block_x = 0; block_x < 16; ++block_x) { + float min_v = FLT_MAX_OVER_2; + float max_v = -FLT_MAX_OVER_2; + [[unroll]] for (uint i = 0; i < Br * Bc; i += BLOCK_SIZE) { + if ((Br * Bc % BLOCK_SIZE) != 0 && i + tid >= Br * Bc) { + continue; + } + uint j0 = (i + tid) % Bc; + uint j1 = (i + tid) / Bc; + + j0 += (i0 * 16 + block_x) * Bc; + j1 += i1 * Br; + + if (j0 < nem0 && j1 < nem1) { + float f = float(data_a[j0 + j1 * nbm1 + i2 * nbm2 + i3 * nbm3]); + min_v = min(min_v, f); + max_v = max(max_v, f); + } + } + min_v = subgroupMin(min_v); + max_v = subgroupMax(max_v); + if (gl_SubgroupInvocationID == 0) { + minsh[gl_SubgroupID] = min_v; + maxsh[gl_SubgroupID] = max_v; + } + barrier(); + if (tid == 0) { + [[unroll]] for (uint i = 0; i < NUM_SUBGROUPS; ++i) { + min_v = min(min_v, minsh[i]); + max_v = max(max_v, maxsh[i]); + } + if (max_v <= -FLT_MAX_OVER_2) { + result |= 1 << (2*block_x); + } + if (min_v == 0.0f && max_v == 0.0f) { + result |= 2 << (2*block_x); + } + } + barrier(); + } + } + + if (tid == 0) { + data_d[i0 + i1 * nbd1 + i2 * nbd2 + i3 * nbd3] = result; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index ca486a288a..42ebc21e2a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -790,6 +790,8 @@ void process_shaders() { string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {}); + string_to_spv("fa_mask_opt", "flash_attn_mask_opt.comp", {}); + string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {}); string_to_spv("quantize_q8_1_subgroup", "quantize_q8_1.comp", {{"USE_SUBGROUPS", "1"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index cecdf47038..fbe23037cc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -169,20 +169,22 @@ static void init_tensor_kq_mask(ggml_tensor * tensor, float min = -1.0f, float m const int blck0 = 128; const int blck1 = 64; - // number of INF blocks - const int n_inf_blocks = 0.1*(ne0*ne1*ne2*ne3)/(blck0*blck1); + // number of INF/zero blocks + const int n_inf_zero_blocks = 0.2*(ne0*ne1*ne2*ne3)/(blck0*blck1); - for (int b = 0; b < n_inf_blocks; b++) { + for (int b = 0; b < n_inf_zero_blocks; b++) { const int p3 = (rd() % ne3); const int p2 = (rd() % ne2); const int p1 = (rd() % ne1); const int p0 = (rd() % ne0); + bool inf = rd() & 1; + for (int i1 = 0; i1 < blck1 && p1 + i1 < ne1; i1++) { const int idx = p3*ne2*ne1*ne0 + p2*ne1*ne0 + (p1 + i1)*ne0 + p0; for (int i0 = 0; i0 < blck0 && p0 + i0 < ne0; i0++) { - data_f32[idx + i0] = -INFINITY; + data_f32[idx + i0] = inf ? -INFINITY : 0.0f; } } } From 22cae832188a1f08d18bd0a707a4ba5cd03c7349 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Feb 2026 19:07:22 +0200 Subject: [PATCH 20/20] metal : adaptive CPU/GPU interleave based on number of nodes (#19369) --- ggml/src/ggml-metal/ggml-metal-context.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index a412d70aed..c7e8ebd3f3 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -415,7 +415,7 @@ bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, con enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) { // number of nodes encoded by the main thread (empirically determined) - const int n_main = 64; + const int n_main = MAX(64, 0.1*gf->n_nodes); // number of threads in addition to the main thread const int n_cb = ctx->n_cb;