Merge branch 'ggml-org:master' into expected-attn

This commit is contained in:
ddh0 2026-02-03 10:00:20 -06:00 committed by GitHub
commit 477a72ba63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
135 changed files with 4903 additions and 2204 deletions

View File

@ -4,7 +4,7 @@
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
# `_module.args.pkgs` (defined in this case by flake-parts).
perSystem =
{ system, ... }:
{ lib, system, ... }:
{
_module.args = {
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
@ -33,7 +33,7 @@
"CUDA EULA"
"cuDNN EULA"
]
) (p.meta.licenses or [ p.meta.license ]);
) (p.meta.licenses or (lib.toList p.meta.license));
};
# Ensure dependencies use ROCm consistently
pkgsRocm = import inputs.nixpkgs {

View File

@ -3,6 +3,7 @@
llamaVersion,
numpy,
tqdm,
requests,
sentencepiece,
pyyaml,
poetry-core,
@ -20,6 +21,7 @@ buildPythonPackage {
tqdm
sentencepiece
pyyaml
requests
];
src = lib.cleanSource ../../gguf-py;
pythonImportsCheck = [

View File

@ -7,13 +7,6 @@
let
pythonPackages = python3.pkgs;
buildPythonPackage = pythonPackages.buildPythonPackage;
numpy = pythonPackages.numpy;
tqdm = pythonPackages.tqdm;
sentencepiece = pythonPackages.sentencepiece;
pyyaml = pythonPackages.pyyaml;
poetry-core = pythonPackages.poetry-core;
pytestCheckHook = pythonPackages.pytestCheckHook;
in
# We're using `makeScope` instead of just writing out an attrset
@ -23,17 +16,18 @@ in
lib.makeScope newScope (self: {
inherit llamaVersion;
gguf-py = self.callPackage ./package-gguf-py.nix {
inherit
buildPythonPackage
inherit (pythonPackages)
numpy
tqdm
sentencepiece
poetry-core
pyyaml
pytestCheckHook
requests
buildPythonPackage
poetry-core
;
};
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
python-scripts = self.callPackage ./python-scripts.nix { inherit (pythonPackages) buildPythonPackage poetry-core; };
llama-cpp = self.callPackage ./package.nix { };
docker = self.callPackage ./docker.nix { };
docker-min = self.callPackage ./docker.nix { interactive = false; };

View File

@ -1532,7 +1532,7 @@ jobs:
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-arm64-cpu-high-perf:
runs-on: ubuntu-22.04-arm
@ -1558,7 +1558,7 @@ jobs:
- name: Test
id: ggml-ci
run: |
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ggml-ci-arm64-cpu-high-perf-sve:
runs-on: ubuntu-22.04-arm

1085
AUTHORS

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2024 The ggml authors
Copyright (c) 2023-2026 The ggml authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
- [LARS](https://github.com/abgulati/LARS) (AGPL)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [LMStudio](https://lmstudio.ai/) (proprietary)

View File

@ -635,6 +635,29 @@ function gg_check_build_requirements {
fi
}
function gg_run_test_backend_ops_cpu {
cd ${SRC}
cd build-ci-release
set -e
(time ./bin/test-backend-ops -b CPU ) 2>&1 | tee -a $OUT/${ci}-test-backend-ops-cpu.log
set +e
}
function gg_sum_test_backend_ops_cpu {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test-backend-ops for CPU backend\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-test-backend-ops-cpu.log)"
gg_printf '```\n'
gg_printf '\n'
}
## main
export LLAMA_LOG_PREFIX=1
@ -663,6 +686,10 @@ ret=0
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ ! -z ${GG_BUILD_HIGH_PERF} ]; then
test $ret -eq 0 && gg_run test_backend_ops_cpu
fi
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
test $ret -eq 0 && gg_run rerank_tiny

View File

@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
ngram-cache.h
ngram-map.cpp
ngram-map.h
ngram-mod.cpp
ngram-mod.h
peg-parser.cpp
peg-parser.h
preset.cpp

View File

@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
common_speculative_type_to_str(params.speculative.type).c_str()),
[](common_params & params, const std::string & value) {
@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
} else if (value == "ngram-map-k4v") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
} else if (value == "ngram-mod") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
} else {
throw std::invalid_argument("unknown speculative decoding type without draft model");
}

View File

@ -171,6 +171,7 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
@ -252,6 +253,8 @@ struct common_params_model {
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
struct common_ngram_mod;
struct common_params_speculative {
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
@ -269,6 +272,8 @@ struct common_params_speculative {
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

View File

@ -144,6 +144,13 @@ value binary_expression::execute_impl(context & ctx) {
return false;
};
auto test_is_in = [&]() -> bool {
func_args args(ctx);
args.push_back(left_val);
args.push_back(right_val);
return global_builtins().at("test_is_in")(args)->as_bool();
};
// Handle undefined and null values
if (is_val<value_undefined>(left_val) || is_val<value_undefined>(right_val)) {
if (is_val<value_undefined>(right_val) && (op.value == "in" || op.value == "not in")) {
@ -223,19 +230,11 @@ value binary_expression::execute_impl(context & ctx) {
return result;
}
} else if (is_val<value_array>(right_val)) {
auto & arr = right_val->as_array();
bool member = false;
for (const auto & item : arr) {
if (*left_val == *item) {
member = true;
break;
}
}
// case: 1 in [0, 1, 2]
bool member = test_is_in();
if (op.value == "in") {
JJ_DEBUG("Checking membership: %s in Array is %d", left_val->type().c_str(), member);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
JJ_DEBUG("Checking non-membership: %s not in Array is %d", left_val->type().c_str(), !member);
return mk_val<value_bool>(!member);
}
}
@ -252,22 +251,23 @@ value binary_expression::execute_impl(context & ctx) {
// String membership
if (is_val<value_string>(left_val) && is_val<value_string>(right_val)) {
auto left_str = left_val->as_string().str();
auto right_str = right_val->as_string().str();
// case: "a" in "abc"
bool member = test_is_in();
if (op.value == "in") {
return mk_val<value_bool>(right_str.find(left_str) != std::string::npos);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
return mk_val<value_bool>(right_str.find(left_str) == std::string::npos);
return mk_val<value_bool>(!member);
}
}
// Value key in object
if (is_val<value_object>(right_val)) {
bool has_key = right_val->has_key(left_val);
// case: key in {key: value}
bool member = test_is_in();
if (op.value == "in") {
return mk_val<value_bool>(has_key);
return mk_val<value_bool>(member);
} else if (op.value == "not in") {
return mk_val<value_bool>(!has_key);
return mk_val<value_bool>(!member);
}
}

View File

@ -393,6 +393,33 @@ const func_builtins & global_builtins() {
{"test_is_lt", test_compare_fn<value_compare_op::lt>},
{"test_is_lessthan", test_compare_fn<value_compare_op::lt>},
{"test_is_ne", test_compare_fn<value_compare_op::ne>},
{"test_is_in", [](const func_args & args) -> value {
args.ensure_count(2);
auto needle = args.get_pos(0);
auto haystack = args.get_pos(1);
if (is_val<value_undefined>(haystack)) {
return mk_val<value_bool>(false);
}
if (is_val<value_array>(haystack)) {
for (const auto & item : haystack->as_array()) {
if (*needle == *item) {
return mk_val<value_bool>(true);
}
}
return mk_val<value_bool>(false);
}
if (is_val<value_string>(haystack)) {
if (!is_val<value_string>(needle)) {
throw raised_exception("'in' test expects args[1] as string when args[0] is string, got args[1] as " + needle->type());
}
return mk_val<value_bool>(
haystack->as_string().str().find(needle->as_string().str()) != std::string::npos);
}
if (is_val<value_object>(haystack)) {
return mk_val<value_bool>(haystack->has_key(needle));
}
throw raised_exception("'in' test expects iterable as first argument, got " + haystack->type());
}},
{"test_is_test", [](const func_args & args) -> value {
args.ensure_vals<value_string>();
auto & builtins = global_builtins();

View File

@ -12,6 +12,7 @@
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
namespace jinja {

View File

@ -7,6 +7,33 @@
#include <cstdio>
#include <sstream>
// prime number used for LCG hash function (32 bit), it is near (sqrt(5) - 1)/2 * 2^32.
#define LCG_FACTOR 2654435761UL
// Compute the LCG hash of a n-gram of size len at offset start.
static uint32_t common_ngram_map_hash(const llama_tokens & tokens, size_t start, size_t len) {
uint32_t hash = 0;
for (size_t i = 0; i < len; ++i) {
hash = hash * LCG_FACTOR + tokens[start + i];
}
return hash;
}
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}
// n-gram simple
//
@ -100,7 +127,99 @@ llama_tokens common_ngram_simple_draft(
// maximum number of counted values of a ngram map value.
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
void common_ngram_map_begin(
common_ngram_map & map, const llama_tokens & tokens) {
size_t size_begin = tokens.size();
LOG_DBG("%s: begin, idx_last_draft=%zu, new begin=%zu, #keys=%zu\n", __func__,
map.idx_last_check, size_begin, map.keys.size());
size_t count_map_entries_upd = 0;
if (!map.key_map.empty() && size_begin < map.idx_last_check) {
if (map.show_key_map_stats) {
// Print statistics of hash map map_key.
size_t count_nonzero = 0;
uint32_t min_idx = UINT32_MAX;
uint32_t max_idx = 0;
for (size_t i = 0; i < map.key_map.size(); ++i) {
uint32_t key_idx = map.key_map[i];
if (key_idx != 0) {
++count_nonzero;
if (key_idx < min_idx) min_idx = key_idx;
if (key_idx > max_idx) max_idx = key_idx;
}
}
if (count_nonzero == 0) {
min_idx = 0;
}
LOG_INF("%s: key_map stats: entries=%zu, min_idx=%u, max_idx=%u, key_map_last_idx=%u\n",
__func__, count_nonzero, min_idx, max_idx, map.key_map_last_idx);
}
// Update the map from hash to key index (clear outdated entries).
for (size_t i = 0; i < map.key_map.size(); ++i) {
uint32_t key_idx = map.key_map[i];
if (key_idx >= map.size_last_begin) {
map.key_map[i] = 0;
count_map_entries_upd++;
}
}
map.key_map_last_idx = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
}
if (size_begin < map.idx_last_check && !map.keys.empty()) {
// The next token generation will start at index size_begin.
// The tokens between map.size_last_begin and size_begin are no longer valid.
//
// Refresh map: Remove all entries with index >= map.size_last_begin.
size_t count_keys = map.keys.size();
size_t count_keys_del = 0;
size_t count_values_del = 0;
for (int32_t i = map.keys.size() - 1; i >= 0; --i) {
common_ngram_map_key & key = map.keys[i];
if (key.key_idx >= map.size_last_begin) {
// Delete the key.
LOG_DBG("%s: delete key %d at index %zu (>= size_last_begin=%zu)\n", __func__, i, key.key_idx, map.size_last_begin);
map.keys.erase(map.keys.begin() + i);
count_keys_del++;
continue;
}
if (map.key_only) {
continue;
}
// Check the indices of the values.
for (int16_t j = COMMON_NGRAM_MAX_VALUES - 1; j >= 0; --j) {
common_ngram_map_value & value = key.values[j];
if (value.value_idx >= map.size_last_begin) {
// Delete the value.
count_values_del++;
// Move all values after this value to the left.
for (uint16_t k = j; k < COMMON_NGRAM_MAX_VALUES - 1; ++k) {
key.values[k] = key.values[k + 1];
}
// Clear the last value.
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_idx = 0;
key.values[COMMON_NGRAM_MAX_VALUES - 1].value_num = 0;
}
}
if (key.values[0].value_idx == 0) {
// No values left, delete the key.
LOG_DBG("%s: delete key %d at index %zu (no values left)\n", __func__, i, key.key_idx);
map.keys.erase(map.keys.begin() + i);
count_keys_del++;
}
}
LOG_INF("%s: refresh map: idx_last_draft=%zu, new begin=%zu, #keys_checked=%zu, #keys_del=%zu, #values_del=%zu, #hashes_upd=%zu\n", __func__,
map.idx_last_check, size_begin,
count_keys, count_keys_del, count_values_del, count_map_entries_upd);
}
map.idx_last_check = (map.size_last_begin > 0) ? map.size_last_begin - 1 : 0;
map.size_last_begin = size_begin;
}
void common_ngram_map_draft(common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
@ -116,6 +235,10 @@ void common_ngram_map_draft(common_ngram_map & map,
if (cur_len < static_cast<size_t>(2 * n + m)) {
return;
}
if (cur_len >= static_cast<size_t>(UINT32_MAX)) {
// key_map uses uint32_t instead of size_t.
GGML_ABORT("%s: cur_len exceeds UINT32_MAX: %zu", __func__, cur_len);
}
// Only check every check_rate tokens to save compute
// i.e., perform check if (cur_len - idx_last_check) >= check_rate
@ -134,24 +257,92 @@ void common_ngram_map_draft(common_ngram_map & map,
// search for the key in the map
size_t match_pos = 0;
for (size_t j = cur_len - n - m - 1; j > 0; --j) {
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
if (map.size_last_begin > cur_len) {
GGML_ABORT("%s: map.size_last_begin > cur_len: %zu > %zu", __func__, map.size_last_begin, cur_len);
}
if (!map.key_map.empty()) {
// Search for the key in the map key_map from hash of ngrams to index of ngram.
uint32_t idx_hash = (common_ngram_map_hash(key_tokens, 0, n) % map.key_map.size());
uint32_t idx_key = map.key_map[idx_hash];
if (idx_key != 0 && idx_key < cur_len - n - m - 1) {
// Check if the key matches the key at idx_key (because of possible collisions).
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[idx_key + k] != key_tokens[k]) {
match = false;
break;
}
}
LOG_DBG("%s: key hash %x -> idx_key %d: match %d\n", __func__, idx_hash, idx_key, match ? 1 : 0);
if (match) {
match_pos = idx_key;
}
}
if (match) {
match_pos = j;
break;
}
if (match_pos == 0 && map.size_last_begin > (size_t) (n + m + 1)) {
// Search for the key in [1, map.size_last_begin - n - m -1], descending.
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
// Check if the key matches the key.
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
}
if (match_pos == 0) {
// In case of a reasoning chat, the part after size_last_begin may be deleted/reordered later.
//
// Search in [size_last_begin, cur_len - n - m - 1], descending.
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
bool match = true;
for (size_t k = 0; k < n; ++k) {
if (inp[j + k] != key_tokens[k]) {
match = false;
break;
}
}
if (match) {
match_pos = j;
break;
}
}
}
if (match_pos > 0) {
LOG_INF("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
LOG_DBG("%s: cur_len = %zu, n = %d, m = %d, sz_tkns = %zu, sampled = %d, match_pos = %zu\n", __func__,
cur_len, n, m, key_tokens.size(), sampled, match_pos);
}
if (!map.key_map.empty()) {
// Add hashes of new ngrams in key_map.
//
// Use the same order as above.
if (map.size_last_begin > (size_t) (n + m + 1)) {
for (size_t j = map.size_last_begin - n - m - 1; j > map.key_map_last_idx; --j) {
// compute hash and store index of ngram at idx j in the map.
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
if (map.key_map[idx_hash] == 0) {
map.key_map[idx_hash] = j; // collisions may occur
}
}
}
for (size_t j = cur_len - n - m - 1; j > map.size_last_begin && j > map.key_map_last_idx; --j) {
// compute hash and store index of ngram at idx j in the map.
uint32_t idx_hash = (common_ngram_map_hash(inp, j, n) % map.key_map.size());
if (map.key_map[idx_hash] == 0) {
map.key_map[idx_hash] = j;
}
}
map.key_map_last_idx = std::max(static_cast<uint32_t>(cur_len - n - m - 1), map.key_map_last_idx);
}
if (match_pos == 0) {
return;
}
@ -202,8 +393,8 @@ void common_ngram_map_draft(common_ngram_map & map,
draft.push_back(inp[match_pos + n + i]);
}
LOG_INF("%s: key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
key_offset, curr_key.key_num, draft.size());
LOG_DBG("%s: key_idx = %zu, key_offset = %zu, key_num = %d, draft.size = %zu\n", __func__,
curr_key.key_idx, key_offset, curr_key.key_num, draft.size());
map.last_draft_created = false;
map.last_draft_key_idx = key_offset;
@ -305,7 +496,7 @@ void common_ngram_map_draft(common_ngram_map & map,
}
}
if (sum_occur > 0 && max_occur < 3 * sum_occur) {
if (sum_occur > 0 && max_occur < 2 * sum_occur) {
// The most frequent value is not much more frequent than the other values.
// We do not use the draft.
return;
@ -347,21 +538,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}
// Helper functions.
//
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}

View File

@ -9,8 +9,11 @@
// 2. ngram_map: lookup of n-grams followed by m-grams in token history using a map.
// The map is a vector of key n-grams, and for each key n-gram there is a list of value m-grams.
//
// ref: https://github.com/ggml-org/llama.cpp/pull/18471
//
#include "llama.h"
#include "common.h"
#include <vector>
@ -50,10 +53,13 @@ llama_tokens common_ngram_simple_draft(
// maximum number of m-gram values stored for each key n-gram.
#define COMMON_NGRAM_MAX_VALUES 4
// number of entries in the (optional, size 0 to disable) map from ngram-hash to ngram-index.
#define COMMON_NGRAM_HASH_MAP_SIZE 262144
// statistics of a m-gram after a known n-gram
struct common_ngram_map_value {
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
size_t value_idx = 0; // index of value m-gram in token-history (0 if unused)
uint16_t value_num = 0; // number of occurences of this value m-gram after the key n-gram (0 in an unused values-slot)
int16_t n_accepted = -1; // number of accepted tokens at last draft (-1 if unused)
};
@ -73,23 +79,43 @@ struct common_ngram_map {
bool key_only; // true if only key n-grams are used, no values.
// first draft: vector only, no map.
std::vector<common_ngram_map_key> keys; // key n-grams which occur several times in token-history
uint16_t check_rate; // check for speculative decoding without draft model for each check_rate token
uint16_t min_hits; // minimum number of key hits to consider a draft
bool show_key_map_stats = false; // true, if statitics of the key_map should be printed.
common_ngram_map(uint16_t sz_key, uint16_t sz_value, bool only_keys,
uint16_t check_rate, uint16_t min_hits)
: size_key(sz_key), size_value(sz_value), key_only(only_keys),
check_rate(check_rate), min_hits(min_hits) {}
check_rate(check_rate), min_hits(min_hits) {
key_map.resize(COMMON_NGRAM_HASH_MAP_SIZE); // 2^18 hash entries, 0 entries if key_map shouldn't be used
}
// In reasoning chats the previous reasoning block will be removed from context history.
// A rebuild of the ngram map is needed after that.
size_t size_last_begin = 0; // number of tokens at previous start of generation
bool last_draft_created = false; // true if a draft was created at last call.
size_t last_draft_key_idx = 0; // index of last key used for draft generation.
size_t last_draft_key_idx = 0; // index of last key used for draft generation (0 = no draft)
uint16_t last_draft_value_idx = 0; // index of last value used for draft generation.
size_t idx_last_check = 0; // index of last check in context history
// optional map "hash to ngram-index" for faster lookup of n-grams. map is empty if unused.
//
// uint32_t instead of size_t (size of current histories is << UINT32_MAX)
std::vector<uint32_t> key_map; // key_map[hash] = index of ngram in context window
uint32_t key_map_last_idx = 0; // index of the last ngram added to key_map
};
// Initialize the n-gram map with the given token history.
// map: the ngram map to initialize.
// tokens: the token history to base the map on.
void common_ngram_map_begin(
common_ngram_map & map,
const llama_tokens & tokens);
// Searches for the n-gram in the history and checks whether a draft sequence should be generated.
// map: the ngram map to search in.

60
common/ngram-mod.cpp Normal file
View File

@ -0,0 +1,60 @@
#include "ngram-mod.h"
//
// common_ngram_mod
//
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
entries.resize(size);
reset();
}
size_t common_ngram_mod::idx(const entry_t * tokens) const {
size_t res = 0;
for (size_t i = 0; i < n; ++i) {
res = res*6364136223846793005ULL + tokens[i];
}
res = res % entries.size();
return res;
}
void common_ngram_mod::add(const entry_t * tokens) {
const size_t i = idx(tokens);
if (entries[i] == EMPTY) {
used++;
}
entries[i] = tokens[n];
}
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
const size_t i = idx(tokens);
return entries[i];
}
void common_ngram_mod::reset() {
std::fill(entries.begin(), entries.end(), EMPTY);
used = 0;
}
size_t common_ngram_mod::get_n() const {
return n;
}
size_t common_ngram_mod::get_used() const {
return used;
}
size_t common_ngram_mod::size() const {
return entries.size();
}
size_t common_ngram_mod::size_bytes() const {
return entries.size() * sizeof(entries[0]);
}

38
common/ngram-mod.h Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#include <cstdint>
#include <vector>
#include <cstddef>
//
// common_ngram_mod
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
//
// basic n-gram hasher
struct common_ngram_mod {
using entry_t = int32_t;
static constexpr entry_t EMPTY = -1;
common_ngram_mod(uint16_t n, size_t size);
size_t idx(const entry_t * tokens) const;
void add(const entry_t * tokens);
entry_t get(const entry_t * tokens) const; // return -1 if not found
void reset();
size_t get_n() const;
size_t get_used() const;
size_t size() const;
size_t size_bytes() const;
private:
size_t n; // ngram size to hash
size_t used;
std::vector<entry_t> entries;
};

View File

@ -6,6 +6,7 @@
#include "log.h"
#include "ngram-cache.h"
#include "ngram-map.h"
#include "ngram-mod.h"
#include "sampling.h"
#include <algorithm>
@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
};
@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
{"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
{"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
{"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
{"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
{"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
};
@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
struct common_speculative_state {
const enum common_speculative_type type;
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
// TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
@ -119,7 +124,9 @@ struct common_speculative_state {
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
int64_t t_begin_us = 0; // total time spent in refresh of this implementation in microseconds.
int64_t t_draft_us = 0; // total time spent in generating drafts in this implementation in microseconds.
int64_t t_accept_us = 0; // total time spent in accumulation of this implementation in microseconds.
common_speculative_state(enum common_speculative_type type) : type(type) {}
@ -492,7 +499,7 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
: common_speculative_state(type), map(std::move(map)) {}
void begin(const llama_tokens & prompt) override {
GGML_UNUSED(prompt);
common_ngram_map_begin(map, prompt);
}
void draft(
@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
}
};
struct common_speculative_state_ngram_mod : public common_speculative_state {
common_ngram_mod & mod;
// the last position in the prompt that was added to the ngram container
size_t i_last = 0;
// length of the last drafted ngram (number of tokens returned by draft)
size_t n_draft_last = 0;
// consecutive accept rounds with low acceptance fraction (< 0.5)
int n_low = 0;
// enable trace logging if LLAMA_TRACE is set
const bool verbose;
common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
: common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
}
void begin(const llama_tokens & prompt) override {
i_last = 0;
n_draft_last = 0;
const size_t n = mod.get_n();
if (prompt.size() < n) {
return;
}
for (size_t i = 0; i < prompt.size() - n; ++i) {
mod.add(prompt.data() + i);
}
i_last = prompt.size() - n;
const double f = (double)mod.get_used() / (double)mod.size();
LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
constexpr double f_thold = 0.25;
if (f > f_thold) {
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
mod.reset();
}
}
void draft(
const common_params_speculative & params,
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
GGML_UNUSED(params);
n_draft_last = 0;
const size_t cur_len = prompt_tgt.size();
if (cur_len < mod.get_n()) {
return;
}
const size_t n = mod.get_n();
// add new ngrams in chunks
if (i_last + 32 < cur_len) {
for (size_t i = i_last; i < cur_len - n; ++i) {
mod.add(prompt_tgt.data() + i);
}
i_last = cur_len - n;
}
result.resize(n + params.n_max);
for (size_t i = 0; i < n - 1; ++i) {
result[i] = prompt_tgt[cur_len - n + 1 + i];
}
result[n - 1] = id_last;
for (int i = 0; i < params.n_max; ++i) {
const llama_token token = mod.get(result.data() + i);
if (token == common_ngram_mod::EMPTY) {
if (i < params.n_min) {
result.clear();
return;
}
result.resize(n + i);
break;
}
result[n + i] = token;
}
// only return the m tokens that were drafted
for (size_t i = 0; n + i < result.size(); ++i) {
result[i] = result[n + i];
}
result.resize(result.size() - n);
// store length of drafted ngram for later acceptance analysis
n_draft_last = result.size();
}
void accept(uint16_t n_accepted) override {
if (verbose) {
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
}
// compute acceptance fraction if we have a recorded draft length
if (n_draft_last > 0) {
const double f_acc = (double)n_accepted / (double)n_draft_last;
if (f_acc < 0.5) {
n_low++;
if (n_low >= 3) {
LOG_WRN("%s: low acceptance streak (%d) resetting ngram_mod\n", __func__, n_low);
mod.reset();
n_low = 0;
}
} else {
n_low = 0;
}
}
}
};
struct common_speculative_state_ngram_cache : public common_speculative_state {
uint16_t n_draft;
bool save_dynamic;
@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
default: return "unknown";
}
@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
const common_params_speculative & params,
llama_context * ctx_tgt) {
common_params_speculative & params,
llama_context * ctx_tgt) {
llama_context * ctx_dft = nullptr;
if (params.model_dft) {
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
// In a more complex implementation we could use the same implementation but with different parameters.
// This was initially used in PR-18471 but removed to simplify the code.
@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
// This implementation can guess tokens with high acceptance rate but is more expensive.
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
}
if (has_ngram_mod) {
// shared instance for all speculative decoding contexts
if (!params.ngram_mod) {
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
params.ngram_size_n, params.ngram_mod->size(),
(float)(params.ngram_mod->size_bytes())/1024/1024);
if (params.ngram_size_n < 16) {
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
}
}
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
}
if (has_ngram_cache) {
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
}
@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
GGML_ASSERT(config.params.ngram_mod);
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
auto state = create_state_ngram_cache(
params.lookup_cache_static, params.lookup_cache_dynamic, config);
@ -795,6 +951,7 @@ void common_speculative_begin(common_speculative * spec, const llama_tokens & pr
}
for (auto & impl : spec->impls) {
common_time_meas tm(impl->t_begin_us, !impl->gen_perf);
impl->begin(prompt);
}
}
@ -810,20 +967,14 @@ llama_tokens common_speculative_draft(
for (auto & impl : spec->impls) {
{
const int64_t t_start_us = impl->gen_perf ? ggml_time_us() : 0;
common_time_meas tm(impl->t_draft_us, !impl->gen_perf);
impl->draft(params, prompt_tgt, id_last, result);
const int64_t t_now_us = impl->gen_perf ? ggml_time_us() : 0;
impl->drafts_call_count++;
impl->gen_duration_us += t_now_us - t_start_us; // accumulate duration for this implementation
}
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(),
prompt_tgt.size(),
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->drafts_call_count, result.size());
spec->curr_impl = impl.get(); // set current implementation for stats
@ -846,12 +997,15 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
GGML_ASSERT(impl);
if (n_accepted > 0) {
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
}
{
common_time_meas tm(impl->t_accept_us, !impl->gen_perf);
if (n_accepted > 0) {
impl->drafts_accepted_count++;
impl->drafts_accepted_tokens += n_accepted;
}
impl->accept(n_accepted);
impl->accept(n_accepted);
}
}
void common_speculative_print_stats(const common_speculative * spec) {
@ -863,8 +1017,10 @@ void common_speculative_print_stats(const common_speculative * spec) {
std::string str_perf;
if (impl->gen_perf) {
std::ostringstream oss;
oss << std::fixed << std::setprecision(3) << impl->gen_duration_us / 1000.0;
str_perf = ", dur = " + oss.str() + " ms";
oss << std::fixed << std::setprecision(3) << impl->t_begin_us / 1000.0 << ", ";
oss << std::fixed << std::setprecision(3) << impl->t_draft_us / 1000.0 << ", ";
oss << std::fixed << std::setprecision(3) << impl->t_accept_us / 1000.0;
str_perf = ", dur(b,g,a) = " + oss.str() + " ms";
} else {
str_perf = "";
}

View File

@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
std::string common_speculative_type_to_str(enum common_speculative_type type);
common_speculative * common_speculative_init(
const common_params_speculative & params,
llama_context * ctx_tgt);
common_params_speculative & params,
llama_context * ctx_tgt);
void common_speculative_free(common_speculative * spec);

View File

@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel):
gate, up = data_torch.split(ffn_dim, dim=-2)
yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
return
has_experts = bool(self.hparams.get('num_local_experts'))

View File

@ -22,12 +22,11 @@
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over Intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
### Llama.cpp + SYCL
The llama.cpp SYCL backend is primarily designed for **Intel GPUs**.
SYCL cross-platform capabilities enable support for Nvidia GPUs as well, with limited support for AMD.
SYCL cross-platform capabilities enable support for other vendor GPUs as well.
## Recommended Release
@ -42,6 +41,9 @@ The following releases are verified and recommended:
## News
- 2026.02
- Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU is unavailable: download/installation channels are out of work. User can't build up the software for Nvidia & AMD GPU.
- 2025.11
- Support malloc memory on device more than 4GB.
@ -111,15 +113,15 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
| Intel Arc B-Series | Support | Arc B580 |
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
| Intel Arc B-Series | Support | Arc B580 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |
*Notes:*
- **Memory**
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
- The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
- **Execution Unit (EU)**
@ -127,20 +129,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
### Other Vendor GPU
**Verified devices**
| Nvidia GPU | Status | Verified Model |
|--------------------------|-----------|----------------|
| Ampere Series | Supported | A100, A4000 |
| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
| AMD GPU | Status | Verified Model |
|--------------------------|--------------|----------------|
| Radeon Pro | Experimental | W6800 |
| Radeon RX | Experimental | 6700 XT |
Note: AMD GPU support is highly experimental and is incompatible with F16.
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
NA
## Docker
@ -149,11 +138,11 @@ The docker build option is currently limited to *Intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
# Using FP32
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
```
*Notes*:
@ -212,14 +201,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
- **Nvidia GPU**
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
- **AMD GPU**
To target AMD GPUs with SYCL, the ROCm stack must be installed first.
2. **Install Intel® oneAPI Base toolkit**
SYCL backend depends on:
@ -248,23 +229,6 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|2025.1|
|2024.1|
- **Adding support to Nvidia GPUs**
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
```sh
git clone https://github.com/oneapi-src/oneDNN.git
cd oneDNN
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build build-nvidia --config Release
```
- **Adding support to AMD GPUs**
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
3. **Verify installation and environment**
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@ -285,25 +249,6 @@ When targeting an intel GPU, the user should expect one or more devices among th
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO [24.39.31294]
```
- **Nvidia GPU**
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
```
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
```
- **AMD GPU**
For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
```
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
```
### II. Build llama.cpp
#### Intel GPU
@ -332,47 +277,6 @@ It is possible to come across some precision issues when running tests that stem
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
as `-cl-fp32-correctly-rounded-divide-sqrt`
#### Nvidia GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Build LLAMA with Nvidia BLAS acceleration through SYCL
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
# Option 2: Use FP16
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
# build all binary
cmake --build build --config Release -j -v
```
It is possible to come across some precision issues when running tests that stem from using faster
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
#### AMD GPU
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
```sh
# Build LLAMA with rocBLAS acceleration through SYCL
## AMD
# Use FP32, FP16 is not supported
# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
# build all binary
cmake --build build --config Release -j -v
```
### III. Run the inference
#### Retrieve and prepare model
@ -423,16 +327,12 @@ Choose one of following methods to run.
- Use device 0:
```sh
./examples/sycl/run-llama2.sh 0
# OR
./examples/sycl/run-llama3.sh 0
./examples/sycl/test.sh -mg 0
```
- Use multiple devices:
```sh
./examples/sycl/run-llama2.sh
# OR
./examples/sycl/run-llama3.sh
./examples/sycl/test.sh
```
2. Command line
@ -455,13 +355,13 @@ Examples:
- Use device 0:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
```
- Use multiple devices:
```sh
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -no-cnv -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 99 -sm layer --mmap
```
*Notes:*
@ -577,13 +477,13 @@ Or, use CMake presets to build:
```sh
cmake --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake --build build-x64-windows-sycl-release -j --target llama-completion
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
cmake --build build-x64-windows-sycl-release -j --target llama-cli
cmake --build build-x64-windows-sycl-release -j --target llama-completion
cmake --preset x64-windows-sycl-debug
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
cmake --build build-x64-windows-sycl-debug -j --target llama-completion
```
#### 3. Visual Studio
@ -608,7 +508,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
- For a minimal experimental setup, you can build only the inference executable using:
```Powershell
cmake --build build --config Release -j --target llama-cli
cmake --build build --config Release -j --target llama-completion
```
##### - Generating a Visual Studio Solution
@ -714,13 +614,7 @@ Choose one of following methods to run.
1. Script
```
examples\sycl\win-run-llama-2.bat
```
or
```
examples\sycl\win-run-llama-3.bat
examples\sycl\win-test.bat
```
2. Command line
@ -744,13 +638,13 @@ Examples:
- Use device 0:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0
build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm none -mg 0 --mmap
```
- Use multiple devices:
```
build\bin\llama-cli.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer
build\bin\llama-completion.exe -no-cnv -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 99 -sm layer --mmap
```
@ -776,15 +670,15 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| Name | Value | Function |
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_TARGET | INTEL *(default)* | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional | Set the SYCL device architecture. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
1. FP16 is recommended for better prompt processing performance on quantized models. Performance is equivalent in text generation but set `GGML_SYCL_F16=OFF` if you are experiencing issues with FP16 builds.
1. FP32 or FP16 have different performance impact to LLM. Recommended to test them for better prompt processing performance on your models. You need to rebuild the code after change `GGML_SYCL_F16=OFF/ON`.
#### Runtime
@ -792,7 +686,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|

View File

@ -1,10 +1,5 @@
{
"version": 5,
"cmakeMinimumRequired": {
"major": 3,
"minor": 28,
"patch": 0
},
"configurePresets": [
{
"name": "arm64-android-snapdragon",

View File

@ -128,7 +128,7 @@ However, additional settings are required for generating and signing HTP Ops lib
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
> cmake --preset arm64-windows-snapdragon -B build-wos
> cmake --preset arm64-windows-snapdragon-release -B build-wos
...
> cmake --install build-wos --prefix pkg-snapdragon
```

View File

@ -252,9 +252,7 @@ CUDA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.ggu
The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/environment-variables.html#cuda-scale-launch-queues) controls the size of CUDA's command buffer, which determines how many GPU operations can be queued before the CPU must wait for the GPU to catch up. A larger buffer reduces CPU-side stalls and allows more work to be queued on a GPU.
**Default behavior:** llama.cpp automatically sets `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
See PR [#19042](https://github.com/ggml-org/llama.cpp/pull/19042) for performance benchmarks and technical details.
Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
### Unified Memory

View File

@ -9,7 +9,7 @@ Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch m
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash

View File

@ -8,11 +8,11 @@ Download [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

View File

@ -8,7 +8,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash

View File

@ -8,7 +8,7 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch m
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash

View File

@ -8,11 +8,11 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
### Build llama.cpp
Readme modification time: 20250731
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

View File

@ -8,11 +8,11 @@ Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch m
### Build llama.cpp
Readme modification time: 20250826
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

View File

@ -97,7 +97,7 @@ Legend:
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
@ -113,8 +113,8 @@ Legend:
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |

View File

@ -29,8 +29,8 @@
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@ -71,8 +71,8 @@
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@ -113,8 +113,8 @@
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@ -155,8 +155,8 @@
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@ -9677,168 +9677,168 @@
"SYCL0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9847,16 +9847,16 @@
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9865,16 +9865,16 @@
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9883,16 +9883,16 @@
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9901,16 +9901,16 @@
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9919,16 +9919,16 @@
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","0","no","SYCL"
@ -9937,51 +9937,51 @@
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","0","no","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","SYCL"
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","SYCL"
"SYCL0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","SYCL"
"SYCL0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest","support","1","yes","SYCL"
@ -10052,10 +10052,10 @@
"SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL"
"SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL"
"SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL"
"SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL"
"SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL"
"SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL"

Can't render this file because it is too large.

View File

@ -6,7 +6,7 @@ llama.cpp supports speculative decoding, a technique that can significantly acce
## Implementations
The `llama-server` application supports several implementations of speculative decoding:
The `llama-server` application supports several implementations of speculative decoding. An implementation with draft model can be mixed with an implementation without draft model.
### Draft Model (`draft`)
@ -32,12 +32,21 @@ An example to use this approach can be the rewriting of source code by a LLM.
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
```
llama-server [...] --spec-type ngram-simple --draft-max 64
```
#### n-gram Map Key (`ngram-map-k`)
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`) before generating drafts.
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
The number of accepted tokens is stored for each used n-gram.
**Example:**
```
llama-server [...] --spec-type ngram-map-k --draft-max 64
```
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
This experimental implementation looks for the current n-gram of size n (called the _key_) in the token history. For each key, up to four _values_ (n-grams of size m, called _mgrams_) are tracked. An internal statistic counts the occurrences of each mgram after the key n-gram. If one mgram is significantly more frequent than the others, it is used as the draft.
@ -45,17 +54,65 @@ This experimental implementation looks for the current n-gram of size n (called
The number of accepted tokens is stored for each used n-gram.
**Example:** Server options to be used if there are a lot of longer repetitions.
```bash
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2
```
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
```
### n-gram Mod (`ngram-mod`)
Add basic ngram hasher for speculative decoding:
- For each ngram, compute a hash using LCG
- For each computed hash, store the next token
- During speculation, iteratively compute the rolling hash of the last n tokens and pick the next token from the storage
Some characteristics:
- Lightweight (~16 MB)
- Constant memory and complexity
- Can generate variable draft lengths (i.e. m is not fixed)
Currently, a single hash pool is shared across all server slots, so different requests can benefit from each other.
**Sample usage:**
```
# notes:
# - small `n` are not recommended
# - MoEs require long drafts
# - dense models: can reduce `--draft-min` and `--draft-max`
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
```
Applications:
- Iterating over a block of text/code (e.g. in llama.vim)
- Reasoning models (when they have to repeat their thinking in the final answer)
- Summarization
Example Video:
- See #19164
### Differences between ngram-simple, ngram-map and ngram-mod
- ngram-simple looks for a previous matching n-gram and inserts the following m-gram.
- ngram-map-k looks for a previous matching n-gram and inserts the following m-gram but uses an internal hash-map of n-grams in the current context window.
- ngram-mod uses a hash pool which is shared across all server slots. The hash pool is a map from n-gram hash to the next token (not the next m-gram as in ngram-map).
## Command-Line Options
If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.
```
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
(default: 0)
(env: LLAMA_ARG_DRAFT_MIN)
[...]
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
type of speculative decoding to use when no draft model is provided
(default: none)
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
@ -78,6 +135,7 @@ Specifies a type of speculative decoding without draft model.
| `ngram-simple` | Use simple n-gram pattern matching |
| `ngram-map-k` | Use n-gram pattern matching with n-gram-keys |
| `ngram-map-k4v` | Use n-gram pattern matching with n-gram-keys and up to four m-gram values (experimental) |
| `ngram-mod` | Use basic ngram hasher for speculative decoding with shared pool |
**Example:** Server-instance used to refactor source code.
```bash
@ -112,9 +170,15 @@ statistics ngram_simple: #calls = 15, #gen drafts = 5, #acc drafts = 5, #gen tok
statistics draft: #calls = 10, #gen drafts = 10, #acc drafts = 10, #gen tokens = 110, #acc tokens = 98
```
```
draft acceptance rate = 0.70312 ( 90 accepted / 128 generated)
statistics ngram_mod: #calls = 810, #gen drafts = 15, #acc drafts = 15, #gen tokens = 960, #acc tokens = 730, dur(b,g,a) = 0.149, 0.347, 0.005 ms
```
- `#calls`: number of calls of this implementations
- `#gen drafts`: number of drafts generated by this implementation
- `#acc drafts`: number of drafts accepted (partially) by the main model
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
- `#acc tokens`: number of tokens accepted by the main model
- `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).

View File

@ -1,7 +1,7 @@
# Migration notice for binary filenames
> [!IMPORTANT]
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggml-org/llama.cpp/pull/7809)
This migration was important, but it is a breaking change that may not always be immediately obvious to users.

View File

@ -28,7 +28,7 @@ int main(int argc, char** argv) {
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, " See https://github.com/ggml-org/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;

View File

@ -402,7 +402,7 @@ class SchemaConverter:
Transforms a regular expression pattern into a GBNF rule.
Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
Output: https://github.com/ggml-org/llama.cpp/blob/master/grammars/README.md
Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.

View File

@ -50,6 +50,12 @@ int main(int argc, char ** argv) {
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
// lookahead requires W + G + 1 sequences for parallel Jacobi decoding
params.n_parallel = W + G + 1;
// unified KV cache is required for coupled sequences in batch splitting
params.kv_unified = true;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -115,7 +121,7 @@ int main(int argc, char ** argv) {
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1);
// target model sampling context
struct common_sampler * smpl = common_sampler_init(model, params.sampling);

View File

@ -106,7 +106,7 @@ int main(int argc, char ** argv){
std::vector<llama_token> draft;
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1);
const auto t_dec_start = ggml_time_us();

View File

@ -33,11 +33,14 @@ DEVICE ?= auto
causal-convert-model-bf16: OUTTYPE=bf16
causal-convert-model-bf16: causal-convert-model
causal-convert-model-debug: DEBUG=--debug
causal-convert-model-debug: causal-convert-model
causal-convert-model:
$(call validate_model_path,causal-convert-model)
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
./scripts/causal/convert-model.sh
./scripts/causal/convert-model.sh $(DEBUG)
causal-convert-mm-model-bf16: OUTTYPE=bf16
causal-convert-mm-model-bf16: MM_OUTTYPE=f16

View File

@ -4,12 +4,17 @@ set -e
# Parse command line arguments
MMPROJ=""
DEBUG=""
while [[ $# -gt 0 ]]; do
case $1 in
--mmproj)
MMPROJ="--mmproj"
shift
;;
--debug)
DEBUG="1"
shift
;;
*)
shift
;;
@ -28,7 +33,12 @@ echo "Data type: ${TYPE}"
echo "Converted model path:: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"
CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
if [[ -n "$DEBUG" ]]; then
CMD_ARGS=("python" "-m" "pdb")
else
CMD_ARGS=("python")
fi
CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
CMD_ARGS+=("--outtype" "${TYPE}")

View File

@ -18,13 +18,14 @@ CONTEXT=4096
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
LOAD_MODE='--mmap'
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
fi

View File

@ -1,31 +0,0 @@
#!/usr/bin/env bash
# MIT license
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: MIT
# If you want more control, DPC++ Allows selecting a specific device through the
# following environment variable
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
CONTEXT=4096
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "Using $GGML_SYCL_DEVICE as the main GPU"
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi

130
examples/sycl/test.sh Executable file
View File

@ -0,0 +1,130 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
Help() {
cat << EOF
Usage: $(basename "$0") [OPTIONS]
This script processes files with specified options.
Options:
-h, --help Display this help message and exit.
-c, --context <value> Set context length. Bigger need more memory.
-p, --promote <value> Prompt to start generation with.
-m, --model <value> Full model file path.
-mg,--main-gpu <value> Set main GPU ID (0 - n) for single GPU mode.
-sm,--split-mode <value> How to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
-ngl,--n-gpu-layers <value> Max. number of layers to store in VRAM (default: -1)
-lv,--log-verbosity <value> Set the verbosity threshold. Messages with a higher verbosity will be
ignored. Values:
- 0: generic output
- 1: error
- 2: warning
- 3: info
- 4: debug
EOF
}
BIN_FILE=./build/bin/llama-completion
SEED=0
GPUS_SETTING=""
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
SPLIT_MODE=layer
LOG_VERBOSE=3
while [[ $# -gt 0 ]]; do
case "$1" in
-c|--context)
CONTEXT=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-p|--promote)
# Option that is a simple flag (boolean)
INPUT_PROMPT="$2"
# Shift once to consume the option flag
shift
shift
;;
-m|--model)
MODEL_FILE="$2"
# Shift twice to consume both the option flag and its value
shift
shift
;;
-mg|--main-gpu)
GGML_SYCL_DEVICE=$2
SPLIT_MODE=none
# Shift twice to consume both the option flag and its value
shift
shift
;;
-sm|--split-mode)
SPLIT_MODE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-ngl|--n-gpu-layers)
NGL=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-lv|--log-verbosity)
LOG_VERBOSE=$2
# Shift twice to consume both the option flag and its value
shift
shift
;;
-h|--help)
Help
exit 0
;;
*)
# Handle unknown options or stop processing options
echo "Invalid option: $1"
# Optional: exit script or shift to treat remaining as positional args
exit 1
;;
esac
done
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"
if [ $GGML_SYCL_DEVICE -ne -1 ]; then
echo "Use $GGML_SYCL_DEVICE as main GPU"
#use signle GPU only
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
echo "Use all Intel GPUs, including iGPU & dGPU"
fi
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap

View File

@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0
set LOAD_MODE="--mmap"
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%

View File

@ -7,5 +7,5 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-completion.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -no-cnv -p %INPUT2% -n 400 -s 0 -e -ngl 99
set LOAD_MODE="--mmap"
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
project("ggml" C CXX ASM)
### GGML Version

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -19,6 +19,9 @@ extern "C" {
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
// use only reference implementations
bool use_ref;
};
// numa strategies
@ -132,6 +135,8 @@ extern "C" {
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);

View File

@ -6,7 +6,7 @@
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
// https://github.com/ggerganov/whisper.cpp/issues/40
// https://github.com/ggml-org/whisper.cpp/issues/40
//
// ## Overview
//

View File

@ -258,6 +258,7 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
if (backend->iface.set_tensor_async == NULL) {
ggml_backend_synchronize(backend);
ggml_backend_tensor_set(tensor, data, offset, size);
} else {
backend->iface.set_tensor_async(backend, tensor, data, offset, size);
@ -271,6 +272,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
if (backend->iface.get_tensor_async == NULL) {
ggml_backend_synchronize(backend);
ggml_backend_tensor_get(tensor, data, offset, size);
} else {
backend->iface.get_tensor_async(backend, tensor, data, offset, size);

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/**
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2023-2026 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to

View File

@ -24,6 +24,9 @@ struct ggml_compute_params {
void * wdata;
struct ggml_threadpool * threadpool;
// use reference implementation
bool use_ref;
};

View File

@ -5,7 +5,6 @@
#include "ggml-backend.h"
#include "traits.h"
#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "quants.h"
#include "ggml-threading.h"
@ -2867,12 +2866,20 @@ struct ggml_cplan ggml_graph_plan(
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
const int64_t neq2 = node->src[0]->ne[2]; // number of query heads
const int64_t DK = node->src[1]->ne[0];
const int64_t DV = node->src[2]->ne[0];
// Tiled flash attention scratch (tile sizes defined in common.h)
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
cur = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
// Decode path: n_kv_chunks = n_tasks (one chunk per thread)
// Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
size_t n_chunks = n_tasks;
size_t decode = sizeof(float)*(neq2*n_chunks*(2+DV) + n_tasks*(DK + 2*DV));
cur += MAX(prefill, decode);
} break;
case GGML_OP_FLASH_ATTN_BACK:
{
@ -2929,11 +2936,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ tp,
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool =*/ tp,
/*.use_ref =*/ cplan->use_ref,
};
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);

View File

@ -105,6 +105,8 @@ struct ggml_backend_cpu_context {
ggml_abort_callback abort_callback;
void * abort_callback_data;
bool use_ref; // use reference implementation
};
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
@ -143,6 +145,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cpu_plan->cplan.use_ref = cpu_ctx->use_ref;
return cpu_plan;
}
@ -182,6 +185,7 @@ static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, s
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
cplan.use_ref = cpu_ctx->use_ref;
return ggml_graph_compute(cgraph, &cplan);
}
@ -223,6 +227,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ctx->use_ref = false;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
@ -270,6 +275,13 @@ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_
ctx->abort_callback_data = abort_callback_data;
}
void ggml_backend_cpu_set_use_ref(ggml_backend_t backend_cpu, bool use_ref) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->use_ref = use_ref;
}
// CPU backend - device
struct ggml_backend_cpu_device_context {
@ -646,6 +658,9 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
if (strcmp(name, "ggml_backend_cpu_set_use_ref") == 0) {
return (void *)ggml_backend_cpu_set_use_ref;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {

View File

@ -8042,12 +8042,14 @@ void ggml_compute_forward_top_k(
}
}
// ggml_compute_forward_flash_attn_ext
static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
const ggml_compute_params * params,
ggml_tensor * dst,
int ir0, int ir1) {
int ir0, int ir1,
int64_t ic_start, int64_t ic_end,
float * partials, int64_t partial_stride) {
const bool write_partials = (partials != nullptr);
const ggml_tensor * q = dst->src[0];
const ggml_tensor * k = dst->src[1];
const ggml_tensor * v = dst->src[2];
@ -8124,7 +8126,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
int ith = params->ith;
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
const int iq3 = ir/(neq2*neq1);
@ -8165,7 +8166,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
for (int64_t ic = 0; ic < nek1; ++ic) {
for (int64_t ic = ic_start; ic < ic_end; ++ic) {
const float mv = mp ? slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f;
if (mv == -INFINITY) {
continue;
@ -8238,8 +8239,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
}
}
// sinks
if (sinks) {
// sinks - apply only on the first kv-chunk
if (sinks && ic_start == 0) {
const float s = ((float *)((char *) sinks->data))[h];
float ms = 1.0f;
@ -8247,6 +8248,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
if (s > M) {
ms = expf(M - s);
M = s;
ggml_vec_scale_f32(DV, VKQ32, ms);
} else {
vs = expf(s - M);
@ -8255,20 +8257,26 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
S = S*ms + vs;
}
// V /= S
const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
ggml_vec_scale_f32(DV, VKQ32, S_inv);
if (write_partials) {
// Write M, S, VKQ to partials for later reduction
// partials layout: [M, S, VKQ[DV]] per query head
float * partial = partials + ir * partial_stride;
partial[0] = M;
partial[1] = S;
memcpy(partial + 2, VKQ32, DV * sizeof(float));
} else {
// V /= S
const float S_inv = S == 0.0f ? 0.0f : 1.0f/S;
ggml_vec_scale_f32(DV, VKQ32, S_inv);
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// dst indices
const int i1 = iq1;
const int i2 = iq2;
const int i3 = iq3;
// original
//memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
// permute(0, 2, 1, 3)
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
// permute(0, 2, 1, 3)
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, VKQ32, nb1);
}
}
}
@ -8546,6 +8554,78 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
}
// Reduction function: combines partial results across KV chunks
// Partials layout in wdata: [n_q_heads][n_chunks][2 + DV]
static void ggml_flash_attn_ext_reduce_partials(
const ggml_compute_params * params,
ggml_tensor * dst,
const int64_t n_chunks,
const int64_t chunk_size) {
const ggml_tensor * q = dst->src[0];
const ggml_tensor * k = dst->src[1];
const ggml_tensor * v = dst->src[2];
const int64_t DK = k->ne[0];
const int64_t DV = v->ne[0];
const int64_t nek1 = k->ne[1];
const int64_t n_q_heads = q->ne[2];
const int ith = params->ith;
const int nth = params->nth;
const int64_t wdata_per_thread = DK + 2*DV + CACHE_LINE_SIZE_F32;
float * thread_wdata = (float *) params->wdata + ith * wdata_per_thread;
const int64_t partials_offset = nth * (DK + 2*DV + CACHE_LINE_SIZE_F32);
const int64_t partial_size = 2 + DV;
const float * partials_base = (const float *) params->wdata + partials_offset;
// Output layout
const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
const size_t nb1 = dst->nb[1];
// Each thread reduces a subset of query heads
for (int64_t q_head = ith; q_head < n_q_heads; q_head += nth) {
float M_final = -INFINITY;
float S_final = 0.0f;
float * VKQ_final = thread_wdata;
memset(VKQ_final, 0, DV * sizeof(float));
// Combine partials from all chunks
for (int64_t chunk_idx = 0; chunk_idx < n_chunks; ++chunk_idx) {
const int64_t ic_start = chunk_idx * chunk_size;
if (ic_start >= nek1) continue;
const float * partial = partials_base + (q_head * n_chunks + chunk_idx) * partial_size;
const float M_chunk = partial[0];
const float S_chunk = partial[1];
const float * VKQ_chunk = partial + 2;
if (S_chunk == 0.0f) continue;
const float M_new = fmaxf(M_final, M_chunk);
const float scale_old = expf(M_final - M_new);
const float scale_new = expf(M_chunk - M_new);
for (int64_t d = 0; d < DV; ++d) {
VKQ_final[d] = VKQ_final[d] * scale_old + VKQ_chunk[d] * scale_new;
}
S_final = S_final * scale_old + S_chunk * scale_new;
M_final = M_new;
}
// Normalize and write to output
if (S_final != 0.0f) {
const float S_inv = 1.0f / S_final;
ggml_vec_scale_f32(DV, VKQ_final, S_inv);
}
// iq1=0, iq3=0 for decode
memcpy((char *) dst->data + (0*ne2*ne1 + q_head + 0*ne1)*nb1, VKQ_final, nb1);
}
}
static void ggml_compute_forward_flash_attn_ext_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {
@ -8567,6 +8647,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int64_t DV = nev0;
const int64_t N = neq1;
GGML_ASSERT(ne0 == DV);
GGML_ASSERT(ne2 == N);
@ -8587,60 +8668,92 @@ static void ggml_compute_forward_flash_attn_ext_f16(
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
// parallelize by q rows using ggml_vec_dot_f32
// total rows in q
const int64_t nr = neq1*neq2*neq3;
// rows per thread
const int ith = params->ith;
const int nth = params->nth;
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
// When use_ref is set, force the vec-only reference implementation (no tiling, no KV-chunking)
const bool use_ref = params->use_ref;
// 4x chunks per thread
int nth_scaled = nth * 4;
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
if (nth == 1 || nchunk < nth || disable_chunking) {
nchunk = nth;
}
if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
ggml_threadpool_chunk_set(params->threadpool, nth);
}
ggml_barrier(params->threadpool);
// The number of elements in each chunk
const int64_t dr = (nr + nchunk - 1) / nchunk;
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
const bool kv_is_f32_or_f16 = (k->type == GGML_TYPE_F32 || k->type == GGML_TYPE_F16);
const bool use_tiled = (q->type == GGML_TYPE_F32 &&
kv_is_f32_or_f16 &&
k->type == v->type &&
nek1 % KV_TILE_SZ == 0 &&
neq1 >= Q_TILE_SZ); // Only use tiled for batch >= tile size
const bool use_split_kv_path = !use_ref && (neq1 == 1 && neq3 == 1) && kv_is_f32_or_f16 && (k->type == v->type) && q->type == GGML_TYPE_F32 && nek1 >= 512;
// The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith;
if (use_split_kv_path) {
const int64_t chunk_size = (nek1 + nth - 1) / nth;
while (current_chunk < nchunk) {
const int64_t ir0 = dr * current_chunk;
const int64_t ir1 = MIN(ir0 + dr, nr);
// Partials buffer layout: [q_head][kv_chunk][M, S, VKQ]
const int64_t partial_size = 2 + DV;
float * partials_base = (float *) params->wdata + nth * (DK + 2*DV + CACHE_LINE_SIZE_F32);
if (use_tiled) {
ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
const int64_t ic_start = ith * chunk_size;
const int64_t ic_end = std::min(ic_start + chunk_size, nek1);
const int64_t partial_stride = nth * partial_size;
float * chunk_partials = partials_base + ith * partial_size;
if (ic_start < nek1) {
for (int64_t q_head = 0; q_head < neq2; q_head++) {
ggml_compute_forward_flash_attn_ext_f16_one_chunk(
params, dst, q_head, q_head + 1, ic_start, ic_end,
chunk_partials, partial_stride);
}
} else {
ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);
for (int64_t q_head = 0; q_head < neq2; q_head++) {
float * q_partials = chunk_partials + q_head * partial_stride;
q_partials[0] = -INFINITY; // M
q_partials[1] = 0.0f; // S
}
}
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
ggml_barrier(params->threadpool);
ggml_flash_attn_ext_reduce_partials(params, dst, nth, chunk_size);
} else {
// total rows in q
const int64_t nr = neq1*neq2*neq3;
// disable for NUMA
const bool disable_chunking = ggml_is_numa();
// 4x chunks per thread
int nth_scaled = nth * 4;
int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
int64_t nchunk = (nr + chunk_size - 1) / chunk_size;
if (nth == 1 || nchunk < nth || disable_chunking) {
nchunk = nth;
}
if (ith == 0) {
ggml_threadpool_chunk_set(params->threadpool, nth);
}
ggml_barrier(params->threadpool);
const int64_t dr = (nr + nchunk - 1) / nchunk;
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
const bool use_tiled = !use_ref &&
(q->type == GGML_TYPE_F32 &&
kv_is_f32_or_f16 &&
k->type == v->type &&
nek1 % KV_TILE_SZ == 0 &&
neq1 >= Q_TILE_SZ);
int current_chunk = ith;
while (current_chunk < nchunk) {
const int64_t ir0 = dr * current_chunk;
const int64_t ir1 = MIN(ir0 + dr, nr);
if (use_tiled) {
ggml_compute_forward_flash_attn_ext_tiled(params, dst, ir0, ir1);
} else {
ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1, 0, nek1, nullptr, 0);
}
current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
}
}
}

View File

@ -1124,6 +1124,7 @@ struct ggml_tensor_extra_gpu {
struct ggml_cuda_graph_node_properties {
void * node_data;
ggml_op node_op;
enum ggml_type node_type;
int32_t flags;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];

View File

@ -2279,13 +2279,19 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
if (ne2 == 1) {
static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
if (ggml_is_quantized(src0->type)) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
if (ne2 <= 4) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
return;
}
} else {
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
if (GGML_CUDA_CC_IS_AMD(cc)) {
ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst);
return;
}
}
return;
}
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
@ -2920,6 +2926,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties
memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
props->node_data = node->data;
props->node_op = node->op;
props->node_type = node->type;
props->flags = node->flags;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
props->ne[i] = node->ne[i];
@ -2944,6 +2951,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
return false;
}
if (node->type != props->node_type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != props->ne[i]) {
return false;
@ -3905,14 +3916,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
// Launch graph
CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
#else
GGML_UNUSED(graph_key);
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
}
}
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
#ifdef USE_CUDA_GRAPH
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->graph == nullptr) {
@ -3925,12 +3936,8 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co
}
return graph->is_enabled();
#else
GGML_UNUSED(cuda_ctx);
GGML_UNUSED(graph_key);
return false;
#endif // USE_CUDA_GRAPH
}
#endif // USE_CUDA_GRAPH
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
@ -5048,16 +5055,6 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
// Set CUDA_SCALE_LAUNCH_QUEUES before any CUDA API call to improve multi-GPU pipeline parallelism performance
// PR: https://github.com/ggml-org/llama.cpp/pull/19042
if (getenv("CUDA_SCALE_LAUNCH_QUEUES") == nullptr) {
#ifdef _WIN32
_putenv_s("CUDA_SCALE_LAUNCH_QUEUES", "4x");
#else
setenv("CUDA_SCALE_LAUNCH_QUEUES", "4x", 0); // don't overwrite if already set
#endif // _WIN32
}
ggml_backend_cuda_reg_context * ctx = new ggml_backend_cuda_reg_context;
const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;

View File

@ -3697,13 +3697,20 @@ static __global__ void mul_mat_q(
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
}
template <ggml_type type, int mmq_x, bool need_check>
static __global__ void mul_mat_q_stream_k_fixup(
const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst,
const int ncols_max) {
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
const int32_t * expert_bounds,
float * __restrict__ dst,
const float * __restrict__ tmp_last_tile,
const int ncols_x,
const int nrows_x,
const int ncols_dst,
const size_t stride_col_dst,
const int nchannels_y,
const size_t stride_channel_dst,
const int nsamples_y,
const size_t stride_sample_dst,
const int ncols_max) {
constexpr int mmq_y = get_mmq_y_device();
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int ITER_K = get_iter_k(type);

View File

@ -4,26 +4,48 @@
#include "mmvf.cuh"
#include "convert.cuh"
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false>
template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false, bool is_multi_token_id = false>
static __global__ void mul_mat_vec_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
const int ncols2, const uint3 nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const int ids_stride) {
const int row = blockIdx.x;
// for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens)
const int channel_dst = blockIdx.y;
const int channel_x = ids ? ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio);
const int channel_y = ids ? channel_dst % nchannels_y : channel_dst;
const int sample_dst = blockIdx.z;
const int tid = threadIdx.x;
int token_idx;
int channel_x;
int channel_y;
int sample_dst;
if constexpr (is_multi_token_id) {
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
token_idx = blockIdx.z;
channel_x = ids[channel_dst + token_idx * ids_stride];
channel_y = fastmodulo(channel_dst, nchannels_y);
sample_dst = 0;
} else {
token_idx = ids ? blockIdx.z : 0;
channel_x = ids ? ids[blockIdx.y + token_idx * ids_stride] : fastdiv((uint32_t) channel_dst, channel_ratio);
channel_y = ids ? fastmodulo(blockIdx.y, nchannels_y) : channel_dst;
sample_dst = ids ? 0 : blockIdx.z;
}
const int sample_x = fastdiv((uint32_t) sample_dst, sample_ratio);
const int sample_y = sample_dst;
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
if constexpr (is_multi_token_id) {
y += token_idx*stride_col_y2*2;
dst += token_idx*stride_col_dst;
}
bool use_gate = false;
bool use_bias = false;
@ -56,8 +78,10 @@ static __global__ void mul_mat_vec_f(
if (use_gate) {
gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
}
const int channel_bias = ids ? channel_x : channel_dst;
if constexpr (has_fusion) {
const int channel_bias = ids ? channel_x : channel_dst;
if (use_bias) {
x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
}
@ -349,36 +373,36 @@ static __global__ void mul_mat_vec_f(
}
}
template<typename T, typename type_acc, int ncols_dst, int block_size>
template<typename T, typename type_acc, int ncols_dst, int block_size, bool is_multi_token_id = false>
static void mul_mat_vec_f_switch_fusion(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const int64_t ncols, const int64_t nrows,
const int64_t ncols, const uint3 nchannels_y,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const cudaStream_t stream) {
const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) {
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (ncols_dst == 1) {
if (has_fusion) {
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
return;
}
}
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
mul_mat_vec_f<T, type_acc, ncols_dst, block_size><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
mul_mat_vec_f<T, type_acc, ncols_dst, block_size, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
}
template <typename T, typename type_acc, int ncols_dst>
template <typename T, typename type_acc, int ncols_dst, bool is_multi_token_id = false>
void launch_mul_mat_vec_f_cuda(
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const int64_t ncols, const int64_t nrows,
@ -386,12 +410,13 @@ void launch_mul_mat_vec_f_cuda(
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
const int64_t nsamples_or_ntokens, const int64_t ids_stride, cudaStream_t stream) {
GGML_ASSERT(ncols % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(stride_col_y % 2 == 0);
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0);
const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
@ -415,56 +440,56 @@ void launch_mul_mat_vec_f_cuda(
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
const int nbytes_shared = warp_size*sizeof(float) + (has_fusion ? warp_size*sizeof(float) : 0);
const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
const dim3 block_nums(nrows, nchannels_dst, nsamples_or_ntokens);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {
case 32: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 64: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 96: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 128: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 160: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 192: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 224: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
case 256: {
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256>
(x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256, is_multi_token_id>
(x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream);
} break;
default: {
GGML_ABORT("fatal error");
@ -480,55 +505,88 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
const int64_t ids_stride, cudaStream_t stream) {
const bool has_ids = ids != nullptr;
if (has_ids && ncols_dst > 1) {
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
constexpr int c_ncols_dst = 1;
launch_mul_mat_vec_f_cuda<T, type_acc, c_ncols_dst, true>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
ncols_dst, ids_stride, stream);
return;
}
if (has_ids) {
// Single-token MUL_MAT_ID path
constexpr int c_ncols_dst = 1;
launch_mul_mat_vec_f_cuda<T, type_acc, c_ncols_dst>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
ncols_dst, ids_stride, stream);
return;
}
switch (ncols_dst) {
case 1:
launch_mul_mat_vec_f_cuda<T, type_acc, 1>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 2:
launch_mul_mat_vec_f_cuda<T, type_acc, 2>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 3:
launch_mul_mat_vec_f_cuda<T, type_acc, 3>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 4:
launch_mul_mat_vec_f_cuda<T, type_acc, 4>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 5:
launch_mul_mat_vec_f_cuda<T, type_acc, 5>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 6:
launch_mul_mat_vec_f_cuda<T, type_acc, 6>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 7:
launch_mul_mat_vec_f_cuda<T, type_acc, 7>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
case 8:
launch_mul_mat_vec_f_cuda<T, type_acc, 8>
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
nsamples_dst, ids_stride, stream);
break;
default:
GGML_ABORT("fatal error");
@ -544,21 +602,21 @@ static void mul_mat_vec_f_cuda(
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
enum ggml_prec prec, cudaStream_t stream) {
const int64_t ids_stride, enum ggml_prec prec, cudaStream_t stream) {
if constexpr(std::is_same_v<T, half>) {
if (prec == GGML_PREC_DEFAULT) {
mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
return;
}
}
mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
}
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
@ -573,7 +631,7 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
const size_t ts_src1 = ggml_type_size(src1->type);
const size_t ts_dst = ggml_type_size(dst->type);
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(!ids || ne12 <= MMVF_MAX_BATCH_SIZE);
GGML_ASSERT(ne13 == ne3);
GGML_ASSERT( nb00 == ts_src0);
@ -626,29 +684,31 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
const int64_t ncols_dst = ids ? ne2 : ne1;
const int64_t nchannels_y = ids ? ne11 : ne12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
const int64_t stride_col_dst = ids ? s2 : s1;
const int64_t stride_col_y = ids ? s12 : s11;
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
GGML_ASSERT(!ids || ncols_dst == 1);
const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, prec, ctx.stream());
ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream());
} break;
default:
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
@ -695,19 +755,19 @@ void ggml_cuda_op_mul_mat_vec_f(
const float * src0_d = (const float *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
} break;
case GGML_TYPE_F16: {
const half * src0_d = (const half *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
} break;
case GGML_TYPE_BF16: {
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream);
} break;
default:
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));

View File

@ -1,5 +1,7 @@
#include "common.cuh"
#define MMVF_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVF kernels.
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
const ggml_cuda_mm_fusion_args_host * fusion = nullptr);

View File

@ -137,15 +137,15 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
return 1;
}
// tell the compiler to use as many registers as it wants, see nwarps definition below
template <ggml_type type, int ncols_dst, bool has_fusion>
template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) {
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
const uint32_t ids_stride) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@ -162,11 +162,25 @@ static __global__ void mul_mat_vec_q(
const int blocks_per_row_x = ncols_x / qk;
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
const uint32_t channel_dst = blockIdx.y;
const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
const uint32_t sample_dst = blockIdx.z;
uint32_t token_idx = 0;
uint32_t channel_x;
uint32_t channel_y;
uint32_t sample_dst;
if constexpr (is_multi_token_id) {
// Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
token_idx = blockIdx.z;
channel_x = ids[channel_dst + token_idx * ids_stride];
channel_y = fastmodulo(channel_dst, nchannels_y);
sample_dst = 0;
} else {
channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
sample_dst = blockIdx.z;
}
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
const uint32_t sample_y = sample_dst;
@ -188,11 +202,11 @@ static __global__ void mul_mat_vec_q(
active_glu = fusion.glu_op;
}
const uint32_t channel_bias = ids ? channel_x : channel_dst;
float x_biases[ncols_dst] = { 0.0f };
float gate_biases[ncols_dst] = { 0.0f };
if constexpr (has_fusion) {
const uint32_t channel_bias = ids ? channel_x : channel_dst;
if (use_bias) {
x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
// 1. Hide latency by prefetching bias and gate here
@ -222,6 +236,9 @@ static __global__ void mul_mat_vec_q(
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};
const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
if constexpr (is_multi_token_id) {
y += token_idx*stride_col_y;
}
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@ -275,6 +292,10 @@ static __global__ void mul_mat_vec_q(
dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;
if constexpr (is_multi_token_id) {
dst += token_idx*stride_col_dst;
}
// sum up partial sums and write back result
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
@ -335,40 +356,41 @@ static __global__ void mul_mat_vec_q(
}
static std::pair<dim3, dim3> calc_launch_params(
const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y,
const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
const int warp_size, const mmvq_parameter_table_id table_id) {
const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id);
const dim3 block_nums(nblocks, nchannels_y, nsamples_y);
const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens);
const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1);
return {block_nums, block_dims};
}
template<ggml_type type, int c_ncols_dst>
template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false>
static void mul_mat_vec_q_switch_fusion(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) {
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared,
const uint32_t ids_stride, cudaStream_t stream) {
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (c_ncols_dst == 1) {
if (has_fusion) {
mul_mat_vec_q<type, c_ncols_dst, true><<<block_nums, block_dims, nbytes_shared, stream>>>
mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
return;
}
}
GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");
mul_mat_vec_q<type, c_ncols_dst, false><<<block_nums, block_dims, nbytes_shared, stream>>>
mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
}
template <ggml_type type>
@ -379,7 +401,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
cudaStream_t stream) {
const int ids_stride, cudaStream_t stream) {
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
@ -393,8 +415,19 @@ static void mul_mat_vec_q_switch_ncols_dst(
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
const bool has_ids = ids != nullptr;
if (has_ids && ncols_dst > 1) {
// Multi-token MUL_MAT_ID path only - single-token goes through regular path below
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, ids_stride, stream);
return;
}
GGML_ASSERT(!ids || ncols_dst == 1);
switch (ncols_dst) {
case 1: {
constexpr int c_ncols_dst = 1;
@ -402,7 +435,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 2: {
constexpr int c_ncols_dst = 2;
@ -410,7 +443,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 3: {
constexpr int c_ncols_dst = 3;
@ -418,7 +451,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 4: {
constexpr int c_ncols_dst = 4;
@ -426,7 +459,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 5: {
constexpr int c_ncols_dst = 5;
@ -434,7 +467,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 6: {
constexpr int c_ncols_dst = 6;
@ -442,7 +475,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 7: {
constexpr int c_ncols_dst = 7;
@ -450,7 +483,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
case 8: {
constexpr int c_ncols_dst = 8;
@ -458,7 +491,7 @@ static void mul_mat_vec_q_switch_ncols_dst(
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
dims.first, dims.second, 0, ids_stride, stream);
} break;
default:
GGML_ABORT("fatal error");
@ -474,127 +507,127 @@ static void mul_mat_vec_q_switch_type(
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
cudaStream_t stream) {
const int ids_stride, cudaStream_t stream) {
switch (type_x) {
case GGML_TYPE_Q4_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_MXFP4:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ1_M:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
break;
default:
GGML_ABORT("fatal error");
@ -622,7 +655,7 @@ void ggml_cuda_mul_mat_vec_q(
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(!ids || ne12 <= MMVQ_MAX_BATCH_SIZE);
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
@ -693,11 +726,13 @@ void ggml_cuda_mul_mat_vec_q(
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
mul_mat_vec_q_switch_type(
src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, stream);
ne03, ne3, s03, s13, s3, ids_stride, stream);
}
void ggml_cuda_op_mul_mat_vec_q(
@ -726,7 +761,7 @@ void ggml_cuda_op_mul_mat_vec_q(
ggml_cuda_mm_fusion_args_device fusion_local{};
mul_mat_vec_q_switch_type(
src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, stream);
GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);
}

View File

@ -1,8 +1,20 @@
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}" OR NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT and HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}")
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT point to the correct Hexagon SDK installation.")
endif()
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
message("Try to read HEXAGON_TOOLS_ROOT from hexagon_sdk.json")
file(READ "${HEXAGON_SDK_ROOT}/hexagon_sdk.json" HEXAGON_SDK_CONFIG_PATH)
string(JSON HEXAGON_TOOLS_PATH GET ${HEXAGON_SDK_CONFIG_PATH} "root" "tools" "info" 0 "path")
message("Found HEXAGON_TOOLS_PATH: ${HEXAGON_TOOLS_PATH}")
set(HEXAGON_TOOLS_ROOT "${HEXAGON_SDK_ROOT}/${HEXAGON_TOOLS_PATH}")
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
message(FATAL_ERROR "Make sure HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
endif()
endif()
message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")

View File

@ -17,6 +17,12 @@
#include "htp-msg.h"
#include "htp-ops.h"
static inline HVX_Vector hvx_load_f32_to_f16(const HVX_Vector * restrict src, const HVX_Vector zero) {
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(src[0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(src[1], zero); // 32 elements
return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
}
// Dot product of FP32 and FP16 vectors, accumulating to float
static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
@ -33,23 +39,19 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
#pragma unroll(4)
for (i = 0; i < nvec; i++) {
// Load y (fp32) and convert into fp16
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x_hf = vx[i];
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
if (nloe) {
// Load y (fp32) and convert into fp16
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x_hf = vx[i];
@ -62,13 +64,72 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
}
hvx_vec_store_u(r, 4, rsum);
// Dot product of FP32 and FP16 vectors, accumulating to float
static inline void hvx_dot_f32_f16_aa_rx2(float * restrict r,
const void * restrict y,
const void * restrict x0,
const void * restrict x1,
unsigned int n,
float s) {
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
const HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
uint32_t i = 0;
#pragma unroll(2)
for (i = 0; i < nvec; i++) {
// Load y (fp32) and convert into fp16
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
if (nloe) {
// Load y (fp32) and convert into fp16
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
// Zero-out unused elements
// Note that we need to clear both x and y because they may contain NANs
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
x0_hf = Q6_V_vand_QV(bmask, x0_hf);
x1_hf = Q6_V_vand_QV(bmask, x1_hf);
y_hf = Q6_V_vand_QV(bmask, y_hf);
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
}
// Dot product of two F16 vectors, accumulating to float
@ -91,7 +152,7 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
if (nloe) {
@ -103,12 +164,62 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
hvx_vec_store_u(r, 4, rsum);
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
}
static inline void hvx_dot_f16_f16_aa_rx2(float * restrict r,
const void * restrict y,
const void * restrict x0,
const void * restrict x1,
unsigned int n,
float s) {
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
const HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
uint32_t i = 0;
#pragma unroll(4)
for (i = 0; i < nvec; i++) {
HVX_Vector y_hf = vy[i];
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
if (nloe) {
HVX_Vector y_hf = vy[i];
// Load x (fp16) and zero-out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
HVX_Vector x0_hf = Q6_V_vand_QV(bmask, vx0[i]);
HVX_Vector x1_hf = Q6_V_vand_QV(bmask, vx1[i]);
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
}
// MAD: y (F32) += x (F16) * s (float)
@ -317,20 +428,22 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
// Inner loop processing the block from VTCM
uint32_t ic = 0;
const bool is_q_fp32 = (q->type == HTP_TYPE_F32);
// Process in blocks of 32 (VLEN_FP32)
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 == 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
HVX_Vector_x4 scores_x4;
HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
// 1. Compute scores
float __attribute__((aligned(VLEN))) scores_arr[FLASH_ATTN_BLOCK_SIZE];
for (int j = 0; j < VLEN_FP32; ++j) {
float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
for (int j = 0; j < VLEN_FP32; j += 2) {
const uint32_t cur_ic = ic + j;
const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
if (q->type == HTP_TYPE_F32) {
hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
if (is_q_fp32) {
hvx_dot_f32_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
} else {
hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
}
}
@ -403,7 +516,7 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
float s_val;
const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
if (q->type == HTP_TYPE_F32) {
if (is_q_fp32) {
hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
} else {
hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);

View File

@ -28,19 +28,16 @@ static void hvx_vec_dump_f16(char * pref, HVX_Vector v) {
}
static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) {
union {
HVX_Vector v;
float d[32];
} u = { .v = v };
HVX_VectorAlias u = { .v = v };
const uint32_t n0 = n / 16;
const uint32_t n1 = n % 16;
int i = 0;
for (; i < n0; i++) {
hex_dump_f32_line(pref, u.d + (16 * i), 16);
hex_dump_f32_line(pref, u.fp32 + (16 * i), 16);
}
if (n1) {
hex_dump_f32_line(pref, u.d + (16 * i), n1);
hex_dump_f32_line(pref, u.fp32 + (16 * i), n1);
}
}

View File

@ -44,6 +44,45 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) {
return hvx_vec_reduce_sum_n_qf32(in, 32);
}
#if __HVX_ARCH__ > 75
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 16));
return sum_sf;
}
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vsf_vadd_VsfVsf(sum, sum_t); // elementwise sum
width = width << 1;
}
return sum;
}
#else
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 16));
return Q6_Vsf_equals_Vqf32(sum_qf);
}
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
@ -57,6 +96,8 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n)
return sum;
}
#endif
static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) {
return hvx_vec_reduce_sum_n_f32(in, 32);
}

View File

@ -11,6 +11,7 @@
#include "hex-dma.h"
#include "hvx-utils.h"
#include "hvx-dump.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
@ -320,7 +321,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -344,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -362,14 +363,14 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
// Zero out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -402,7 +403,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
@ -432,8 +433,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -456,20 +457,18 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -493,7 +492,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -517,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -535,14 +534,14 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
// Zero out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -605,8 +604,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -629,20 +628,18 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
@ -669,7 +666,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -708,7 +705,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers
@ -741,14 +738,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
// Zero-out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -781,13 +778,13 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
// Apply scale to acc and accumulate into the row sum (qf32).
// Apply scale to acc and accumulate into the row sum (f32).
const uint32_t nb = n / qk; // num full blocks
int32_t nloe = n % qk; // num leftover elemements (must be signed)
@ -829,8 +826,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers
@ -867,24 +864,22 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
// Zero-out unused scales
// Zero-out unused values
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -913,7 +908,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}
@ -957,11 +952,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n,
rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)));
}
rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1));
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -990,7 +982,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}
@ -1042,7 +1034,8 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
// Convert into fp32 and reduce
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}

View File

@ -154,8 +154,8 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
v_pad[i] = v3;
}
v = hvx_vec_reduce_sum_qf32(sum_vec);
sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec));
sum_vec = hvx_vec_repl4(v);
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec);

View File

@ -57,8 +57,8 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v);
sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
sum_v = hvx_vec_repl4(reduced_sum);
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);

View File

@ -71,7 +71,7 @@ else()
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
# ref: https://github.com/ggml-org/whisper.cpp/issues/1720
# note: adding -g causes segmentation fault during compile
#set(XC_FLAGS -fno-fast-math -fno-inline -g)
set(XC_FLAGS -fno-fast-math -fno-inline)

View File

@ -15,14 +15,22 @@ typedef struct ggml_metal * ggml_metal_t;
ggml_metal_t ggml_metal_init(ggml_metal_device_t dev);
void ggml_metal_free(ggml_metal_t ctx);
const char * ggml_metal_get_name(ggml_metal_t ctx);
void ggml_metal_synchronize(ggml_metal_t ctx);
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
enum ggml_status ggml_metal_graph_compute (ggml_metal_t ctx, struct ggml_cgraph * gf);
void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf);
void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev);
void ggml_metal_event_wait (ggml_metal_t ctx, ggml_metal_event_t ev);
ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx);
void ggml_metal_set_n_cb (ggml_metal_t ctx, int n_cb);
void ggml_metal_set_abort_callback (ggml_metal_t ctx, ggml_abort_callback abort_callback, void * user_data);
bool ggml_metal_supports_family (ggml_metal_t ctx, int family);

View File

@ -24,9 +24,13 @@ struct ggml_metal_command_buffer {
};
struct ggml_metal {
char name[128];
ggml_metal_device_t dev;
ggml_metal_library_t lib;
ggml_metal_event_t ev_cpy; // for async copies
dispatch_queue_t d_queue;
// additional, inference-time compiled pipelines
@ -117,7 +121,11 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
}
}
//const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
res->ev_cpy = ggml_metal_device_event_init(dev);
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
snprintf(res->name, sizeof(res->name), "%s", props_dev->name);
res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
@ -206,9 +214,15 @@ void ggml_metal_free(ggml_metal_t ctx) {
dispatch_release(ctx->d_queue);
ggml_metal_device_event_free(ctx->dev, ctx->ev_cpy);
free(ctx);
}
const char * ggml_metal_get_name(ggml_metal_t ctx) {
return ctx->name;
}
void ggml_metal_synchronize(ggml_metal_t ctx) {
// wait for any backend operations to finish
if (ctx->cmd_buf_last) {
@ -273,8 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
// wrap the source data into a Metal buffer
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];
length:size
options:MTLResourceStorageModeShared];
GGML_ASSERT(buf_src);
@ -316,9 +330,9 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
@autoreleasepool {
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
GGML_ASSERT(buf_dst);
@ -356,6 +370,49 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
}
}
bool ggml_metal_cpy_tensor_async(ggml_metal_t ctx_src, ggml_metal_t ctx_dst, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@autoreleasepool {
struct ggml_metal_buffer_id bid_src = ggml_metal_get_buffer_id(src);
struct ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(dst);
if (bid_src.metal == nil || bid_dst.metal == nil) {
return false;
}
// queue the copy operation into the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx_src->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
sourceOffset:bid_src.offs
toBuffer:bid_dst.metal
destinationOffset:bid_dst.offs
size:ggml_nbytes(src)];
[encoder endEncoding];
ggml_metal_event_t ev_cpy = ggml_metal_get_ev_cpy(ctx_src);
ggml_metal_event_record(ctx_src, ev_cpy);
[cmd_buf commit];
// do not wait here for completion
//[cmd_buf waitUntilCompleted];
// instead, remember a reference to the command buffer and wait for it later if needed
[ctx_src->cmd_bufs_ext addObject:cmd_buf];
ctx_src->cmd_buf_last = cmd_buf;
[cmd_buf retain];
ggml_metal_event_wait(ctx_dst, ev_cpy);
return true;
}
}
enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph * gf) {
// number of nodes encoded by the main thread (empirically determined)
const int n_main = 64;
@ -530,6 +587,42 @@ void ggml_metal_graph_optimize(ggml_metal_t ctx, struct ggml_cgraph * gf) {
//printf("%s: graph optimize took %.3f ms\n", __func__, (ggml_time_us() - t_start) / 1000.0);
}
void ggml_metal_event_record(ggml_metal_t ctx, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
ggml_metal_event_encode_signal(ev, cmd_buf);
[cmd_buf commit];
[ctx->cmd_bufs_ext addObject:cmd_buf];
ctx->cmd_buf_last = cmd_buf;
[cmd_buf retain];
}
}
void ggml_metal_event_wait(ggml_metal_t ctx, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
ggml_metal_event_encode_wait(ev, cmd_buf);
[cmd_buf commit];
[ctx->cmd_bufs_ext addObject:cmd_buf];
ctx->cmd_buf_last = cmd_buf;
[cmd_buf retain];
}
}
ggml_metal_event_t ggml_metal_get_ev_cpy(ggml_metal_t ctx) {
return ctx->ev_cpy;
}
void ggml_metal_set_n_cb(ggml_metal_t ctx, int n_cb) {
if (ctx->n_cb != n_cb) {
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_COMMAND_BUFFERS);

View File

@ -17,10 +17,12 @@ struct ggml_metal_device_deleter {
typedef std::unique_ptr<ggml_metal_device, ggml_metal_device_deleter> ggml_metal_device_ptr;
ggml_metal_device_t ggml_metal_device_get(void) {
static ggml_metal_device_ptr ctx { ggml_metal_device_init() };
ggml_metal_device_t ggml_metal_device_get(int device) {
static std::vector<ggml_metal_device_ptr> devs;
return ctx.get();
devs.emplace_back(ggml_metal_device_init(device));
return devs.back().get();
}
struct ggml_metal_pipelines {

View File

@ -205,7 +205,9 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
//
struct ggml_metal_device_props {
int device;
char name[128];
char desc[128];
size_t max_buffer_size;
size_t max_working_set_size;
@ -224,11 +226,15 @@ struct ggml_metal_device_props {
int op_offload_min_batch_size;
};
ggml_metal_device_t ggml_metal_device_init(void);
typedef struct ggml_metal_event * ggml_metal_event_t;
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
void ggml_metal_event_encode_wait (ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf);
ggml_metal_device_t ggml_metal_device_init(int device);
void ggml_metal_device_free(ggml_metal_device_t dev);
// return a singleton that is automatically destroyed when the program exits
ggml_metal_device_t ggml_metal_device_get(void);
ggml_metal_device_t ggml_metal_device_get(int device);
void * ggml_metal_device_get_obj (ggml_metal_device_t dev); // id<MTLDevice>
void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQueue>
@ -240,6 +246,10 @@ void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev);
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev);
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev);
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);

View File

@ -24,9 +24,6 @@
static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
static const NSInteger MTLGPUFamilyMetal4_GGML = 5002;
// virtual address for GPU memory allocations
static atomic_uintptr_t g_addr_device = 0x000000400ULL;
#if !GGML_METAL_EMBED_LIBRARY
// Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@ -523,6 +520,9 @@ struct ggml_metal_device {
ggml_metal_library_t library;
struct ggml_metal_device_props props;
// virtual address for GPU memory allocations
atomic_uintptr_t addr_virt;
};
//
@ -618,7 +618,7 @@ void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
free(rsets);
}
ggml_metal_device_t ggml_metal_device_init(void) {
ggml_metal_device_t ggml_metal_device_init(int device) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
assert(dev != NULL);
@ -632,6 +632,9 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
}
dev->addr_virt = 0x000000400ULL;
dev->props.device = device;
dev->props.has_simdgroup_reduction = [dev->mtl_device supportsFamily:MTLGPUFamilyApple7];
dev->props.has_simdgroup_reduction |= [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
@ -792,7 +795,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
dev->props.max_working_set_size = dev->mtl_device.maxBufferLength;
}
strncpy(dev->props.name, [[dev->mtl_device name] UTF8String], sizeof(dev->props.name) - 1);
snprintf(dev->props.name, sizeof(dev->props.name), "%s%d", "MTL", device);
snprintf(dev->props.desc, sizeof(dev->props.desc), "%s", [[dev->mtl_device name] UTF8String]);
dev->library = ggml_metal_library_init(dev);
if (!dev->library) {
@ -922,6 +926,59 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
}
struct ggml_metal_event {
void * obj; // id<MTLEvent>
atomic_int value;
};
void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
[cmd_buf encodeSignalEvent:event value:atomic_fetch_add_explicit(&ev->value, 1, memory_order_relaxed) + 1];
}
void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
id<MTLEvent> event = (id<MTLEvent>)ev->obj;
id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
}
ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
id<MTLEvent> event = [dev->mtl_device newEvent];
ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));
ev->obj = (__bridge void *)event;
ev->value = 0;
return ev;
}
void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
id<MTLEvent> event = ev->obj;
[event release];
free(ev);
GGML_UNUSED(dev);
}
void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
@autoreleasepool {
id<MTLEvent> event = ev->obj;
id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
[cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
[cmd_buf commit];
[cmd_buf waitUntilCompleted];
}
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
if (@available(macOS 10.12, iOS 16.0, *)) {
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
@ -1344,8 +1401,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
res->all_data = ggml_metal_host_malloc(size_aligned);
res->is_shared = true;
} else {
// use virtual address from g_addr_device counter
res->all_data = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
// use virtual address
res->all_data = (void *) atomic_fetch_add_explicit(&dev->addr_virt, size_aligned, memory_order_relaxed);
res->is_shared = false;
}
res->all_size = size_aligned;

View File

@ -81,10 +81,10 @@
#define FC_COUNT_EQUAL 1000
// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPTG 8
#define OP_FLASH_ATTN_EXT_NQPSG 8
#define OP_FLASH_ATTN_EXT_NCPSG 64
#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1
#define OP_FLASH_ATTN_EXT_VEC_NQPSG 1
#define OP_FLASH_ATTN_EXT_VEC_NCPSG 32
// kernel argument structs

View File

@ -2295,7 +2295,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_blk(const ggml_tensor * op) {
// return res;
//}
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG;
const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPSG : OP_FLASH_ATTN_EXT_NQPSG;
const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG;
const int64_t ne1 = (ne01 + nqptg - 1)/nqptg;
@ -2411,7 +2411,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
if (!ggml_metal_op_flash_attn_ext_use_vec(op)) {
// half8x8 kernel
const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup
const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup
const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup
GGML_ASSERT(nqptg <= 32);
@ -2578,9 +2578,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
#undef FATTN_SMEM
} else {
// half4x4 kernel
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup
const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPSG; // queries per threadgroup
const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !!
const int nkpsg = 1*ncpsg;
const int nhptg = 1; // heads per threadgroup
GGML_ASSERT(nqptg <= 32);
GGML_ASSERT(nqptg % 1 == 0);
@ -2632,6 +2632,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
ggml_metal_op_concurrency_reset(ctx);
}
// note: for simplicity assume the K is larger or equal than V
GGML_ASSERT(ne10 >= ne20);
// ne00 + 2*ncpsg*(nsg)
// for each query, we load it as f16 in shared memory (ne00)
// and store the soft_max values and the mask
@ -2639,28 +2642,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
// ne20*(nsg)
// each simdgroup has a full f32 head vector in shared mem to accumulate results
//
#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16))
int64_t nsgmax = 2;
while (true) {
const size_t smem = FATTN_SMEM(nsgmax);
// avoid using more than half of the threadgroup memory - can cause slow downs especially for large head sizes
if (smem > props_dev->max_theadgroup_memory_size/2) {
break;
}
nsgmax *= 2;
}
nsgmax /= 2;
// simdgroups per threadgroup (a.k.a. warps)
//const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)));
const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32)));
#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16))
int64_t nsg = 1;
while (nsg <= nsgt) {
nsg *= 2;
}
nsg /= 2;
// workgroups
// each workgroup handles nsg*nkpsg cache values
@ -2673,7 +2657,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
} else {
nwg = 32;
nsg = 1;
while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) {
while (2*nwg*nsg*ncpsg < ne11 && nsg < 4) {
nsg *= 2;
}
}
@ -2739,7 +2723,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
} else {
// sanity checks
assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0);
@ -2752,7 +2736,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer(enc, bid_tmp, 7);
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1);
// sync the 2 kernels
ggml_metal_op_concurrency_reset(ctx);

View File

@ -7,11 +7,12 @@
#include "ggml-metal-context.h"
#include "ggml-metal-ops.h"
// globals
#define GGML_METAL_NAME "MTL"
#define GGML_METAL_MAX_DEVICES 16
// initialized in ggml_backend_metal_reg
static ggml_backend_reg g_ggml_metal_reg;
static ggml_backend_device g_ggml_metal_device;
// number of Metal devices
// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices
static int g_devices = 1;
////////////////////////////////////////////////////////////////////////////////
// backend interface
@ -165,10 +166,28 @@ static ggml_backend_buffer_i ggml_backend_metal_buffer_private_i = {
/* .reset = */ NULL,
};
static bool ggml_backend_buffer_is_metal(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_metal_buffer_shared_free_buffer ||
buffer->iface.free_buffer == ggml_backend_metal_buffer_private_free_buffer;
}
//
// buffer types
//
struct ggml_backend_metal_buffer_type {
int device;
std::string name;
};
struct ggml_backend_metal_buffer_type_deleter {
void operator()(ggml_backend_metal_buffer_type * ctx) const {
delete ctx;
}
};
typedef std::unique_ptr<ggml_backend_metal_buffer_type, ggml_backend_metal_buffer_type_deleter> ggml_backend_metal_buffer_type_ptr;
// common method for allocating shread or private Metal buffers
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size, bool shared) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)buft->device->context;
@ -218,9 +237,9 @@ static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_
// default (shared) buffer type
static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) {
return "Metal";
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
GGML_UNUSED(buft);
return ctx->name.c_str();
}
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_shared_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -249,29 +268,54 @@ static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_ty
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
return &ggml_backend_buffer_type_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);
for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx =
new ggml_backend_metal_buffer_type {
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i),
};
ctxs.emplace_back(raw_ctx);
ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_shared_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};
bufts.emplace_back(buft);
}
initialized = true;
}
return &bufts[device];
}
// default (private) buffer type
static const char * ggml_backend_metal_buffer_type_private_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Private";
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
GGML_UNUSED(buft);
return ctx->name.c_str();
}
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_private_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -300,29 +344,53 @@ static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_t
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
return &ggml_backend_buffer_type_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);
for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Private"
};
ctxs.emplace_back(raw_ctx);
ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_private_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_private_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};
bufts.emplace_back(buft);
}
initialized = true;
}
return &bufts[device];
}
// mapped buffer type
static const char * ggml_backend_metal_buffer_type_mapped_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";
ggml_backend_metal_buffer_type * ctx = (ggml_backend_metal_buffer_type *)buft->context;
GGML_UNUSED(buft);
return ctx->name.c_str();
}
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_mapped_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@ -352,31 +420,55 @@ static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_ty
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) {
// note: not obvious, but this buffer type still needs to implement .alloc_buffer:
// https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
static ggml_backend_buffer_type ggml_backend_buffer_type_mapped_metal = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
},
/* .device = */ &g_ggml_metal_device,
/* .context = */ NULL,
};
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(int device) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
return &ggml_backend_buffer_type_mapped_metal;
static std::vector<ggml_backend_buffer_type> bufts;
static std::vector<ggml_backend_metal_buffer_type_ptr> ctxs;
static bool initialized = false;
if (!initialized) {
bufts.reserve(g_devices);
ctxs.reserve(g_devices);
for (int i = 0; i < g_devices; ++i) {
ggml_backend_metal_buffer_type * raw_ctx = new ggml_backend_metal_buffer_type{
/* .device = */ i,
/* .name = */ GGML_METAL_NAME + std::to_string(i) + "_Mapped"
};
ctxs.emplace_back(raw_ctx);
// note: not obvious, but this buffer type still needs to implement .alloc_buffer:
// https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2333177099
ggml_backend_buffer_type buft = {
/* .iface = */ {
/* .get_name = */ ggml_backend_metal_buffer_type_mapped_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size,
/* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size,
/* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_metal_reg(), i),
/* .context = */ raw_ctx,
};
bufts.emplace_back(buft);
}
initialized = true;
}
return &bufts[device];
}
// backend
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";
ggml_metal_t ctx = (ggml_metal_t)backend->context;
GGML_UNUSED(backend);
return ggml_metal_get_name(ctx);
}
static void ggml_backend_metal_free(ggml_backend_t backend) {
@ -409,12 +501,24 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const gg
}
static bool ggml_backend_metal_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
return false;
if (!ggml_backend_is_metal(backend_src) || !ggml_backend_is_metal(backend_dst)) {
return false;
}
GGML_UNUSED(backend_src);
GGML_UNUSED(backend_dst);
GGML_UNUSED(src);
GGML_UNUSED(dst);
if (!ggml_backend_buffer_is_metal(src->buffer) || !ggml_backend_buffer_is_metal(dst->buffer)) {
return false;
}
ggml_metal_t ctx_src = (ggml_metal_t)backend_src->context;
ggml_metal_t ctx_dst = (ggml_metal_t)backend_dst->context;
//ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
//ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
//ggml_metal_buffer_t buf_ctx_src = (ggml_metal_buffer_t)buf_src->context;
//ggml_metal_buffer_t buf_ctx_dst = (ggml_metal_buffer_t)buf_dst->context;
return ggml_metal_cpy_tensor_async(ctx_src, ctx_dst, src, dst);
}
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
@ -423,6 +527,20 @@ static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend,
return ggml_metal_graph_compute(ctx, cgraph);
}
static void ggml_backend_metal_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
ggml_metal_event_record(ctx, ev);
}
static void ggml_backend_metal_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
ggml_metal_event_wait(ctx, ev);
}
static void ggml_backend_metal_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
@ -435,7 +553,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
ggml_metal_t ctx = (ggml_metal_t)backend->context;
ggml_metal_set_n_cb(ctx, n_cb);
}
static ggml_backend_i ggml_backend_metal_i = {
@ -450,12 +567,8 @@ static ggml_backend_i ggml_backend_metal_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
// the events API is needed only for multi-GPU setups, so likely no need to implement it for Metal
// in any case, these docs seem relevant if we ever decide to implement it:
// https://developer.apple.com/documentation/metal/mtlcommandbuffer#Synchronizing-Passes-with-Events
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .event_record = */ ggml_backend_metal_event_record,
/* .event_wait = */ ggml_backend_metal_event_wait,
/* .graph_optimize = */ ggml_backend_metal_graph_optimize,
};
@ -519,15 +632,17 @@ void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
// backend device
static const char * ggml_backend_metal_device_get_name(ggml_backend_dev_t dev) {
return "Metal";
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
GGML_UNUSED(dev);
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
return props_dev->name;
}
static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t dev) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
return ggml_metal_device_get_props(ctx_dev)->name;
return ggml_metal_device_get_props(ctx_dev)->desc;
}
static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
@ -550,14 +665,14 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, ggml_bac
ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* .async = */ true,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
/* .async = */ true,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ true,
};
}
static ggml_backend_t ggml_backend_metal_device_init(ggml_backend_dev_t dev, const char * params) {
static ggml_backend_t ggml_backend_metal_device_init_backend(ggml_backend_dev_t dev, const char * params) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
ggml_metal_t ctx = ggml_metal_init(ctx_dev);
@ -587,7 +702,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_device_get_buffer_type(ggml
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared() : ggml_backend_metal_buffer_type_private();
return props_dev->use_shared_buffers ? ggml_backend_metal_buffer_type_shared(props_dev->device) : ggml_backend_metal_buffer_type_private(props_dev->device);
}
static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@ -595,7 +710,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
ggml_metal_buffer_t res = ggml_metal_buffer_map(ctx_dev, ptr, size, max_tensor_size);
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(), ggml_backend_metal_buffer_shared_i, res, size);
const ggml_metal_device_props * props_dev = ggml_metal_device_get_props(ctx_dev);
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type_mapped(props_dev->device), ggml_backend_metal_buffer_shared_i, res, size);
}
static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
@ -606,9 +723,10 @@ static bool ggml_backend_metal_device_supports_op(ggml_backend_dev_t dev, const
static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return
buft->device == dev && (
buft->iface.get_name == ggml_backend_metal_buffer_type_shared_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_type_private_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name;
buft->iface.get_name == ggml_backend_metal_buffer_type_mapped_get_name);
GGML_UNUSED(dev);
}
@ -632,45 +750,97 @@ static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const g
get_op_batch_size(op) >= ggml_metal_device_get_props(ctx_dev)->op_offload_min_batch_size;
}
static ggml_backend_event_t ggml_backend_metal_device_event_new(ggml_backend_dev_t dev) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
ggml_metal_event_t event = ggml_metal_device_event_init(ctx_dev);
GGML_ASSERT(event);
ggml_backend_event_t ev = new ggml_backend_event {
/* .device = */ dev,
/* .context = */ event,
};
return ev;
}
static void ggml_backend_metal_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
ggml_metal_event_t ev = (ggml_metal_event_t)event->context;
ggml_metal_device_event_free(ctx_dev, ev);
delete event;
}
static void ggml_backend_metal_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
ggml_metal_device_t ctx_dev = (ggml_metal_device_t)dev->context;
ggml_metal_event_t evt = (ggml_metal_event_t)event->context;
ggml_metal_device_event_synchronize(ctx_dev, evt);
}
static ggml_backend_device_i ggml_backend_metal_device_i = {
/* .get_name = */ ggml_backend_metal_device_get_name,
/* .get_description = */ ggml_backend_metal_device_get_description,
/* .get_memory = */ ggml_backend_metal_device_get_memory,
/* .get_type = */ ggml_backend_metal_device_get_type,
/* .get_props = */ ggml_backend_metal_device_get_props,
/* .init_backend = */ ggml_backend_metal_device_init,
/* .init_backend = */ ggml_backend_metal_device_init_backend,
/* .get_buffer_type = */ ggml_backend_metal_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_metal_device_buffer_mapped,
/* .supports_op = */ ggml_backend_metal_device_supports_op,
/* .supports_buft = */ ggml_backend_metal_device_supports_buft,
/* .offload_op = */ ggml_backend_metal_device_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
/* .event_new = */ ggml_backend_metal_device_event_new,
/* .event_free = */ ggml_backend_metal_device_event_free,
/* .event_synchronize = */ ggml_backend_metal_device_event_synchronize,
};
// backend registry
struct ggml_backend_metal_reg {
std::vector<ggml_backend_dev_t> devices;
};
typedef struct ggml_backend_metal_reg * ggml_backend_metal_reg_t;
static ggml_backend_metal_reg_t ggml_backend_metal_reg_init(void) {
ggml_backend_metal_reg_t ctx = new struct ggml_backend_metal_reg;
return ctx;
}
static void ggml_backend_metal_reg_free(ggml_backend_metal_reg_t ctx) {
delete ctx;
}
struct ggml_backend_metal_reg_deleter {
void operator()(ggml_backend_metal_reg_t ctx) {
ggml_backend_metal_reg_free(ctx);
}
};
typedef std::unique_ptr<struct ggml_backend_metal_reg, ggml_backend_metal_reg_deleter> ggml_backend_metal_reg_ptr;
static const char * ggml_backend_metal_reg_get_name(ggml_backend_reg_t reg) {
return "Metal";
return GGML_METAL_NAME;
GGML_UNUSED(reg);
}
static size_t ggml_backend_metal_reg_device_count(ggml_backend_reg_t reg) {
return 1;
GGML_UNUSED(reg);
ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
return ctx->devices.size();
}
static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
return &g_ggml_metal_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
ggml_backend_metal_reg_t ctx = (ggml_backend_metal_reg_t)reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
}
static ggml_backend_feature g_ggml_backend_metal_features[] = {
@ -698,27 +868,67 @@ static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const
static ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get,
/* .get_device_count = */ ggml_backend_metal_reg_device_count,
/* .get_device = */ ggml_backend_metal_reg_device_get,
/* .get_proc_address = */ ggml_backend_metal_get_proc_address,
};
ggml_backend_reg_t ggml_backend_metal_reg(void) {
{
g_ggml_metal_reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ NULL,
};
static ggml_backend_dev_t ggml_backend_metal_device_init(ggml_backend_reg_t reg, int device) {
return new ggml_backend_device {
/* .iface = */ ggml_backend_metal_device_i,
/* .reg = */ reg,
/* .context = */ ggml_metal_device_get(device),
};
}
g_ggml_metal_device = {
/* .iface = */ ggml_backend_metal_device_i,
/* .reg = */ &g_ggml_metal_reg,
/* .context = */ ggml_metal_device_get(),
};
static void ggml_backend_metal_device_free(ggml_backend_dev_t dev) {
delete dev;
}
struct ggml_backend_device_deleter {
void operator()(ggml_backend_dev_t ctx) {
ggml_backend_metal_device_free(ctx);
}
};
typedef std::unique_ptr<ggml_backend_device, ggml_backend_device_deleter> ggml_backend_device_ptr;
ggml_backend_reg_t ggml_backend_metal_reg(void) {
static ggml_backend_reg reg;
static bool initialized = false;
{
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
const char * env = getenv("GGML_METAL_DEVICES");
if (env) {
g_devices = atoi(env);
}
static std::vector<ggml_backend_device_ptr> devs;
if (!initialized) {
static ggml_backend_metal_reg_ptr reg_ctx(ggml_backend_metal_reg_init());
for (int i = 0; i < g_devices; ++i) {
auto * dev = ggml_backend_metal_device_init(&reg, i);
devs.emplace_back(dev);
reg_ctx->devices.push_back(dev);
}
reg = {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_metal_reg_i,
/* .context = */ reg_ctx.get(),
};
}
initialized = true;
}
return &g_ggml_metal_reg;
return &reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)

View File

@ -5931,7 +5931,7 @@ template<
void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &),
short DK, // K head size
short DV, // V head size
short Q = OP_FLASH_ATTN_EXT_NQPTG, // queries per threadgroup
short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext(
constant ggml_metal_kargs_flash_attn_ext & args,
@ -6141,11 +6141,10 @@ template<
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
short DK, // K head size
short DV, // V head size
short NE, // head elements per thread
short Q, // queries per threadgroup
short C, // cache items per threadgroup
short NSG> // number of simd groups
void kernel_flash_attn_ext_vec_impl(
short NE = 4, // head elements per thread
short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext_vec(
constant ggml_metal_kargs_flash_attn_ext_vec & args,
device const char * q,
device const char * k,
@ -6162,6 +6161,7 @@ void kernel_flash_attn_ext_vec_impl(
static_assert(DV % 32 == 0, "DV must be divisible by 32");
#define NWG (FC_flash_attn_ext_vec_nwg)
#define NSG (FC_flash_attn_ext_vec_nsg)
#define NS10 (FC_flash_attn_ext_vec_ns10)
#define NS20 (FC_flash_attn_ext_vec_ns20)
@ -6190,12 +6190,12 @@ void kernel_flash_attn_ext_vec_impl(
const short T = PK + NSG*SH; // shared memory size per query in (half)
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*PK); // scratch buffer for attention
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*PK); // same as above but in s4_t
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + Q*PK); // scratch buffer for mask
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + Q*T); // scratch buffer for the results
//threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data
threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t
threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention
threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t
threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask
threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results
// store the result for all queries in shared memory (the O matrix from the paper)
so4 += tiisg;
@ -6213,11 +6213,13 @@ void kernel_flash_attn_ext_vec_impl(
// load heads from Q to shared memory
device const float4 * q4 = (device const float4 *) ((device const char *) q);
for (short i = tiisg; i < PK4; i += NW) {
if (iq1 < args.ne01 && i < DK4) {
sq4[i] = (q4_t) q4[i];
} else {
sq4[i] = (q4_t) 0.0f;
if (iq1 < args.ne01) {
for (short i = tiisg; i < PK4; i += NW) {
if (i < DK4) {
sq4[i] = (q4_t) q4[i];
} else {
sq4[i] = (q4_t) 0.0f;
}
}
}
@ -6295,7 +6297,7 @@ void kernel_flash_attn_ext_vec_impl(
}
// skip -INF blocks
if (simd_max(sm[tiisg]) == -INFINITY) {
if (simd_max(sm[tiisg]) <= -MAXHALF) {
continue;
}
@ -6569,57 +6571,11 @@ void kernel_flash_attn_ext_vec_impl(
}
#undef NWG
#undef NSG
#undef NS10
#undef NS20
}
template<
typename q4_t, // query types in shared memory
typename k4_t, // key types in shared memory
typename v4_t, // value types in shared memory
typename qk_t, // Q*K types
typename s_t, // soft-max types
typename s4_t,
typename o4_t, // attention accumulation types
typename kd4_t, // key type in device memory
short nl_k,
void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &),
typename vd4_t, // value type in device memory
short nl_v,
void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &),
short DK, // K head size
short DV, // V head size
short NE = 4, // head elements per thread
short Q = OP_FLASH_ATTN_EXT_VEC_NQPTG, // queries per threadgroup
short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup
kernel void kernel_flash_attn_ext_vec(
constant ggml_metal_kargs_flash_attn_ext_vec & args,
device const char * q,
device const char * k,
device const char * v,
device const char * mask,
device const char * sinks,
device const char * pad,
device char * dst,
threadgroup half * shmem_f16 [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
ushort tiisg[[thread_index_in_simdgroup]],
ushort sgitg[[simdgroup_index_in_threadgroup]]) {
#define FWD_TMPL q4_t, k4_t, v4_t, qk_t, s_t, s4_t, o4_t, kd4_t, nl_k, deq_k_t4, vd4_t, nl_v, deq_v_t4, DK, DV, NE, Q, C
#define FWD_ARGS args, q, k, v, mask, sinks, pad, dst, shmem_f16, tgpig, tiisg, sgitg
switch (FC_flash_attn_ext_vec_nsg) {
// note: disabled cases to reduce library load time
case 1: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 1>(FWD_ARGS); break;
case 2: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 2>(FWD_ARGS); break;
case 4: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 4>(FWD_ARGS); break;
//case 8: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 8>(FWD_ARGS); break;
//case 16: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 16>(FWD_ARGS); break;
//case 32: kernel_flash_attn_ext_vec_impl<FWD_TMPL, 32>(FWD_ARGS); break;
}
#undef FWD_TMPL
#undef FWD_ARGS
}
// note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem
// in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max
//

View File

@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS
mul_mm_f32_f32_l4_lm
mul_mm_f16_f32_l4_lm
mul_mm_q8_0_f32_l4_lm
mul_mm_q8_0_f32_8x4
gemv_noshuffle_general_q8_0_f32
mul
norm
relu

File diff suppressed because it is too large Load Diff

View File

@ -1,109 +1,51 @@
kernel void kernel_concat_f32_contiguous(
global const char * p_src0, ulong off_src0,
global const char * p_src1, ulong off_src1,
global char * p_dst, ulong off_dst,
int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice
int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes)
int d_ne0, int d_ne1, int d_ne2, // dst->ne[0..2] for the slice
int dim
kernel void kernel_concat_f32(
global const char * src0,
ulong offset0,
global const char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int dim
) {
global const float * src0 = (global const float*)((global char*)p_src0 + off_src0);
global const float * src1 = (global const float*)((global char*)p_src1 + off_src1);
global float * dst = (global float*)((global char*)p_dst + off_dst);
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i0 = get_global_id(0); // Index along dst's 0th dimension
int i1 = get_global_id(1); // Index along dst's 1st dimension
int i2 = get_global_id(2); // Index along dst's 2nd dimension
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) {
return;
}
int o[4] = {0, 0, 0, 0};
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0;
ulong src_idx;
global const float * x;
if (dim == 0) {
if (i0 < d_ne00) { // Data from src0
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
dst[dst_idx] = src0[src_idx];
} else { // Data from src1
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00);
dst[dst_idx] = src1[src_idx];
}
} else if (dim == 1) {
if (i1 < d_ne01) { // Data from src0
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
dst[dst_idx] = src0[src_idx];
} else { // Data from src1
src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0;
dst[dst_idx] = src1[src_idx];
}
} else if (dim == 2) {
if (i2 < d_ne02) { // Data from src0
src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0;
dst[dst_idx] = src0[src_idx];
} else { // Data from src1
src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0;
dst[dst_idx] = src1[src_idx];
}
}
}
kernel void kernel_concat_f32_non_contiguous(
global const char * p_src0, ulong off_src0,
global const char * p_src1, ulong off_src1,
global char * p_dst, ulong off_dst,
long ne00, long ne01, long ne02, long ne03,
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1
long d_ne0, long d_ne1, long d_ne2, long d_ne3,
ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3,
int dim
) {
global const char * src0_base = p_src0 + off_src0;
global const char * src1_base = p_src1 + off_src1;
global char * dst_base = p_dst + off_dst;
long current_i1 = get_global_id(0); // Index for dst_dim_1
long current_i2 = get_global_id(1); // Index for dst_dim_2
long current_i3 = get_global_id(2); // Index for dst_dim_3
if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) {
return;
}
global const float * x_val_ptr;
global float * y_val_ptr;
for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) {
bool use_src0;
long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3;
if (dim == 0) {
use_src0 = (current_i0 < ne00);
if (!use_src0) { s_i0 = current_i0 - ne00; }
} else if (dim == 1) {
use_src0 = (current_i1 < ne01);
if (!use_src0) { s_i1 = current_i1 - ne01; }
} else if (dim == 2) {
use_src0 = (current_i2 < ne02);
if (!use_src0) { s_i2 = current_i2 - ne02; }
} else { // dim == 3
use_src0 = (current_i3 < ne03);
if (!use_src0) { s_i3 = current_i3 - ne03; }
}
if (use_src0) {
x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00);
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (global const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
} else {
x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10);
x = (global const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
}
y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0);
*y_val_ptr = *x_val_ptr;
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = *x;
}
}

View File

@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0(
}
}
kernel void kernel_restore_block_q8_0_trans(
global uchar * src_q,
global half * src_d,
global block_q8_0 * dst,
uint ne00,
uint ne01
){
uint num_blk_per_row = ne00 / QK8_0;
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
global half * d = (global half *) src_d + get_global_id(0);
for (uint blk = 0; blk < num_blk_per_row; blk++) {
b->d = *d;
for (uint i = 0; i < QK8_0; i+=4) {
b->qs[i] = q[0];
b->qs[i+1] = q[1];
b->qs[i+2] = q[2];
b->qs[i+3] = q[3];
q += 4 * ne01; // M stride
}
d += ne01;
b++;
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_q6_K
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).

View File

@ -0,0 +1,195 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
#define QK8_0 32
#define N_SIMDGROUP 4
#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \
float shared_y; \
char elem; \
\
shared_y = sub_group_broadcast(y.s0, 0); \
elem = (char)(bits8.s0 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
elem = (char)((bits8.s0 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 0); \
elem = (char)(bits8.s1 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
elem = (char)((bits8.s1 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 1); \
elem = (char)(bits8.s2 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
elem = (char)((bits8.s2 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 1); \
elem = (char)(bits8.s3 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
elem = (char)((bits8.s3 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 2); \
elem = (char)(bits8.s4 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
elem = (char)((bits8.s4 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 2); \
elem = (char)(bits8.s5 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
elem = (char)((bits8.s5 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 3); \
elem = (char)(bits8.s6 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
elem = (char)((bits8.s6 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 3); \
elem = (char)(bits8.s7 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
elem = (char)((bits8.s7 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C
int ne00, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M;
uint BLOCK_STRIDE_A = 8 * M; // 32 / 4 = 8
__private uint8 regA;
__private half regS;
__private float8 regB;
__private float totalSum = (float)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
#pragma unroll 1 /* tell compiler not to unroll */
for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of one rows
// first 4 fibers in each wave load 8 B values to its private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load weights for one block in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
}
// reduction in local memory, assumes #wave=4
__local float reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 1 outputs per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
dst[gid] = totalSum;
}
}

View File

@ -0,0 +1,129 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif
kernel void kernel_mul_mm_q8_0_f32_8x4(
global const uint * src0_q,
global const half * src0_d,
__read_only image1d_buffer_t src1,
global float * dst,
int k,
int m,
int n,
int n_no_padding,
ulong offsetd
) {
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
dst = (global float *)((global char*)dst + offsetd);
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
half8 B;
half4 deq;
__global const uint* wptr = src0_q + gx_2;
__global const half* sptr = src0_d + gx_2;
for (int i = 0; i < k; i += 4) {
uint4 pack4 = vload4(0, wptr + (i / 4) * m);
half4 scale = vload4(0, sptr + (i / 32) * m);
char4 p0 = as_char4(pack4.s0);
char4 p1 = as_char4(pack4.s1);
char4 p2 = as_char4(pack4.s2);
char4 p3 = as_char4(pack4.s3);
// ------------------- j = 0 (k = i+0) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1);
half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale;
c0 += B * wj0.s0;
c1 += B * wj0.s1;
c2 += B * wj0.s2;
c3 += B * wj0.s3;
// ------------------- j = 1 (k = i+1) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1);
half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale;
c0 += B * wj1.s0;
c1 += B * wj1.s1;
c2 += B * wj1.s2;
c3 += B * wj1.s3;
// ------------------- j = 2 (k = i+2) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1);
half4 wj2 = convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale;
c0 += B * wj2.s0;
c1 += B * wj2.s1;
c2 += B * wj2.s2;
c3 += B * wj2.s3;
// ------------------- j = 3 (k = i+3) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1);
half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale;
c0 += B * wj3.s0;
c1 += B * wj3.s1;
c2 += B * wj3.s2;
c3 += B * wj3.s3;
}
int idx = (gy << 3) * m + (gx << 2);
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}

View File

@ -1,39 +1,38 @@
kernel void kernel_repeat(
global const char * src0_data_in,
global char * dst_data_in,
ulong src0_offset,
ulong dst_offset,
int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
kernel void kernel_repeat_f32(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
global const char * src0_data = src0_data_in + src0_offset;
global char * dst_data = dst_data_in + dst_offset;
src0 = src0 + offset0;
dst = dst + offsetd;
const int d3 = get_global_id(2);
const int d2 = get_global_id(1);
const int d1 = get_global_id(0);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
return;
}
const int i03 = i3%ne03;
const int i02 = i2%ne02;
const int i01 = i1%ne01;
const int s3 = d3 % src0_ne3;
const int s2 = d2 % src0_ne2;
const int s1 = d1 % src0_ne1;
global const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1;
const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
global char * p_dst_slice = dst_data + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1;
for (int d0 = 0; d0 < dst_ne0; ++d0) {
// Determine source index for dimension 0 based on tiling/broadcasting.
const int s0 = d0 % src0_ne0;
const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0;
global char * restrict current_dst_el_ptr = p_dst_slice + (ulong)d0*dst_nb0;
for (int k = 0; k < src0_nb0; ++k) {
current_dst_el_ptr[k] = current_src_el_ptr[k];
}
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i00 = i0%ne00;
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i00*nb00));
}
}

View File

@ -1,9 +1,19 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// scale
//------------------------------------------------------------------------------
kernel void kernel_scale(
kernel void kernel_scale_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
float scale,
float bias
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = src0[get_global_id(0)] * scale + bias;
}
kernel void kernel_scale_f32_4(
global float4 * src0,
ulong offset0,
global float4 * dst,

View File

@ -1,63 +1,109 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
kernel void kernel_tanh_f32_nd(
global void * p_src0_base, ulong off_src0_abs,
global void * p_dst_base, ulong off_dst_abs,
int ne00, int ne01, int ne02, int ne03,
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
int ne10, int ne11, int ne12, int ne13,
ulong nb10, ulong nb11, ulong nb12, ulong nb13
kernel void kernel_tanh_f32(
global const float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
}
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
kernel void kernel_tanh_f32_4(
global const float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
*dst_val_ptr = tanh(*src_val_ptr);
}
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
}
kernel void kernel_tanh_f16(
global const half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
}
kernel void kernel_tanh_f16_4(
global const half4 * src0,
ulong offset0,
global half4 * dst,
ulong offsetd
) {
src0 = (global half4*)((global char*)src0 + offset0);
dst = (global half4*)((global char*)dst + offsetd);
dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
}
kernel void kernel_tanh_f32_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
dst = dst + offsetd;
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * y = (global float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*y = tanh(*x);
}
}
kernel void kernel_tanh_f16_nd(
global void * p_src0_base, ulong off_src0_abs,
global void * p_dst_base, ulong off_dst_abs,
int ne00, int ne01, int ne02, int ne03,
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
int ne10, int ne11, int ne12, int ne13,
ulong nb10, ulong nb11, ulong nb12, ulong nb13
kernel void kernel_tanh_f16_nc(
global const char * src0,
ulong offset0,
global char * dst,
ulong offsetd,
int ne00,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
int i0 = get_global_id(0);
int i1 = get_global_id(1);
int i2 = get_global_id(2);
src0 = src0 + offset0;
dst = dst + offsetd;
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
for (int i3 = 0; i3 < ne13; ++i3) {
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
const int i3 = get_group_id(2);
const int i2 = get_group_id(1);
const int i1 = get_group_id(0);
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * y = (global half *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
*dst_val_ptr = tanh(*src_val_ptr);
}
*y = tanh(*x);
}
}

View File

@ -1,7 +1,7 @@
message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL)$")
message(FATAL_ERROR "GGML_SYCL_TARGET: Invalid target, the supported options are [INTEL]")
endif()
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
@ -125,25 +125,22 @@ endif()
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD")
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
endif()
add_compile_definitions(GGML_SYCL_F16)
endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# INFO: Allowed Sub_group_sizes are not consistent through all
# hip targets. For example, 64 is used for certain models, but the backend
# does not support it.
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
# Link against Intel oneMKL
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(SYCL_COMPILER ON)
endif()
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
else()
# default for other target
message(FATAL_ERROR "GGML_SYCL_TARGET is not supported")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
endif()
@ -151,82 +148,6 @@ if (GGML_SYCL_GRAPH)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()
# Link against Intel oneMKL or oneMath
if (GGML_SYCL_TARGET STREQUAL "INTEL")
# Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
# See https://github.com/uxlfoundation/oneMath/issues/654
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(SYCL_COMPILER ON)
endif()
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
else()
find_package(oneMath QUIET)
if (NOT oneMath_FOUND)
message(STATUS "oneMath not found: oneMath will be automatically downloaded")
# Use FetchContent to automatically pull and build oneMath
include(FetchContent)
set(BUILD_FUNCTIONAL_TESTS False)
set(BUILD_EXAMPLES False)
set(TARGET_DOMAINS blas)
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(ENABLE_MKLCPU_BACKEND False)
set(ENABLE_MKLGPU_BACKEND False)
set(ENABLE_CUBLAS_BACKEND True)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
set(ENABLE_MKLCPU_BACKEND False)
set(ENABLE_MKLGPU_BACKEND False)
set(ENABLE_ROCBLAS_BACKEND True)
# Ensure setting a string variable here is not overriden by oneMath CACHE variables
cmake_policy(SET CMP0126 NEW)
# Setting the device architecture is only needed and useful for AMD devices in oneMath
set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
endif()
FetchContent_Declare(
ONEMATH
GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142
)
FetchContent_MakeAvailable(ONEMATH)
# Create alias to match with find_package targets name
function(onemath_alias target)
if (TARGET ${target}_obj)
# Silence verbose warnings from external libraries
target_compile_options(${target}_obj PRIVATE -w)
endif()
if (TARGET ${target})
add_library(ONEMATH::${target} ALIAS ${target})
endif()
endfunction()
onemath_alias(onemath)
onemath_alias(onemath_blas_mklcpu)
onemath_alias(onemath_blas_mklgpu)
onemath_alias(onemath_blas_cublas)
onemath_alias(onemath_blas_rocblas)
endif()
# Below oneMath compile-time dispatching is used for better performance
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
if (NOT GGML_SYCL_DEVICE_ARCH)
message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
endif()
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
else()
# Fallback to oneMath runtime dispatcher
target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
endif()
endif()
if (GGML_SYCL_DEVICE_ARCH)
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})

View File

@ -15,17 +15,9 @@
#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <map>
#ifdef GGML_SYCL_USE_INTEL_ONEMKL
#include <oneapi/mkl.hpp>
// Allow to use the same namespace for Intel oneMKL and oneMath
namespace oneapi {
namespace math = mkl;
}
#else
#include <oneapi/math.hpp>
#endif
#include <map>
#include "ggml.h"
@ -91,32 +83,13 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
}
template <typename Ts> struct matrix_info_t {
oneapi::math::transpose transpose_info[2];
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
inline auto get_onemath_backend(sycl::queue& queue)
#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
-> sycl::queue&
#endif
{
// If the backend is known at compile-time, use oneMath backend_selector to use
// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
// fallback to runtime dispatching.
#if defined(GGML_SYCL_NVIDIA)
return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
#elif defined(GGML_SYCL_AMD)
return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
return queue;
#else
static_assert(false, "Unsupported backend");
#endif
}
namespace dpct
{
typedef sycl::queue *queue_ptr;
@ -1734,7 +1707,7 @@ namespace dpct
namespace detail
{
template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
inline void gemm_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb,
const void * beta, void * c, int ldc) {
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
@ -1742,7 +1715,7 @@ namespace dpct
auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c);
oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a,
oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a,
lda, data_b, ldb, beta_value, data_c, ldc);
}
@ -1774,7 +1747,7 @@ namespace dpct
};
template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) {
@ -1793,8 +1766,8 @@ namespace dpct
matrix_info->ld_info[2] = ldc;
matrix_info->groupsize_info = batch_size;
sycl::event e = oneapi::math::blas::column_major::gemm_batch(
get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1,
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2,
reinterpret_cast<Ts *>(matrix_info->value_info), reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
@ -1803,7 +1776,7 @@ namespace dpct
}
template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void * a, int lda,
long long int stride_a, const void * b, int ldb, long long int stride_b,
const void * beta, void * c, int ldc, long long int stride_c, int batch_size) {
@ -1812,7 +1785,7 @@ namespace dpct
auto data_a = get_memory<const Ta>(a);
auto data_b = get_memory<const Tb>(b);
auto data_c = get_memory<Tc>(c);
oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value,
oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value,
data_a, lda, stride_a, data_b, ldb, stride_b, beta_value,
data_c, ldc, stride_c, batch_size);
}
@ -2299,7 +2272,7 @@ namespace dpct
sycl::range<3>(x, y, 1), direction);
}
inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n,
inline void gemm(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, int n,
int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b,
library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc,
library_data_t scaling_type) {
@ -2366,7 +2339,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
break;
}
@ -2405,7 +2378,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
break;
}
@ -2447,7 +2420,7 @@ namespace dpct
/// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
@ -2485,7 +2458,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
@ -2493,7 +2466,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
break;
}
@ -2569,7 +2542,7 @@ namespace dpct
/// \param [in] stride_c Stride between the different C matrices.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m,
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda,
long long int stride_a, const void * b, library_data_t b_type, int ldb,
long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc,
@ -2642,7 +2615,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, oneapi::math::bfloat16, float>(
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
batch_size);
break;
@ -2651,7 +2624,7 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::math::bfloat16, oneapi::math::bfloat16, float, float>(
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c,
batch_size);
break;

View File

@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) {
return sycl::log(x);
}
template<typename T>
static __dpct_inline__ T op_softplus(T x) {
const float xf = (float) x;
const float ax = sycl::fabs(xf);
const float m = sycl::fmax(xf, 0.0f);
const float y = m + sycl::log1p(sycl::exp(-ax));
return (T) y;
}
template<typename T>
static __dpct_inline__ T op_neg(T x) {
return -x;
@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
});
}
static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_softplus(x);
});
}
static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_neg(x);
@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_log(ctx, dst);
}
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_softplus(ctx, dst);
}
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_neg(ctx, dst);

View File

@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

View File

@ -1840,6 +1840,110 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
}
}
static void top_k_f32_sycl(
const float * src,
int32_t * dst_indices,
const int64_t ncols,
const int64_t nrows,
const int k,
dpct::queue_ptr main_stream
) {
const int block_size = 128;
const sycl::range<1> block_dims(block_size);
const sycl::range<1> grid_dims(nrows);
main_stream->submit([&](sycl::handler &cgh) {
sycl::local_accessor<float, 1> shared_vals(sycl::range<1>(block_size * k), cgh);
sycl::local_accessor<int, 1> shared_idx(sycl::range<1>(block_size * k), cgh);
cgh.parallel_for(
sycl::nd_range<1>(grid_dims * block_dims, block_dims),
[=](sycl::nd_item<1> item_ct1) {
const int row = item_ct1.get_group(0);
const int tid = item_ct1.get_local_id(0);
if (row >= nrows) return;
const float * src_row = src + row * ncols;
int32_t * dst_idx_row = dst_indices + row * k;
float local_vals[32];
int local_idx[32];
for (int i = 0; i < k; i++) {
local_vals[i] = -FLT_MAX;
local_idx[i] = -1;
}
for (int col = tid; col < ncols; col += block_size) {
float val = src_row[col];
if (val > local_vals[k-1]) {
int pos = k - 1;
while (pos > 0 && val > local_vals[pos - 1]) {
pos--;
}
for (int i = k - 1; i > pos; i--) {
local_vals[i] = local_vals[i - 1];
local_idx[i] = local_idx[i - 1];
}
local_vals[pos] = val;
local_idx[pos] = col;
}
}
for (int i = 0; i < k; i++) {
shared_vals[tid * k + i] = local_vals[i];
shared_idx[tid * k + i] = local_idx[i];
}
item_ct1.barrier(sycl::access::fence_space::local_space);
if (tid == 0) {
float final_vals[32];
int final_idx[32];
for (int i = 0; i < k; i++) {
final_vals[i] = -FLT_MAX;
final_idx[i] = -1;
}
for (int t = 0; t < block_size; t++) {
for (int i = 0; i < k; i++) {
float val = shared_vals[t * k + i];
int idx = shared_idx[t * k + i];
if (val > final_vals[k-1]) {
int pos = k - 1;
while (pos > 0 && val > final_vals[pos - 1]) {
pos--;
}
for (int j = k - 1; j > pos; j--) {
final_vals[j] = final_vals[j - 1];
final_idx[j] = final_idx[j - 1];
}
final_vals[pos] = val;
final_idx[pos] = idx;
}
}
}
for (int i = 0; i < k; i++) {
dst_idx_row[i] = final_idx[i];
}
if (k > 1) {
int32_t temp = dst_idx_row[0];
dst_idx_row[0] = dst_idx_row[1];
dst_idx_row[1] = temp;
}
}
});
});
}
static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols,
const int nrows, queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
@ -2063,8 +2167,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
*stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
dst_f16.get(), dpct::library_data_t::real_half, ldc,
@ -2107,8 +2211,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
{
const float alpha = 1.0f;
const float beta = 0.0f;
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff,
src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
}
@ -2231,6 +2335,30 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
main_stream, ctx.device);
}
static void ggml_sycl_op_top_k(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_is_contiguous(src0));
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
const float * src0_dd = static_cast<const float *>(src0->data);
int32_t * dst_dd = static_cast<int32_t *>(dst->data);
const int k = dst->ne[0];
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
GGML_ASSERT(k > 0 && k <= 32);
GGML_ASSERT(k <= ncols);
top_k_f32_sycl(src0_dd, dst_dd, ncols, nrows, k, main_stream);
}
inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_I32);
@ -2263,6 +2391,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
}
static void tri_f32_sycl(
const float * src,
float * dst,
const int64_t ne0,
const int64_t ne1,
const int64_t ne2,
const int64_t ne3,
const ggml_tri_type ttype,
dpct::queue_ptr main_stream
) {
const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
const int64_t idx = (int64_t) tid[0];
const int64_t i0 = idx % ne0;
const int64_t t1 = idx / ne0;
const int64_t i1 = t1 % ne1;
bool keep = false;
switch (ttype) {
case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break;
case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break;
case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
default: keep = false; break;
}
dst[idx] = keep ? src[idx] : 0.0f;
});
}
static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
const float * src0_dd = static_cast<const float *>(src0->data);
float * dst_dd = static_cast<float *>(dst->data);
const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
const int64_t ne0 = src0->ne[0];
const int64_t ne1 = src0->ne[1];
const int64_t ne2 = src0->ne[2];
const int64_t ne3 = src0->ne[3];
tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream);
}
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -2978,8 +3165,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
@ -3003,7 +3190,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
});
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
*queue, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
(void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
@ -3331,18 +3518,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// mmvq and mmq need the __dp4a instruction which is available for gen12+
// Workaround in https://github.com/ggerganov/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
// Workaround in https://github.com/ggml-org/llama.cpp/commit/95f84d5ce8b449a9b16009434aca800df504a02e
use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);
#ifdef SYCL_USE_XMX
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX
// mmvq path is faster in the CUDA backend.
if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
// is enabled takes precedence over DMMV, the current if-else implementation
// requires disabling DMMV if both conditions are met
|| (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
// Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
// is enabled takes precedence over DMMV, the current if-else implementation
// requires disabling DMMV if both conditions are met
if (!g_ggml_sycl_prioritize_dmmv && ((should_reorder_tensor(ctx, dst) &&
ggml_sycl_supports_reorder_mmvq(src0->type)))) {
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
}
@ -3786,6 +3972,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_EXP:
ggml_sycl_exp(ctx, dst);
break;
case GGML_UNARY_OP_SOFTPLUS:
ggml_sycl_softplus(ctx, dst);
break;
case GGML_UNARY_OP_SGN:
ggml_sycl_sgn(ctx, dst);
break;
@ -3912,6 +4101,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_TRANSPOSE:
GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
break;
case GGML_OP_TRI:
ggml_sycl_op_tri(ctx, dst);
break;
case GGML_OP_DIAG_MASK_INF:
ggml_sycl_diag_mask_inf(ctx, dst);
break;
@ -3942,6 +4134,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_sycl_argsort(ctx, dst);
break;
case GGML_OP_TOP_K:
ggml_sycl_op_top_k(ctx, dst);
break;
case GGML_OP_TIMESTEP_EMBEDDING:
ggml_sycl_op_timestep_embedding(ctx, dst);
break;
@ -3993,16 +4188,6 @@ void ggml_backend_sycl_get_device_memory(int device, size_t *free,
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
ggml_sycl_set_device(device);
/*
DPCT1009:218: SYCL uses exceptions to report errors and does not use the
error codes. The original code was commented out and a warning string was
inserted. You need to rewrite this code.
*/
/*
DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for
device information which may not be supported by all compilers or runtimes.
You may need to adjust the code.
*/
SYCL_CHECK(CHECK_TRY_ERROR(
dpct::dev_mgr::instance().get_device(device).get_memory_info(*free, *total)));
}
@ -4404,6 +4589,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_ELU:
return true;
case GGML_UNARY_OP_FLOOR:
@ -4616,6 +4802,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_TRI:
{
const ggml_tensor * src0 = op->src[0];
return src0 &&
op->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0);
}
case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX:
@ -4637,6 +4830,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ARGSORT:
return op->src[0]->ne[0] * sizeof(int) <=
ggml_sycl_info().devices[device].smpbo;
case GGML_OP_TOP_K: {
const ggml_tensor * src0 = op->src[0];
const int k = op->ne[0];
return src0 &&
op->type == GGML_TYPE_I32 &&
src0->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) &&
k > 0 && k <= 32;
}
case GGML_OP_POOL_2D:
case GGML_OP_ACC:
return true;

View File

@ -32,12 +32,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
// Handle transposition of src1
const bool src1_T = ggml_is_transposed(src1);
const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans;
const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans;
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
try {
// Perform matrix multiplication using oneMath GEMM
oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op,
// Perform matrix multiplication using oneMKL GEMM
oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op,
ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0);
}
catch (sycl::exception const& exc) {

View File

@ -207,7 +207,6 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons
const int p = sector;
theta_base = pos[channel_x] * sycl::pow(theta_scale, (float) p);
} else {
// Simplified from CUDA backend code: if (sector >= sections.v[0] && sector < sec_w) which is just sector >= sections.v[0]
const int p = sector - sections.v[0];
theta_base = pos[channel_x + ne2] * sycl::pow(theta_scale, (float) p);
}

Some files were not shown because too many files have changed in this diff Show More