Merge branch 'master' into conv2d-implicit

This commit is contained in:
Bin Zhao 2026-01-31 10:50:43 -05:00
commit 2bd682bb1f
66 changed files with 3011 additions and 1094 deletions

View File

@ -21,7 +21,8 @@ on:
'**/*.m',
'**/*.metal',
'**/*.comp',
'**/*.glsl'
'**/*.glsl',
'**/*.wgsl'
]
pull_request:
@ -42,7 +43,8 @@ on:
'**/*.m',
'**/*.metal',
'**/*.comp',
'**/*.glsl'
'**/*.glsl',
'**/*.wgsl'
]
concurrency:
@ -1371,7 +1373,7 @@ jobs:
id: update_presets
if: ${{ matrix.build == 'arm64-snapdragon' }}
run: |
cp docs/backend/hexagon/CMakeUserPresets.json .
cp docs/backend/snapdragon/CMakeUserPresets.json .
- name: Build
id: ndk_build

View File

@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
- [LARS](https://github.com/abgulati/LARS) (AGPL)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [LMStudio](https://lmstudio.ai/) (proprietary)

View File

@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
ngram-cache.h
ngram-map.cpp
ngram-map.h
ngram-mod.cpp
ngram-mod.h
peg-parser.cpp
peg-parser.h
preset.cpp

View File

@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
{"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
common_speculative_type_to_str(params.speculative.type).c_str()),
[](common_params & params, const std::string & value) {
@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
} else if (value == "ngram-map-k4v") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
} else if (value == "ngram-mod") {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
} else {
throw std::invalid_argument("unknown speculative decoding type without draft model");
}

View File

@ -171,6 +171,7 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,    // self-speculative decoding with a basic n-gram hash table
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache
COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type
};
@ -252,6 +253,8 @@ struct common_params_model {
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
struct common_ngram_mod;
struct common_params_speculative {
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
@ -269,6 +272,8 @@ struct common_params_speculative {
uint16_t ngram_check_rate = 1; // check rate for ngram lookup
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
std::shared_ptr<common_ngram_mod> ngram_mod;
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

View File

@ -12,6 +12,7 @@
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
namespace jinja {

View File

@ -7,6 +7,21 @@
#include <cstdio>
#include <sstream>
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}
// n-gram simple
//
@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
// maximum number of counted values of a ngram map value.
#define COMMON_NGRAM_MAX_VALUE_COUNT 16380
static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
void common_ngram_map_draft(common_ngram_map & map,
const llama_tokens & inp, llama_token sampled,
llama_tokens & draft) {
@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
n_accepted, curr_value.n_accepted);
curr_value.n_accepted = n_accepted;
}
// Helper functions.
//
// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
std::ostringstream oss;
oss << '[';
for (size_t i = 0; i < length; ++i) {
if (i > 0) {
oss << ", ";
}
oss << inp[start + i];
}
oss << ']';
return oss.str();
}

View File

@ -11,6 +11,7 @@
//
#include "llama.h"
#include "common.h"
#include <vector>

60
common/ngram-mod.cpp Normal file
View File

@ -0,0 +1,60 @@
#include "ngram-mod.h"
//
// common_ngram_mod
//
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
entries.resize(size);
reset();
}
size_t common_ngram_mod::idx(const entry_t * tokens) const {
size_t res = 0;
for (size_t i = 0; i < n; ++i) {
res = res*6364136223846793005ULL + tokens[i];
}
res = res % entries.size();
return res;
}
void common_ngram_mod::add(const entry_t * tokens) {
const size_t i = idx(tokens);
if (entries[i] == EMPTY) {
used++;
}
entries[i] = tokens[n];
}
common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
const size_t i = idx(tokens);
return entries[i];
}
void common_ngram_mod::reset() {
std::fill(entries.begin(), entries.end(), EMPTY);
used = 0;
}
size_t common_ngram_mod::get_n() const {
return n;
}
size_t common_ngram_mod::get_used() const {
return used;
}
size_t common_ngram_mod::size() const {
return entries.size();
}
size_t common_ngram_mod::size_bytes() const {
return entries.size() * sizeof(entries[0]);
}

38
common/ngram-mod.h Normal file
View File

@ -0,0 +1,38 @@
#pragma once
#include <cstdint>
#include <vector>
#include <cstddef>
//
// common_ngram_mod
// ref: https://github.com/ggml-org/llama.cpp/pull/19164
//
// basic n-gram hasher
struct common_ngram_mod {
using entry_t = int32_t;
static constexpr entry_t EMPTY = -1;
common_ngram_mod(uint16_t n, size_t size);
size_t idx(const entry_t * tokens) const;
void add(const entry_t * tokens);
entry_t get(const entry_t * tokens) const; // return -1 if not found
void reset();
size_t get_n() const;
size_t get_used() const;
size_t size() const;
size_t size_bytes() const;
private:
size_t n; // ngram size to hash
size_t used;
std::vector<entry_t> entries;
};
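
For reference, here is a minimal usage sketch (not part of this commit) showing how the hash table declared above can be filled from a token stream and then queried to extend a draft; the toy prompt, table size, and `main` harness are illustrative assumptions only:

```
// Hypothetical usage sketch of common_ngram_mod (not part of this commit).
// Tokens are int32_t, matching common_ngram_mod::entry_t.
#include "ngram-mod.h"

#include <cstdio>
#include <vector>

int main() {
    const uint16_t n = 3;            // n-gram size used as the hash key
    common_ngram_mod mod(n, 1024);   // small table for illustration

    // toy "prompt": every window of n tokens maps to the token that follows it
    std::vector<int32_t> prompt = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3};
    for (size_t i = 0; i + n < prompt.size(); ++i) {
        mod.add(prompt.data() + i);  // key = prompt[i..i+n-1], value = prompt[i+n]
    }

    // extend a draft by repeatedly hashing the most recent n tokens
    std::vector<int32_t> draft = {2, 3, 4};  // last n tokens seen so far
    for (int k = 0; k < 4; ++k) {
        const int32_t next = mod.get(draft.data() + draft.size() - n);
        if (next == common_ngram_mod::EMPTY) {
            break;                   // no stored continuation for this window
        }
        draft.push_back(next);
    }

    for (int32_t t : draft) {
        printf("%d ", t);            // expected (barring hash collisions): 2 3 4 1 2 3 4
    }
    printf("\n");
    return 0;
}
```

Collisions simply overwrite the previous value, so lookups are probabilistic rather than exact; that is the trade-off this fixed-size modular table makes in exchange for O(1) memory and updates.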

View File

@ -6,6 +6,7 @@
#include "log.h"
#include "ngram-cache.h"
#include "ngram-map.h"
#include "ngram-mod.h"
#include "sampling.h"
#include <algorithm>
@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
};
@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
{"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
{"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
{"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
{"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
{"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
};
@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
struct common_speculative_state {
const enum common_speculative_type type;
// TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
// TODO: add n_call_begin, n_call_accept
size_t drafts_call_count = 0; // number of times this implementation was called.
size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model.
@ -119,6 +124,8 @@ struct common_speculative_state {
// TODO: track performance of most recent calls
const bool gen_perf = true; // whether to generate performance stats.
// TODO: rename to t_draft_us
// TODO: add t_begin_us, t_accept_us
int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
common_speculative_state(enum common_speculative_type type) : type(type) {}
@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
}
};
struct common_speculative_state_ngram_mod : public common_speculative_state {
common_ngram_mod & mod;
// the last position in the prompt that was added to the ngram container
size_t i_last = 0;
// length of the last drafted ngram (number of tokens returned by draft)
size_t n_draft_last = 0;
// consecutive accept rounds with low acceptance fraction (< 0.5)
int n_low = 0;
// enable trace logging if LLAMA_TRACE is set
const bool verbose;
common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
: common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
}
void begin(const llama_tokens & prompt) override {
i_last = 0;
n_draft_last = 0;
const size_t n = mod.get_n();
if (prompt.size() < n) {
return;
}
for (size_t i = 0; i < prompt.size() - n; ++i) {
mod.add(prompt.data() + i);
}
i_last = prompt.size() - n;
const double f = (double)mod.get_used() / (double)mod.size();
LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
constexpr double f_thold = 0.25;
if (f > f_thold) {
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
mod.reset();
}
}
void draft(
const common_params_speculative & params,
const llama_tokens & prompt_tgt,
llama_token id_last,
llama_tokens & result) override {
GGML_UNUSED(params);
n_draft_last = 0;
const size_t cur_len = prompt_tgt.size();
if (cur_len < mod.get_n()) {
return;
}
const size_t n = mod.get_n();
// add new ngrams in chunks
if (i_last + 32 < cur_len) {
for (size_t i = i_last; i < cur_len - n; ++i) {
mod.add(prompt_tgt.data() + i);
}
i_last = cur_len - n;
}
result.resize(n + params.n_max);
for (size_t i = 0; i < n - 1; ++i) {
result[i] = prompt_tgt[cur_len - n + 1 + i];
}
result[n - 1] = id_last;
for (int i = 0; i < params.n_max; ++i) {
const llama_token token = mod.get(result.data() + i);
if (token == common_ngram_mod::EMPTY) {
if (i < params.n_min) {
result.clear();
return;
}
result.resize(n + i);
break;
}
result[n + i] = token;
}
// only return the m tokens that were drafted
for (size_t i = 0; n + i < result.size(); ++i) {
result[i] = result[n + i];
}
result.resize(result.size() - n);
// store length of drafted ngram for later acceptance analysis
n_draft_last = result.size();
}
void accept(uint16_t n_accepted) override {
if (verbose) {
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
}
// compute acceptance fraction if we have a recorded draft length
if (n_draft_last > 0) {
const double f_acc = (double)n_accepted / (double)n_draft_last;
if (f_acc < 0.5) {
n_low++;
if (n_low >= 3) {
LOG_WRN("%s: low acceptance streak (%d) resetting ngram_mod\n", __func__, n_low);
mod.reset();
n_low = 0;
}
} else {
n_low = 0;
}
}
}
};
struct common_speculative_state_ngram_cache : public common_speculative_state {
uint16_t n_draft;
bool save_dynamic;
@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod";
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache";
default: return "unknown";
}
@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
// initialization of the speculative decoding system
//
common_speculative * common_speculative_init(
const common_params_speculative & params,
llama_context * ctx_tgt) {
common_params_speculative & params,
llama_context * ctx_tgt) {
llama_context * ctx_dft = nullptr;
if (params.model_dft) {
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
// In a more complex implementation we could use the same implementation but with different parameters.
// This was initially used in PR-18471 but removed to simplify the code.
@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
// This implementation can guess tokens with high acceptance rate but is more expensive.
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
}
if (has_ngram_mod) {
// shared instance for all speculative decoding contexts
if (!params.ngram_mod) {
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
params.ngram_size_n, params.ngram_mod->size(),
(float)(params.ngram_mod->size_bytes())/1024/1024);
if (params.ngram_size_n < 16) {
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
}
}
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
}
if (has_ngram_cache) {
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
}
@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
GGML_ASSERT(config.params.ngram_mod);
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
auto state = create_state_ngram_cache(
params.lookup_cache_static, params.lookup_cache_dynamic, config);
@ -822,8 +978,7 @@ llama_tokens common_speculative_draft(
if (!result.empty()) {
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
common_speculative_type_to_str(impl.get()->type).c_str(),
prompt_tgt.size(),
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
impl.get()->drafts_call_count, result.size());
spec->curr_impl = impl.get(); // set current implementation for stats
@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf = "";
}
// TODO: report time for begin() and accept()
LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
common_speculative_type_to_str(impl->type).c_str(),
impl->drafts_call_count,

View File

@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
std::string common_speculative_type_to_str(enum common_speculative_type type);
common_speculative * common_speculative_init(
const common_params_speculative & params,
llama_context * ctx_tgt);
common_params_speculative & params,
llama_context * ctx_tgt);
void common_speculative_free(common_speculative * spec);

View File

@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel):
gate, up = data_torch.split(ffn_dim, dim=-2)
yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
return
has_experts = bool(self.hparams.get('num_local_experts'))

View File

@ -35,9 +35,9 @@ The following releases are verified and recommended:
|Commit ID|Tag|Release|Verified Platform| Update date|
|-|-|-|-|-|
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |Arc B580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
## News
@ -51,7 +51,7 @@ The following releases are verified and recommended:
|-|-|-|-|
|PVC 1550|39|73|+87%|
|Flex 170|39|50|+28%|
|Arc770|42|55|+30%|
|Arc A770|42|55|+30%|
|MTL|13|16|+23%|
|ARL-H|14|17|+21%|
@ -62,7 +62,7 @@ The following releases are verified and recommended:
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc A770.
- Arch Linux is verified successfully.
- 2024.4
@ -111,7 +111,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 |
| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 |
| Intel Arc B-Series | Support | Arc B580 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
| Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 |

View File

@ -1,5 +1,10 @@
{
"version": 4,
"version": 5,
"cmakeMinimumRequired": {
"major": 3,
"minor": 28,
"patch": 0
},
"configurePresets": [
{
"name": "arm64-android-snapdragon",
@ -16,7 +21,9 @@
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
"PREBUILT_LIB_DIR": "android_aarch64",
"GGML_OPENMP": "OFF",
"GGML_LLAMAFILE": "OFF",
@ -31,7 +38,15 @@
"name": "arm64-windows-snapdragon",
"inherits": [ "base", "arm64-windows-llvm" ],
"cacheVariables": {
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
"CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
"CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
"CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
"CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}",
"HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
"HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
"PREBUILT_LIB_DIR": "windows_aarch64",
"GGML_OPENMP": "OFF",
"GGML_LLAMAFILE": "OFF",

View File

@ -1,6 +1,8 @@
# Snapdragon-based Android devices
# Snapdragon-based devices
## How to Build
## Setup
### Android
The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
@ -12,7 +14,24 @@ This method works on Linux, macOS, and Windows. macOS and Windows users should i
[d]/> cd /workspace
```
The rest of the Android build process assumes that you're running inside the toolchain container.
Note: The rest of the **Android** build process assumes that you're running inside the toolchain container.
### Windows On Snapdragon
Native Windows 11 arm64 builds have the following tool dependencies:
- MS Visual Studio 2026 (Community Edition or Pro)
- MSVC arm64 standard and runtime libraries
- UCRT and Driver Kit
- LLVM core libraries and Clang compiler (winget)
- CMake, Git, Python (winget)
- Hexagon SDK Community Edition 6.4 or later (see windows.md)
- OpenCL SDK 2.3 or later (see windows.md)
Note: The rest of the **Windows** build process assumes that you're running natively in PowerShell.
Adapt the build commands below accordingly.
## How to Build
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
```
@ -49,24 +68,26 @@ Preset CMake variables:
To generate an installable "package" simply use cmake --install:
```
[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
-- Install configuration: "Release"
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so
-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so
...
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench
-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli
...
```
## How to Install
### Android
For this step, your device needs to be configured for on-device development.
Please see https://developer.android.com/studio/debug/dev-options for details.
@ -74,10 +95,10 @@ Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
```
~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/
pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
```
@ -92,6 +113,11 @@ At this point, you should also install some models:
Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
```
### Windows
All artifacts are already installed in the `pkg-snapdragon` folder.
To run, adapt the instructions below to use the PowerShell scripts in `scripts/snapdragon/windows`.
## How to Run
The easiest way to run the llama.cpp CLI tools is to use the provided wrapper scripts that properly set up all required environment variables.

View File

@ -0,0 +1,161 @@
## Overview
This document covers procedures for installing the latest GPU and NPU drivers, and the OpenCL and Hexagon SDKs.
In order to use the Hexagon NPU on Snapdragon Windows devices, the underlying HTP Ops libraries (e.g. libggml-htp-v73.so)
must be included in a .cat file digitally signed with a trusted certificate.
It also explains how to generate personal certificate files (.pfx) and how to configure the system
to allow test signatures (aka test-signing).
## Install the latest Adreno OpenCL SDK
Either use the trimmed down version (optimized for CI) from
https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz
Or download the complete official version from
https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2
Unzip/untar the archive into
```
c:\Qualcomm\OpenCL_SDK\2.3.2
```
## Install the latest Hexagon SDK Community Edition
Either use the trimmed down version (optimized for CI) from
https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
Or download the complete official version from
https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
Unzip/untar the archive into
```
c:\Qualcomm\Hexagon_SDK\6.4.0.2
```
## Install the latest Adreno GPU driver
Download the driver from
https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver
After the automated installation and reboot, please make sure that the GPU device shows up in the `Device Manager` (under `Display Adapters`).
## Install the latest Qualcomm NPU driver
Download the driver from
https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND
After the automated installation and reboot, please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`).
If the device is not available, you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually.
The components are extracted into
```
c:\QCDrivers\qcnspmcdm...
```
## Enable NPU driver test signatures
Please note that the following steps are required only for the Hexagon NPU.
The Adreno GPU backend does not require test signatures.
### Enable testsigning
Use `bcdedit` to enable test-signing
```
> bcdedit /set TESTSIGNING ON
```
(Secure Boot may need to be disabled for this to work)
Make sure test-signing is enabled after reboot
```
> bcdedit /enum
...
testsigning Yes
...
```
For additional details, see the Microsoft guide at
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option
### Create personal certificate
The tools required for this procedure are available as part of the Windows SDK and Windows Driver Kit, which should be
installed as part of MS Visual Studio.
They are typically located at
```
c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0
```
(replace 10.0.26100.0 with the correct version).
To create a personal self-signed certificate, run the following commands (either from cmd or PowerShell):
```
> cd c:\Users\MyUser
> mkdir Certs
> cd Certs
> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer
> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx
```
(replace `MyUser` with your username).
Add this certificate to the `Trusted Root Certification Authorities` and `Trusted Publishers` stores.
This can be done using the `certlm` Certificate Manager tool.
Right-click on the certificate store, select `All Tasks -> Import`, and follow the prompts to import the certificate from the
PFX file you created above.
For additional details, see the Microsoft guide at
https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing
Make sure to save the PFX file; you will need it for the build procedures.
Please note that the same certificate can be used for signing any number of builds.
## Build Hexagon backend with signed HTP ops libraries
The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms.
However, additional settings are required for generating and signing HTP Ops libraries.
```
> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
> cmake --preset arm64-windows-snapdragon -B build-wos
...
> cmake --install build-wos --prefix pkg-snapdragon
```
Once the build is complete, the HTP Ops libraries will be installed like this
```
> dir pkg-snapdragon/lib
...
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so
-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so
-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so
-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat
```
The .cat file, the signature, and proper certificate installation can be verified with
```
> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
Verifying: .\pkg-snapdragon\lib\libggml-htp.cat
Signature Index: 0 (Primary Signature)
Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF
Signing Certificate Chain:
Issued to: GGML.HTP.v1
...
Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat
...
```

View File

@ -97,7 +97,7 @@ Legend:
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | | 🟡 | ✅ | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
@ -114,7 +114,7 @@ Legend:
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |

View File

@ -29,8 +29,8 @@
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@ -71,8 +71,8 @@
"SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@ -113,8 +113,8 @@
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@ -155,8 +155,8 @@
"SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@ -10052,10 +10052,10 @@
"SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL"
"SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL"
"SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL"
"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL"
"SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL"
"SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL"
"SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL"


View File

@ -50,6 +50,12 @@ int main(int argc, char ** argv) {
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
// lookahead requires W + G + 1 sequences for parallel Jacobi decoding
params.n_parallel = W + G + 1;
// unified KV cache is required for coupled sequences in batch splitting
params.kv_unified = true;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
@ -115,7 +121,7 @@ int main(int argc, char ** argv) {
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1);
// target model sampling context
struct common_sampler * smpl = common_sampler_init(model, params.sampling);

View File

@ -106,7 +106,7 @@ int main(int argc, char ** argv){
std::vector<llama_token> draft;
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1);
const auto t_dec_start = ggml_time_us();

View File

@ -222,6 +222,7 @@ if (GGML_SCHED_NO_REALLOC)
endif()
add_library(ggml
ggml-backend-dl.cpp
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)

View File

@ -0,0 +1,48 @@
#include "ggml-backend-dl.h"
#ifdef _WIN32
dl_handle * dl_load_library(const fs::path & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.wstring().c_str());
SetErrorMode(old_mode);
return handle;
}
void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, name);
SetErrorMode(old_mode);
return p;
}
const char * dl_error() {
return "";
}
#else
dl_handle * dl_load_library(const fs::path & path) {
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif

View File

@ -0,0 +1,45 @@
#pragma once
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <winevt.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
#include <filesystem>
namespace fs = std::filesystem;
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
#else
using dl_handle = void;
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
dl_handle * dl_load_library(const fs::path & path);
void * dl_get_sym(dl_handle * handle, const char * name);
const char * dl_error();
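
For context, a minimal caller sketch (not part of this commit); it assumes a hypothetical library name and entry-point symbol, and only illustrates how `dl_load_library`, `dl_get_sym`, `dl_error`, and the RAII `dl_handle_ptr` fit together:

```
// Hypothetical caller of the ggml-backend-dl API shown above (not part of this commit).
#include "ggml-backend-dl.h"

#include <cstdio>
#include <memory>

// assumed signature of the entry point we resolve; the real registration
// entry points live in ggml-backend-reg.cpp
typedef void * (*backend_init_fn)(void);

int main() {
    // dl_handle_ptr frees the library via dl_handle_deleter when it goes out of scope
    dl_handle_ptr handle { dl_load_library(fs::path("libggml-example.so")) };  // hypothetical library name
    if (!handle) {
        fprintf(stderr, "failed to load library: %s\n", dl_error());
        return 1;
    }

    // resolve a symbol by name; returns nullptr if it is not exported
    auto init = (backend_init_fn) dl_get_sym(handle.get(), "ggml_backend_example_init");  // hypothetical symbol
    if (!init) {
        fprintf(stderr, "symbol not found: %s\n", dl_error());
        return 1;
    }

    void * backend = init();
    (void) backend;
    return 0;
}
```

The actual lookup of registration entry points stays in ggml-backend-reg.cpp; this split merely moves the platform-specific dlopen/LoadLibrary plumbing into its own translation unit.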

View File

@ -1,5 +1,6 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-backend-dl.h"
#include "ggml-impl.h"
#include <algorithm>
#include <cstring>
@ -98,72 +99,6 @@ static std::string path_str(const fs::path & path) {
}
}
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
static dl_handle * dl_load_library(const fs::path & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.wstring().c_str());
SetErrorMode(old_mode);
return handle;
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, name);
SetErrorMode(old_mode);
return p;
}
static const char * dl_error() {
return "";
}
#else
using dl_handle = void;
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
static void * dl_load_library(const fs::path & path) {
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
static const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
dl_handle_ptr handle;

View File

@ -1124,6 +1124,7 @@ struct ggml_tensor_extra_gpu {
struct ggml_cuda_graph_node_properties {
void * node_data;
ggml_op node_op;
enum ggml_type node_type;
int32_t flags;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];

View File

@ -2921,6 +2921,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties
memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
props->node_data = node->data;
props->node_op = node->op;
props->node_type = node->type;
props->flags = node->flags;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
props->ne[i] = node->ne[i];
@ -2945,6 +2946,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
return false;
}
if (node->type != props->node_type) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != props->ne[i]) {
return false;
@ -3906,14 +3911,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
// Launch graph
CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
#else
GGML_UNUSED(graph_key);
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
}
}
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
#ifdef USE_CUDA_GRAPH
static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
if (graph->graph == nullptr) {
@ -3926,12 +3931,8 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co
}
return graph->is_enabled();
#else
GGML_UNUSED(cuda_ctx);
GGML_UNUSED(graph_key);
return false;
#endif // USE_CUDA_GRAPH
}
#endif // USE_CUDA_GRAPH
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

View File

@ -1,7 +1,29 @@
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}")
message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT point to the correct Hexagon SDK installation.")
endif()
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
message("Try to read HEXAGON_TOOLS_ROOT from hexagon_sdk.json")
file(READ "${HEXAGON_SDK_ROOT}/hexagon_sdk.json" HEXAGON_SDK_CONFIG_PATH)
string(JSON HEXAGON_TOOLS_PATH GET ${HEXAGON_SDK_CONFIG_PATH} "root" "tools" "info" 0 "path")
message("Found HEXAGON_TOOLS_PATH: ${HEXAGON_TOOLS_PATH}")
set(HEXAGON_TOOLS_ROOT "${HEXAGON_SDK_ROOT}/${HEXAGON_TOOLS_PATH}")
file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
message(FATAL_ERROR "Make sure HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
endif()
endif()
message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)
option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
add_library(htp_iface OBJECT
@ -25,56 +47,71 @@ else()
target_link_options(htp_iface PUBLIC -ldl)
endif()
link_custom_library(htp_iface cdsprpc)
link_custom_library(htp_iface rpcmem)
set(TARGET_NAME ggml-hexagon)
ggml_add_backend_library(${TARGET_NAME}
ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)
ggml-hexagon.cpp
htp-drv.cpp
htp-drv.h
libdl.h
../../include/ggml-hexagon.h)
target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})
# Build HTP bits
set(HTP_CMAKE_ARGS
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
# Build HTP skels
set(HTP_SKELS)
function(build_htp_skel V)
ExternalProject_Add(htp-${V}
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so
CMAKE_ARGS
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
-DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
-DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
-DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
-DDSP_VERSION=${V}
-DPREBUILT_LIB_DIR="toolv19_${V}")
list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
endfunction()
ExternalProject_Add(htp-v68
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
ExternalProject_Add(htp-v69
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
ExternalProject_Add(htp-v73
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
ExternalProject_Add(htp-v75
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
ExternalProject_Add(htp-v79
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
ExternalProject_Add(htp-v81
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")
build_htp_skel(v68)
build_htp_skel(v69)
build_htp_skel(v73)
build_htp_skel(v75)
build_htp_skel(v79)
build_htp_skel(v81)
# Install Hexagon skels required at runtime
install(FILES
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
TYPE LIB)
install(FILES ${HTP_SKELS} TYPE LIB)
if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT)
file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64" WINSDK_BIN0_ARM64)
file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86" WINSDK_BIN0_X86)
file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64)
file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86" WINSDK_BIN1_X86)
set(WINSDK_PATHS ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86} ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86})
find_program(INF2CAT NAMES inf2cat.exe PATHS ${WINSDK_PATHS} REQUIRED)
find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED)
message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels")
set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat)
add_custom_target(libggml-htp-cat
BYPRODUCTS ${LIBGGML_HTP_CAT}
DEPENDS libggml-htp.inf ${HTP_SKELS}
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR}
COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64
COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT}
COMMENT "generating and signing libggml-htp.cat file"
VERBATIM
)
add_dependencies(${TARGET_NAME} libggml-htp-cat)
install(FILES ${LIBGGML_HTP_CAT} TYPE LIB)
endif()

View File

@ -14,9 +14,6 @@
#ifdef _WIN32
# include <sal.h>
# ifndef _WINDOWS
# define _WINDOWS
# endif
#else
# include <semaphore.h>
# include <unistd.h>
@ -25,8 +22,6 @@
#pragma clang diagnostic ignored "-Wnested-anon-types"
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#include "htp-utils.h"
#include <AEEStdErr.h>
#include <dspqueue.h>
#include <rpcmem.h>
@ -40,6 +35,7 @@
#include "op-desc.h"
#include "htp-msg.h"
#include "htp_iface.h"
#include "htp-drv.h"
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
@ -150,9 +146,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_
0, // flags - the framework will autoset this
n_bufs, // number of buffers
bufs, // buffer references
sizeof(req),
sizeof(req), // Message length
(const uint8_t *) &req, // Message
1000000 // Timeout
DSPQUEUE_TIMEOUT // Timeout
);
if (err != 0) {
@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() {
// Read response packet from queue
int err = dspqueue_read(q, &flags,
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp,
1000000); // Timeout
HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
&n_bufs, // Number of buffer references
bufs, // Buffer references
sizeof(rsp), // Max message length
&rsp_size, // Message length
(uint8_t *) &rsp, // Message
DSPQUEUE_TIMEOUT); // Timeout
if (err == AEE_EEXPIRED) {
// TODO: might need to bail out if the HTP is stuck on something
@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context {
ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
size += 4 * 1024; // extra page for padding
if (rpcmem_alloc2) {
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
} else {
GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
}
this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
if (!this->base) {
GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
@ -2461,12 +2451,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
}
static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
}
static inline bool is_compute_op(ggml_tensor *node)
{
return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
}
// scan the graph and figure out last compute op index
@ -2488,7 +2478,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
const int last = last_compute_op(graph);
const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer
const struct ggml_tensor * prev_op = nullptr; // prev executed op
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * node = graph->nodes[i];
@ -2497,17 +2487,15 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
continue;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}
uint32_t flags = 0;
// skip quantizer if src1 is reused
if (op_reuse_src1(node, prev_quant_op)) {
if (op_reuse_src1(node, prev_op)) {
flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
}
prev_op = node;
// ask for early notification for the last Op
if (i == last) {
flags |= HTP_OPFLAGS_EARLY_WAKEUP;
@ -2520,7 +2508,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
} else {
ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
}
prev_quant_op = node;
break;
case GGML_OP_MUL_MAT_ID:
if (ggml_is_quantized(node->src[0]->type)) {
@ -2528,7 +2515,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
} else {
ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
}
prev_quant_op = node;
break;
case GGML_OP_MUL:
case GGML_OP_ADD:
@ -2670,7 +2656,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
}
// that many nodes forward to search for stackable nodes that can reuse VTCM
constexpr int N_FORWARD = 8;
constexpr int N_FORWARD = 16;
for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
if (used[i1]) {
@ -3056,10 +3042,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
}
}
#if defined(__ANDROID__)
if (opt_arch < 75) {
opt_ndev = 1;
GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
}
#endif
GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
@ -3156,6 +3144,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
opt_arch = strtoul(str_arch, NULL, 0);
}
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
reg->context = new ggml_hexagon_registry(reg);
HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
@ -3180,6 +3170,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
auto nErr = htpdrv_init();
if (nErr != AEE_SUCCESS) {
return NULL;
}
ggml_hexagon_init(&reg);
}

View File

@ -0,0 +1,418 @@
// sample drv interface
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wsign-compare"
#include <filesystem>
#include <set>
#include <sstream>
#include <string>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <winevt.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
#include "ggml-impl.h"
#include "htp-drv.h"
#include "libdl.h"
#include <domain.h>
//
// Driver API types
//
typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size);
typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size);
typedef void (*rpcmem_free_pfn_t)(void * po);
typedef int (*rpcmem_to_fd_pfn_t)(void * po);
typedef AEEResult (*dspqueue_create_pfn_t)(int domain,
uint32_t flags,
uint32_t req_queue_size,
uint32_t resp_queue_size,
dspqueue_callback_t packet_callback,
dspqueue_callback_t error_callback,
void * callback_context,
dspqueue_t * queue);
typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue);
typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id);
typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags,
uint32_t num_buffers,
struct dspqueue_buffer *buffers,
uint32_t message_length,
const uint8_t *message,
uint32_t timeout_us);
typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags,
uint32_t max_buffers, uint32_t *num_buffers,
struct dspqueue_buffer *buffers,
uint32_t max_message_length,
uint32_t *message_length, uint8_t *message,
uint32_t timeout_us);
typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags);
typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length);
typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph);
typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra);
typedef int (*remote_handle64_close_pfn_t)(remote_handle h);
typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen);
typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen);
typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen);
//
// Driver API pfns
//
rpcmem_alloc_pfn_t rpcmem_alloc_pfn = nullptr;
rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr;
rpcmem_free_pfn_t rpcmem_free_pfn = nullptr;
rpcmem_to_fd_pfn_t rpcmem_to_fd_pfn = nullptr;
fastrpc_mmap_pfn_t fastrpc_mmap_pfn = nullptr;
fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr;
dspqueue_create_pfn_t dspqueue_create_pfn = nullptr;
dspqueue_close_pfn_t dspqueue_close_pfn = nullptr;
dspqueue_export_pfn_t dspqueue_export_pfn = nullptr;
dspqueue_write_pfn_t dspqueue_write_pfn = nullptr;
dspqueue_read_pfn_t dspqueue_read_pfn = nullptr;
remote_handle64_open_pfn_t remote_handle64_open_pfn = nullptr;
remote_handle64_invoke_pfn_t remote_handle64_invoke_pfn = nullptr;
remote_handle64_close_pfn_t remote_handle64_close_pfn = nullptr;
remote_handle_control_pfn_t remote_handle_control_pfn = nullptr;
remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr;
remote_session_control_pfn_t remote_session_control_pfn = nullptr;
//
// Driver API
//
void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
return rpcmem_alloc_pfn(heapid, flags, size);
}
void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
if (rpcmem_alloc2_pfn) {
return rpcmem_alloc2_pfn(heapid, flags, size);
} else {
GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n");
return rpcmem_alloc_pfn(heapid, flags, size);
}
}
void rpcmem_free(void * po) {
return rpcmem_free_pfn(po);
}
int rpcmem_to_fd(void * po) {
return rpcmem_to_fd_pfn(po);
}
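// Illustrative usage sketch (assumes RPCMEM_HEAP_ID_SYSTEM and RPCMEM_DEFAULT_FLAGS
// from the SDK's rpcmem.h): allocate a shared buffer, obtain its fd for FastRPC
// mapping, then release it.
static inline int example_rpcmem_roundtrip(size_t nbytes) {
    void * buf = rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, nbytes);
    if (!buf) {
        return -1;
    }
    int fd  = rpcmem_to_fd(buf);  // this fd could be handed to fastrpc_mmap()
    int ret = (fd >= 0) ? 0 : -1;
    rpcmem_free(buf);
    return ret;
}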
HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags);
}
HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
return fastrpc_munmap_pfn(domain, fd, addr, length);
}
AEEResult dspqueue_create(int domain,
uint32_t flags,
uint32_t req_queue_size,
uint32_t resp_queue_size,
dspqueue_callback_t packet_callback,
dspqueue_callback_t error_callback,
void * callback_context,
dspqueue_t * queue) {
return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback,
callback_context, queue);
}
AEEResult dspqueue_close(dspqueue_t queue) {
return dspqueue_close_pfn(queue);
}
AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) {
return dspqueue_export_pfn(queue, queue_id);
}
AEEResult dspqueue_write(dspqueue_t queue,
uint32_t flags,
uint32_t num_buffers,
struct dspqueue_buffer * buffers,
uint32_t message_length,
const uint8_t * message,
uint32_t timeout_us) {
return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us);
}
AEEResult dspqueue_read(dspqueue_t queue,
uint32_t * flags,
uint32_t max_buffers,
uint32_t * num_buffers,
struct dspqueue_buffer * buffers,
uint32_t max_message_length,
uint32_t * message_length,
uint8_t * message,
uint32_t timeout_us) {
return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length,
message, timeout_us);
}
HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) {
return remote_handle64_open_pfn(name, ph);
}
HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) {
return remote_handle64_invoke_pfn(h, dwScalars, pra);
}
HTPDRV_API int remote_handle64_close(remote_handle64 h) {
return remote_handle64_close_pfn(h);
}
HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
return remote_handle_control_pfn(req, data, datalen);
}
HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) {
return remote_handle64_control_pfn(h, req, data, datalen);
}
HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
return remote_session_control_pfn(req, data, datalen);
}
#ifdef _WIN32
static std::string wstr_to_str(std::wstring_view wstr) {
std::string result;
if (wstr.empty()) {
return result;
}
auto bytes_needed = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
wstr.data(), (int) wstr.size(),
nullptr, 0, nullptr, nullptr);
if (bytes_needed == 0) {
GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
throw std::runtime_error("Invalid wstring input");
}
result.resize(bytes_needed, '\0');
int bytes_written = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
wstr.data(), (int) wstr.size(),
result.data(), bytes_needed,
nullptr, nullptr);
if (bytes_written == 0) {
GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
throw std::runtime_error("Wstring conversion failed");
}
return result;
}
static std::string get_driver_path() {
std::wstring serviceName = L"qcnspmcdm";
std::string result;
// Get a handle to the SCM database.
SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ);
if (nullptr == schSCManager) {
GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError());
return result;
}
// Get a handle to the service.
SC_HANDLE schService = OpenServiceW(schSCManager, // SCM database
serviceName.c_str(), // name of service
SERVICE_QUERY_CONFIG); // need query config access
if (nullptr == schService) {
GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError());
CloseServiceHandle(schSCManager);
return result;
}
    // Query the required size of the service configuration buffer.
DWORD bufferSize;
if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) &&
(GetLastError() != ERROR_INSUFFICIENT_BUFFER)) {
GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
CloseServiceHandle(schService);
CloseServiceHandle(schSCManager);
return result;
}
// Get the configuration of the service.
LPQUERY_SERVICE_CONFIGW serviceConfig =
static_cast<LPQUERY_SERVICE_CONFIGW>(LocalAlloc(LMEM_FIXED, bufferSize));
if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) {
fprintf(stderr, "ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
LocalFree(serviceConfig);
CloseServiceHandle(schService);
CloseServiceHandle(schSCManager);
return result;
}
    // Read the driver file path and get its parent directory
std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName);
driverPath = driverPath.substr(0, driverPath.find_last_of(L"\\"));
// Clean up resources
LocalFree(serviceConfig);
CloseServiceHandle(schService);
CloseServiceHandle(schSCManager);
    // The returned driver path may start with an unresolvable prefix, e.g.:
    // \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37
    // "\SystemRoot" should be replaced with the actual system root (e.g. C:\Windows)
const std::wstring systemRootPlaceholder = L"\\SystemRoot";
if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) {
GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n");
return result;
}
// Replace \SystemRoot with an absolute path from system ENV windir
const std::wstring systemRootEnv = L"windir";
    // Query the number of wide characters this variable requires
DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
if (numWords == 0) {
GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
return result;
}
// Query the actual system root name from environment variable
std::vector<wchar_t> systemRoot(numWords + 1);
numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1);
if (numWords == 0) {
GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n");
return result;
}
driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data()));
return wstr_to_str(driverPath);
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
int htpdrv_init() {
static dl_handle_ptr lib_cdsp_rpc_handle = nullptr;
static bool initialized = false;
#ifdef _WIN32
std::string drv_path = get_driver_path() + "\\" + "libcdsprpc.dll";
#else
std::string drv_path = "libcdsprpc.so";
#endif
if (initialized) {
GGML_LOG_INFO("ggml-hex: Driver already loaded\n");
return AEE_SUCCESS;
}
GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str());
fs::path path{ drv_path.c_str() };
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error());
return AEE_EUNABLETOLOAD;
}
#define dlsym(drv, type, pfn, symbol, ignore) \
do { \
pfn = (type) dl_get_sym(drv, #symbol); \
if (!ignore && nullptr == pfn) { \
GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol); \
return AEE_EUNABLETOLOAD; \
} \
} while (0)
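// For example, the first invocation below expands (roughly) to:
//   rpcmem_alloc_pfn = (rpcmem_alloc_pfn_t) dl_get_sym(handle.get(), "rpcmem_alloc");
//   if (nullptr == rpcmem_alloc_pfn) { GGML_LOG_ERROR(...); return AEE_EUNABLETOLOAD; }
// A missing symbol is fatal unless the `ignore` flag is set (as for rpcmem_alloc2).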
dlsym(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false);
dlsym(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true);
dlsym(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false);
dlsym(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false);
dlsym(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false);
dlsym(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false);
dlsym(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false);
dlsym(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false);
dlsym(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false);
dlsym(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false);
dlsym(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false);
dlsym(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false);
dlsym(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false);
dlsym(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false);
dlsym(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false);
dlsym(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false);
dlsym(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false);
lib_cdsp_rpc_handle = std::move(handle);
initialized = true;
return AEE_SUCCESS;
}
domain * get_domain(int domain_id) {
int i = 0;
int size = sizeof(supported_domains) / sizeof(domain);
for (i = 0; i < size; i++) {
if (supported_domains[i].id == domain_id) {
return &supported_domains[i];
}
}
return NULL;
}
int get_hex_arch_ver(int domain, int * arch) {
if (!remote_handle_control_pfn) {
GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
struct remote_dsp_capability arch_ver;
arch_ver.domain = (uint32_t) domain;
arch_ver.attribute_ID = ARCH_VER;
arch_ver.capability = (uint32_t) 0;
int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
if (err != AEE_SUCCESS) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
return err;
}
switch (arch_ver.capability & 0xff) {
case 0x68:
*arch = 68;
return 0;
case 0x69:
*arch = 69;
return 0;
case 0x73:
*arch = 73;
return 0;
case 0x75:
*arch = 75;
return 0;
case 0x79:
*arch = 79;
return 0;
case 0x81:
*arch = 81;
return 0;
}
return -1;
}
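// Illustrative usage sketch (CDSP_DOMAIN_ID is assumed to come from the SDK headers):
// query the arch version of the compute DSP and keep a conservative fallback if the
// query is unsupported or fails.
static inline int example_query_arch(void) {
    int arch = 73;  // assumed fallback
    if (get_hex_arch_ver(CDSP_DOMAIN_ID, &arch) != 0) {
        // capability query unsupported or failed; keep the fallback value
    }
    return arch;
}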

View File

@ -0,0 +1,121 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _WIN32
# pragma clang diagnostic ignored "-Wignored-attributes"
#endif
#include <AEEStdErr.h>
#include <rpcmem.h>
#include <remote.h>
#include <dspqueue.h>
#if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define HTPDRV_API __declspec(dllexport) extern
# else
# define HTPDRV_API __declspec(dllimport) extern
# endif
#else
# define HTPDRV_API __attribute__ ((visibility ("default"))) extern
#endif
/* Offset to differentiate HLOS and Hexagon error codes.
Stores the value of AEE_EOFFSET for Hexagon. */
#ifndef DSP_OFFSET
# define DSP_OFFSET 0x80000400
#endif
/* Errno for connection reset by peer. */
#ifndef ECONNRESET
# ifdef __hexagon__
# define ECONNRESET 104
# endif
#endif
/* Abstraction of different OS specific sleep APIs.
SLEEP accepts input in seconds. */
#ifndef SLEEP
# ifdef __hexagon__
# define SLEEP(x) \
{ /* Do nothing for simulator. */ \
}
# else
# ifdef _WIN32
# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
# else
# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
# endif
# endif
#endif
/* Include windows specific header files. */
#ifdef _WIN32
# include <windows.h>
# include <sysinfoapi.h>
# define _CRT_SECURE_NO_WARNINGS 1
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
#endif
/* Includes and defines for all HLOS except windows */
#if !defined(__hexagon__) && !defined(_WIN32)
# include "unistd.h"
# include <sys/time.h>
#endif
/* Includes and defines for Hexagon and all HLOS except Windows. */
#if !defined(_WIN32)
/* Weak reference to remote symbol for compilation. */
# pragma weak remote_session_control
# pragma weak remote_handle_control
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
# pragma weak rpcmem_alloc2
#endif
#if !defined(_WIN32)
# pragma weak remote_system_request
#endif
#ifdef _WIN32
# define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE
#else
# define DSPQUEUE_TIMEOUT 1000000
#endif
/**
* htpdrv_init API: driver interface entry point
*
 * @return AEE_SUCCESS on success, otherwise an AEE error code as defined in the Hexagon SDK.
*/
HTPDRV_API int htpdrv_init(void);
/**
* get_domain API: get domain struct from domain value.
*
 * @param[in] domain_id value of the domain being queried
* @return Returns domain struct of the domain if it is supported or else
* returns NULL.
*
*/
HTPDRV_API domain * get_domain(int domain_id);
/**
* get_hex_arch_ver API: query the Hexagon processor architecture version information
*
 * @param[in] domain value of the domain being queried
 * @param[out] arch Arch version (73, 75, ...)
 * @return 0 if the query is successful.
 *         non-zero on error; the return value is the error code.
*
*/
HTPDRV_API int get_hex_arch_ver(int domain, int * arch);
#ifdef __cplusplus
}
#endif

View File

@ -1,454 +0,0 @@
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wsign-compare"
#define GGML_COMMON_IMPL_C
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include "ggml-hexagon.h"
#include "ggml-impl.h"
#include "htp-utils.h"
#include <domain.h>
#include <remote.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
domain * get_domain(int domain_id) {
int i = 0;
int size = sizeof(supported_domains) / sizeof(domain);
for (i = 0; i < size; i++) {
if (supported_domains[i].id == domain_id) {
return &supported_domains[i];
}
}
return NULL;
}
bool is_valid_domain_id(int domain_id, int compute_only) {
int i = 0;
int size = sizeof(supported_domains) / sizeof(domain);
if (compute_only) {
return is_CDSP(domain_id);
}
for (i = 0; i < size; i++) {
if (supported_domains[i].id == domain_id) {
return true;
}
}
return false;
}
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
int nErr = AEE_SUCCESS;
int ss_info = 0;
if (domain_type != NULL) {
if (strcmp(domain_type, "LPASS") == 0) {
ss_info = FASTRPC_LPASS;
} else if (strcmp(domain_type, "HPASS") == 0) {
ss_info = FASTRPC_HPASS;
} else {
ss_info = FASTRPC_NSP;
}
}
system_req_payload req = { 0 };
req.id = FASTRPC_GET_DOMAINS;
req.sys.domains = NULL;
fastrpc_domain * domain = NULL;
if (ss_info != 0) {
req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
} else {
req.sys.flags = 0;
}
#ifdef _WIN32
nErr = AEE_EUNSUPPORTED;
goto bail;
#endif
if (remote_system_request) {
nErr = remote_system_request(&req);
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
goto bail;
}
// Allocate memory for domain-info array
req.sys.max_domains = req.sys.num_domains;
if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
nErr = AEE_ENOMEMORY;
GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
goto bail;
}
nErr = remote_system_request(&req);
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
goto bail;
}
for (int i = 0; i < req.sys.num_domains; i++) {
// Verify that only requested type domains were returned
domain = &req.sys.domains[i];
if (domain->type != ss_info && domain_type != NULL) {
nErr = -1;
GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
goto bail;
}
}
*domains_info = req.sys.domains;
*num_domains = req.sys.num_domains;
} else {
nErr = AEE_EUNSUPPORTED;
goto bail;
}
bail:
if (nErr && !req.sys.domains) {
free(req.sys.domains);
}
return nErr;
}
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
int err = 0;
remote_rpc_effective_domain_id_t sess = { 0 };
sess.domain_name = domain_name;
sess.domain_name_len = strlen(domain_name);
sess.session_id = session_id;
err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
if (err) {
GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
session_id);
return err;
}
*effec_domain_id = sess.effective_domain_id;
return err;
}
int get_dsp_support(int * domain) {
int nErr = AEE_SUCCESS;
*domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID
if (remote_handle_control) {
struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
goto bail;
}
if (dsp_capability_domain.capability == 0) {
dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support.
dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
dsp_capability_domain.capability = 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
sizeof(struct remote_dsp_capability));
if (dsp_capability_domain.capability) {
*domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
}
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
if (attr == VTCM_PAGE || attr == VTCM_COUNT) {
} else {
nErr = AEE_EBADPARM;
GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
goto bail;
}
if (remote_handle_control) {
if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for VTCM information
* Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
*/
struct remote_dsp_capability dsp_capability_vtcm_dsp;
dsp_capability_vtcm_dsp.domain = (uint32_t) domain;
dsp_capability_vtcm_dsp.attribute_ID = attr;
dsp_capability_vtcm_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_vtcm_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("Unsupported domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
bool is_unsignedpd_supported(int domain_id) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
return false;
}
if (nErr) {
GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
return false;
}
if (dsp_capability_domain.capability == 1) {
return true;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
return false;
}
return false;
}
bool get_unsignedpd_support(void) {
return is_unsignedpd_supported(CDSP_DOMAIN_ID);
}
bool is_async_fastrpc_supported(int domain) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for ASYNC_FASTRPC_SUPPORT information
* Async fastrpc is supported only on CDSP
*/
struct remote_dsp_capability dsp_capability_async_support;
dsp_capability_async_support.domain = (uint32_t) domain;
dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
dsp_capability_async_support.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (dsp_capability_async_support.capability == 1) {
return true;
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return false;
}
bool is_status_notification_supported(int domain) {
int nErr = AEE_SUCCESS;
if (remote_handle_control) {
/*
* Query the DSP for STATUS_NOTIFICATION_SUPPORT information
* DSP User PD status notification Support
*/
struct remote_dsp_capability dsp_capability_status_notification_support;
dsp_capability_status_notification_support.domain = (uint32_t) domain;
dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
dsp_capability_status_notification_support.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (dsp_capability_status_notification_support.capability == 1) {
return true;
}
if (nErr != AEE_SUCCESS) {
GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return false;
}
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
nErr = AEE_EBADPARM;
GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
goto bail;
}
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for HMX SUPPORT information
* HMX is supported on CDSP only
*/
struct remote_dsp_capability dsp_capability_hmx_dsp;
dsp_capability_hmx_dsp.domain = (uint32_t) domain;
dsp_capability_hmx_dsp.attribute_ID = attr;
dsp_capability_hmx_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_hmx_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}
int get_hex_arch_ver(int domain, int * arch) {
if (!remote_handle_control) {
GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
struct remote_dsp_capability arch_ver;
arch_ver.domain = (uint32_t) domain;
arch_ver.attribute_ID = ARCH_VER;
arch_ver.capability = (uint32_t) 0;
int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
return AEE_EUNSUPPORTEDAPI;
}
if (err != AEE_SUCCESS) {
GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
return err;
}
switch (arch_ver.capability & 0xff) {
case 0x68:
*arch = 68;
return 0;
case 0x69:
*arch = 69;
return 0;
case 0x73:
*arch = 73;
return 0;
case 0x75:
*arch = 75;
return 0;
case 0x79:
*arch = 79;
return 0;
case 0x81:
*arch = 81;
return 0;
}
return -1;
}
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
int nErr = AEE_SUCCESS;
*capability = 0;
if (remote_handle_control) {
if (domain == CDSP_DOMAIN_ID) {
/*
* Query the DSP for HVX SUPPORT information
* HVX is supported on CDSP only
*/
struct remote_dsp_capability dsp_capability_hvx_dsp;
dsp_capability_hvx_dsp.domain = (uint32_t) domain;
dsp_capability_hvx_dsp.attribute_ID = attr;
dsp_capability_hvx_dsp.capability = (uint32_t) 0;
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
sizeof(struct remote_dsp_capability));
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
nErr = AEE_SUCCESS;
goto bail;
} else if (nErr == AEE_SUCCESS) {
*capability = dsp_capability_hvx_dsp.capability;
} else {
GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTED;
GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
goto bail;
}
} else {
nErr = AEE_EUNSUPPORTEDAPI;
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
}
bail:
return nErr;
}

View File

@ -1,221 +0,0 @@
#ifndef HTP_UTILS_H
#define HTP_UTILS_H
#ifdef __cplusplus
extern "C" {
#endif
#include <AEEStdErr.h>
#include <inttypes.h>
#include <remote.h>
#include <rpcmem.h>
#include <stdbool.h>
/* Offset to differentiate HLOS and Hexagon error codes.
Stores the value of AEE_EOFFSET for Hexagon. */
#ifndef DSP_OFFSET
# define DSP_OFFSET 0x80000400
#endif
/* Errno for connection reset by peer. */
#ifndef ECONNRESET
# ifdef __hexagon__
# define ECONNRESET 104
# endif
#endif
/* Abstraction of different OS specific sleep APIs.
SLEEP accepts input in seconds. */
#ifndef SLEEP
# ifdef __hexagon__
# define SLEEP(x) \
{ /* Do nothing for simulator. */ \
}
# else
# ifdef _WINDOWS
# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
# else
# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
# endif
# endif
#endif
/* Include windows specific header files. */
#ifdef _WINDOWS
# include <sysinfoapi.h>
# include <windows.h>
# define _CRT_SECURE_NO_WARNINGS 1
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
/* Including this file for custom implementation of getopt function. */
# include "getopt_custom.h"
#endif
/* Includes and defines for all HLOS except windows */
#if !defined(__hexagon__) && !defined(_WINDOWS)
# include "unistd.h"
# include <sys/time.h>
#endif
/* Includes and defines for Hexagon and all HLOS except Windows. */
#if !defined(_WINDOWS)
/* Weak reference to remote symbol for compilation. */
# pragma weak remote_session_control
# pragma weak remote_handle_control
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
# pragma weak rpcmem_alloc2
#endif
#if !defined(_WINDOWS)
# pragma weak remote_system_request
#endif
/**
* Wrapper for FastRPC Capability API: query DSP support.
*
* @param[out] domain pointer to supported domain.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*/
int get_dsp_support(int * domain);
/**
* Wrapper for FastRPC Capability API: query VTCM information.
*
* @param[in] domain value of domain in the queried.
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to the queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*/
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);
/**
* Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
*
* @return true if unsigned pd is supported.
* false if unsigned pd is not supported, capability query failed.
*/
bool get_unsignedpd_support(void);
/**
* Wrapper for FastRPC Capability API: query unsigned pd support.
*
* @param[in] domain value of domain in the queried.
* @return true if unsigned pd is supported.
* false if unsigned pd is not supported, capability query failed.
*/
bool is_unsignedpd_supported(int domain_id);
/**
* is_valid_domain_id API: query a domain id is valid.
*
* @param[in] domain value of domain in the queried.
* @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled.
* @return true if value of domain is valid.
* false if value of domain is not valid.
*/
bool is_valid_domain_id(int domain_id, int compute_only);
/**
* get_domain API: get domain struct from domain value.
*
* @param[in] domain value of a domain
* @return Returns domain struct of the domain if it is supported or else
* returns NULL.
*
*/
domain * get_domain(int domain_id);
/**
* get_domains_info API: get information for all the domains available on the device
*
* @param[in] domain_type pointer to domain type
* @param[in] num_domains pointer to number of domains
* @param[in] domains_info pointer to save discovered domains information.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
* It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application.
*
*/
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);
/**
* get_effective_domain_id API: get effective domain id for given session id
*
* @param[in] domain_name pointer to domain name
* @param[in] session_id
* @param[in] effec_domain_id pointer to save obtained effective domain id.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);
/**
* is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not
*
* @param[in] domain_id value of a domain
* @return Returns true or false stating support of Async FastRPC
*
*/
bool is_async_fastrpc_supported(int domain_id);
/**
* is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
*
* @param[in] domain_id value of a domain
* @return Returns true or false stating status notification support information
*
*/
bool is_status_notification_supported(int domain_id);
/**
* get_hmx_support_info API: query the DSP for HMX SUPPORT information
*
* @param[in] domain_id value of a domain
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to the queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);
/**
* get_hex_arch_ver API: query the Hexagon processor architecture version information
*
* @param[in] domain_id value of a domain
* @param[out] Arch version (73, 75, ...)
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hex_arch_ver(int domain, int * arch);
/**
* get_hvx_support_info API: query the DSP for HVX SUPPORT information
*
* @param[in] domain_id value of a domain
* @param[out] capability capability value of the attribute queried.
* @param[in] attr value of the attribute to the queried.
* @return 0 if query is successful.
* non-zero if error, return value points to the error.
*
*/
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);
#ifdef __cplusplus
}
#endif
#endif //DSP_CAPABILITIES_UTILS_H

View File

@ -17,6 +17,12 @@
#include "htp-msg.h"
#include "htp-ops.h"
static inline HVX_Vector hvx_load_f32_to_f16(const HVX_Vector * restrict src, const HVX_Vector zero) {
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(src[0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(src[1], zero); // 32 elements
return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
}
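// Scalar reference for hvx_load_f32_to_f16 (a sketch; __fp16 is assumed to be
// available, as on Hexagon clang): 64 fp32 values held in two 128-byte vectors
// are converted into a single vector of 64 fp16 values, preserving element order.
static inline void load_f32_to_f16_ref(const float * restrict src, __fp16 * restrict dst) {
    for (int i = 0; i < 64; i++) {
        dst[i] = (__fp16) src[i];
    }
}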
// Dot product of FP32 and FP16 vectors, accumulating to float
static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
@ -33,23 +39,19 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
#pragma unroll(4)
for (i = 0; i < nvec; i++) {
// Load y (fp32) and convert into fp16
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x_hf = vx[i];
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
if (nloe) {
// Load y (fp32) and convert into fp16
HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements
HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements
HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x_hf = vx[i];
@ -62,13 +64,72 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
}
hvx_vec_store_u(r, 4, rsum);
// Dot product of FP32 and FP16 vectors, accumulating to float
static inline void hvx_dot_f32_f16_aa_rx2(float * restrict r,
const void * restrict y,
const void * restrict x0,
const void * restrict x1,
unsigned int n,
float s) {
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
const HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
uint32_t i = 0;
#pragma unroll(2)
for (i = 0; i < nvec; i++) {
// Load y (fp32) and convert into fp16
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
if (nloe) {
// Load y (fp32) and convert into fp16
HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero);
// Load x (fp16)
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
// Zero-out unused elements
// Note that we need to clear both x and y because they may contain NANs
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
x0_hf = Q6_V_vand_QV(bmask, x0_hf);
x1_hf = Q6_V_vand_QV(bmask, x1_hf);
y_hf = Q6_V_vand_QV(bmask, y_hf);
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
}
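// Scalar reference for hvx_dot_f32_f16_aa_rx2 (a sketch; the fp16 rounding of y is
// ignored): one fp32 row y is dotted against two fp16 rows x0/x1 and the two scaled
// sums land in r[0] and r[1].
static inline void dot_f32_f16_rx2_ref(float * restrict r,
                                       const float * restrict y,
                                       const __fp16 * restrict x0,
                                       const __fp16 * restrict x1,
                                       unsigned int n, float s) {
    float acc0 = 0.0f, acc1 = 0.0f;
    for (unsigned int i = 0; i < n; i++) {
        acc0 += y[i] * (float) x0[i];
        acc1 += y[i] * (float) x1[i];
    }
    r[0] = s * acc0;
    r[1] = s * acc1;
}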
// Dot product of two F16 vectors, accumulating to float
@ -91,7 +152,7 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
if (nloe) {
@ -103,12 +164,62 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
}
rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
hvx_vec_store_u(r, 4, rsum);
rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
}
static inline void hvx_dot_f16_f16_aa_rx2(float * restrict r,
const void * restrict y,
const void * restrict x0,
const void * restrict x1,
unsigned int n,
float s) {
const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16
const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16
const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16
uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors
uint32_t nloe = n % VLEN_FP16; // leftover elements
const HVX_Vector zero = Q6_V_vsplat_R(0);
HVX_Vector rsum0 = Q6_V_vsplat_R(0);
HVX_Vector rsum1 = Q6_V_vsplat_R(0);
uint32_t i = 0;
#pragma unroll(4)
for (i = 0; i < nvec; i++) {
HVX_Vector y_hf = vy[i];
HVX_Vector x0_hf = vx0[i];
HVX_Vector x1_hf = vx1[i];
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
if (nloe) {
HVX_Vector y_hf = vy[i];
// Load x (fp16) and zero-out unused elements
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
HVX_Vector x0_hf = Q6_V_vand_QV(bmask, vx0[i]);
HVX_Vector x1_hf = Q6_V_vand_QV(bmask, vx1[i]);
HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
}
HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
}
// MAD: y (F32) += x (F16) * s (float)
@ -317,20 +428,22 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
// Inner loop processing the block from VTCM
uint32_t ic = 0;
const bool is_q_fp32 = (q->type == HTP_TYPE_F32);
// Process in blocks of 32 (VLEN_FP32)
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 == 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
HVX_Vector_x4 scores_x4;
HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
// 1. Compute scores
float __attribute__((aligned(VLEN))) scores_arr[FLASH_ATTN_BLOCK_SIZE];
for (int j = 0; j < VLEN_FP32; ++j) {
float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
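            // Two K rows are consumed per iteration (j += 2): the rx2 dot kernels reuse a
            // single load/convert of the Q row for both K rows.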
for (int j = 0; j < VLEN_FP32; j += 2) {
const uint32_t cur_ic = ic + j;
const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
if (q->type == HTP_TYPE_F32) {
hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
if (is_q_fp32) {
hvx_dot_f32_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
} else {
hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
}
}
@ -403,7 +516,7 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
float s_val;
const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
if (q->type == HTP_TYPE_F32) {
if (is_q_fp32) {
hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
} else {
hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);

View File

@ -28,19 +28,16 @@ static void hvx_vec_dump_f16(char * pref, HVX_Vector v) {
}
static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) {
union {
HVX_Vector v;
float d[32];
} u = { .v = v };
HVX_VectorAlias u = { .v = v };
const uint32_t n0 = n / 16;
const uint32_t n1 = n % 16;
int i = 0;
for (; i < n0; i++) {
hex_dump_f32_line(pref, u.d + (16 * i), 16);
hex_dump_f32_line(pref, u.fp32 + (16 * i), 16);
}
if (n1) {
hex_dump_f32_line(pref, u.d + (16 * i), n1);
hex_dump_f32_line(pref, u.fp32 + (16 * i), n1);
}
}

View File

@ -44,6 +44,45 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) {
return hvx_vec_reduce_sum_n_qf32(in, 32);
}
#if __HVX_ARCH__ > 75
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8));
sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 16));
return sum_sf;
}
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
HVX_Vector sum = in, sum_t;
while (width < total) {
sum_t = Q6_V_vror_VR(sum, width); // rotate right
sum = Q6_Vsf_vadd_VsfVsf(sum, sum_t); // elementwise sum
width = width << 1;
}
return sum;
}
#else
static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8));
sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 16));
return Q6_Vsf_equals_Vqf32(sum_qf);
}
static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
unsigned int total = n * 4; // total vec nbytes
unsigned int width = 4; // fp32 nbytes
@ -57,6 +96,8 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n)
return sum;
}
#endif
static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) {
return hvx_vec_reduce_sum_n_f32(in, 32);
}
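// Scalar reference for hvx_vec_reduce_sum_f32x2 (a sketch): both inputs are reduced
// independently; after the interleave and rotations above, the total of in0 lands in
// lane 0 and the total of in1 in lane 1 of the returned vector.
static inline void reduce_sum_f32x2_ref(const float in0[32], const float in1[32], float out[2]) {
    float s0 = 0.0f, s1 = 0.0f;
    for (int i = 0; i < 32; i++) {
        s0 += in0[i];
        s1 += in1[i];
    }
    out[0] = s0;
    out[1] = s1;
}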

View File

@ -11,6 +11,7 @@
#include "hex-dma.h"
#include "hvx-utils.h"
#include "hvx-dump.h"
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
@ -320,7 +321,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -344,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -362,14 +363,14 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
// Zero out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -402,7 +403,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
@ -432,8 +433,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -456,20 +457,18 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -493,7 +492,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -517,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -535,14 +534,14 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
// Zero out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -605,8 +604,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@ -629,20 +628,18 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
@ -669,7 +666,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
@ -708,7 +705,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Process leftovers
@ -741,14 +738,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
// Zero-out unused scales
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
}
// Reduce and convert into fp32
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
hvx_vec_store_u(&s[0], 4, r0_sum);
}
@ -781,13 +778,13 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first
const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales
// Row sum (qf32)
// Row sum (sf)
HVX_Vector r0_sum = Q6_V_vsplat_R(0);
HVX_Vector r1_sum = Q6_V_vsplat_R(0);
// Multiply and accumulate into int32.
// Compute combined scale (fp32).
// Apply scale to acc and accumulate into the row sum (qf32).
// Apply scale to acc and accumulate into the row sum (f32).
const uint32_t nb = n / qk; // num full blocks
    int32_t  nloe = n % qk;       // num leftover elements (must be signed)
@ -829,8 +826,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Process leftovers
@ -867,24 +864,22 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
// Zero-out unused scales
// Zero-out unused values
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
r0_dd = Q6_V_vand_QV(bmask, r0_dd);
r1_dd = Q6_V_vand_QV(bmask, r1_dd);
r0_ia = Q6_V_vand_QV(bmask, r0_ia);
r1_ia = Q6_V_vand_QV(bmask, r1_ia);
HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
}
// Convert into fp32 and reduce
r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -913,7 +908,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}
@ -957,11 +952,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n,
rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)));
}
rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0));
rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1));
HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4);
hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1));
hvx_vec_store_u(&s[0], 8, rsum);
}
static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@ -990,7 +982,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}
@ -1042,7 +1034,8 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res
rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
}
rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
// Convert into fp32 and reduce
rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
hvx_vec_store_u(&s[0], 4, rsum);
}

View File

@ -154,8 +154,8 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
v_pad[i] = v3;
}
v = hvx_vec_reduce_sum_qf32(sum_vec);
sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec));
sum_vec = hvx_vec_repl4(v);
HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec);

View File

@ -57,8 +57,8 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
}
HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v);
sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
sum_v = hvx_vec_repl4(reduced_sum);
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);

View File

@ -0,0 +1,79 @@
#pragma once
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
# include <winevt.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
#include <filesystem>
namespace fs = std::filesystem;
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
static inline dl_handle * dl_load_library(const fs::path & path) {
// suppress error dialogs for missing DLLs
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
HMODULE handle = LoadLibraryW(path.wstring().c_str());
SetErrorMode(old_mode);
return handle;
}
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
void * p = (void *) GetProcAddress(handle, name);
SetErrorMode(old_mode);
return p;
}
static inline const char * dl_error() {
return "";
}
#else
using dl_handle = void;
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
static inline dl_handle * dl_load_library(const fs::path & path) {
dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
static inline void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name);
}
static inline const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif
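// Illustrative usage sketch (not part of this header; the library and symbol
// names below are only examples): the deleter pairs with std::unique_ptr so a
// loaded library is released automatically, e.g.
//   std::unique_ptr<dl_handle, dl_handle_deleter> lib(dl_load_library("libfoo.so"));
//   if (!lib) { /* inspect dl_error() */ }
//   auto fn = (void (*)(void)) dl_get_sym(lib.get(), "foo_init");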

View File

@ -0,0 +1,38 @@
[Version]
Signature = "$WINDOWS NT$"
Class = ComputeAccelerator
ClassGuid = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C}
Provider = %GGML%
DriverVer = 01/01/2026,1.0.0.0
CatalogFile = libggml-htp.cat
PnpLockDown = 1
[DestinationDirs]
Drivers_Dir = 6
[SourceDisksNames]
1 = %DiskId%
[SourceDisksFiles]
libggml-htp-v68.so = 1
libggml-htp-v69.so = 1
libggml-htp-v73.so = 1
libggml-htp-v75.so = 1
libggml-htp-v81.so = 1
[ControlFlags]
ExcludeFromSelect = *
[DefaultInstall.NTarm64]
CopyFiles=Drivers_Dir
[Drivers_Dir]
libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
[Strings]
GGML = 'GGML'
DiskId = 'GGML HTP library'

View File

@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS
mul_mm_f32_f32_l4_lm
mul_mm_f16_f32_l4_lm
mul_mm_q8_0_f32_l4_lm
mul_mm_q8_0_f32_8x4
gemv_noshuffle_general_q8_0_f32
mul
norm
relu

View File

@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
return ADRENO_GPU_GEN::A7X;
}
if (strstr(device_name, "830")) {
if (strstr(device_name, "830") ||
strstr(device_name, "840")) {
return ADRENO_GPU_GEN::A8X;
}
@ -529,7 +530,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
cl_kernel kernel_convert_block_q4_0_noshuffle;
cl_kernel kernel_restore_block_q4_0_noshuffle;
@ -696,6 +697,8 @@ struct ggml_backend_opencl_context {
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
cl_kernel kernel_mul_mm_q8_0_f32_8x4;
cl_kernel CL_mul_mat_vec_q8_0_f32;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
void free() {
@ -894,6 +897,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
GGML_LOG_CONT(".");
@ -2290,6 +2294,46 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// mul_mm_q8_0_f32_8x4
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src_q8_8x4_gemm {
#include "mul_mm_q8_0_f32_8x4.cl.h"
};
#else
const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
#endif
backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
GGML_LOG_CONT(".");
}
// gemv_noshuffle_general_q8_0_f32
{
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -DSIMDGROUP_WIDTH=" +
std::to_string(backend_ctx->adreno_wave_size);
if (backend_ctx->has_vector_subgroup_broadcast) {
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
}
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src_CL_gemv_general {
#include "gemv_noshuffle_general_q8_0_f32.cl.h"
};
#else
const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
#endif
cl_program prog = build_program_from_source(
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
" -cl-mad-enable "
" -cl-fast-relaxed-math";
@ -3745,6 +3789,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
}
inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);
size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
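    // Note: the 2**27 element cap likely keeps the 1D image buffer views used
    // for the weight transpose within the device's CL_DEVICE_IMAGE_MAX_BUFFER_SIZE
    // limit (assumption, not verified against every Adreno part).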
return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27
}
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
@ -4159,6 +4212,130 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
tensor->extra = extra;
// Transpose the weights and scales
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (enable_adreno_trans_weight(backend_ctx, tensor)) {
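        // Overall flow: transpose the packed q8_0 quants and their scales into the
        // preallocated staging buffers using image-based transpose kernels, then copy
        // the transposed data back over extra->q / extra->d so the 8x4 GEMM and the
        // no-shuffle GEMV kernels can read the weights in their expected layout.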
int M = tensor->ne[1]; // ne01
int K = tensor->ne[0]; // ne00
GGML_ASSERT(K % 32 == 0);
GGML_ASSERT(M % 4 == 0);
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
// Transpose weights
size_t q_size_bytes = K * M / 4 * sizeof(float);
cl_buffer_region region;
region.origin = 0;
region.size = q_size_bytes;
cl_mem qT_d = clCreateSubBuffer(
backend_ctx->prealloc_quant_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,
&err);
CL_CHECK(err);
cl_mem q_d_image1D;
cl_mem qT_d_image1D;
cl_image_format img_fmt_1d;
cl_image_desc img_desc_1d;
img_fmt_1d = { CL_RGBA, CL_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 4 / 4;
img_desc_1d.buffer = extra->q;
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
img_fmt_1d = { CL_RGBA, CL_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 4 / 4;
img_desc_1d.buffer = qT_d;
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
int height_q = M / 4;
int width_q = K / 4 / 4;
kernel = backend_ctx->kernel_transpose_32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
size_t local_size_q[3] = {4, 16, 1};
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
// Transpose scales
size_t d_size_bytes = M * (K / 32) * 2;
region.origin = 0;
region.size = d_size_bytes;
cl_mem dT_d = clCreateSubBuffer(
backend_ctx->prealloc_scales_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,
&err);
CL_CHECK(err);
cl_mem d_d_image1D;
cl_mem dT_d_image1D;
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_fmt_1d = { CL_R, CL_HALF_FLOAT };
img_desc_1d.image_width = M * K / 32;
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.buffer = extra->d;
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 32 / 4;
img_desc_1d.buffer = dT_d;
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
CL_CHECK(err);
int height_s = M / 4;
int width_s = K / 32;
kernel = backend_ctx->kernel_transpose_16_4x1;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
size_t local_size_s[3] = {4, 16, 1};
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
// copy transposed buffer contents to original buffers
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clReleaseMemObject(qT_d));
CL_CHECK(clReleaseMemObject(dT_d));
CL_CHECK(clReleaseMemObject(q_d_image1D));
CL_CHECK(clReleaseMemObject(d_d_image1D));
CL_CHECK(clReleaseMemObject(qT_d_image1D));
CL_CHECK(clReleaseMemObject(dT_d_image1D));
} // end transpose
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
return;
}
if (tensor->type == GGML_TYPE_Q6_K) {
@ -4448,6 +4625,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
ggml_nbytes(tensor), NULL, &err);
CL_CHECK(err);
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
if (enable_adreno_trans_weight(backend_ctx, tensor)) {
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
int ne00 = tensor->ne[0];
int ne01 = tensor->ne[1];
GGML_ASSERT(tensor->ne[2] == 1); // ???
GGML_ASSERT(tensor->ne[3] == 1); // ???
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
size_t local_work_size[3] = {64, 1, 1};
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
global_work_size, local_work_size, 0, NULL, &evt));
CL_CHECK(clWaitForEvents(1, &evt));
CL_CHECK(clEnqueueReadBuffer(
queue, data_device, CL_TRUE, offset,
size, data, 0, NULL, NULL));
CL_CHECK(clReleaseMemObject(data_device));
return;
}
#endif
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
@ -7947,6 +8154,252 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
CL_CHECK(clReleaseMemObject(D_sub_buffer));
}
static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
const enum ggml_type src0t = src0->type;
const enum ggml_type src1t = src1->type;
GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
GGML_ASSERT(src1t == GGML_TYPE_F32);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
GGML_ASSERT(src1->view_offs == 0);
GGML_ASSERT(dst->view_offs == 0);
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne10 = src1->ne[0];
const int ne12 = src1->ne[2];
const int ne0 = dst->ne[0];
const int ne1 = dst->ne[1];
GGML_ASSERT(ne00 == ne10);
GGML_ASSERT((ne00 % 32) == 0);
GGML_ASSERT(ne0 == ne01);
cl_context context = backend_ctx->context;
cl_kernel kernel;
// init CL objects
cl_int status;
cl_image_format img_fmt_1d;
cl_image_desc img_desc_1d;
cl_buffer_region region;
cl_mem A_image1d;
cl_mem B_image1d;
cl_mem B_sub_buffer;
cl_mem S_image1d;
cl_mem D_image1d;
cl_mem D_sub_buffer;
int M = ne01;
int N = ne1;
int K = ne00;
// create an image for A
img_fmt_1d = { CL_R, CL_FLOAT};
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
img_desc_1d.buffer = extra0_q8_0->q;
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
CL_CHECK(status);
// create an image for Scale
img_fmt_1d = { CL_R, CL_HALF_FLOAT};
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * K / 32; // Block size is 32
img_desc_1d.buffer = extra0_q8_0->d;
S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
CL_CHECK(status);
// create a sub_buffer for B
region.origin = (extra1->offset); // + src1->view_offs
region.size = K * N * sizeof(float);
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
CL_CHECK(status);
// create an image for B from sub_buffer: RGBA (OCL)
img_fmt_1d = {CL_RGBA, CL_FLOAT};
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = K * N / 4;
img_desc_1d.buffer = B_sub_buffer;
B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
CL_CHECK(status);
// Create subbuffer and image1d_buffer for dst
region.origin = (extrad->offset); // + dst->view_offs;
region.size = M * N * sizeof(float);
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
CL_CHECK(status);
img_fmt_1d = {CL_R, CL_FLOAT};
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
img_desc_1d.image_width = M * N;
img_desc_1d.buffer = D_sub_buffer;
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
CL_CHECK(status);
size_t local_work_size[3] = {1, 1, 1};
size_t global_work_size[3] = {1, 1, 1};
if (N == 1) {
kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
int r2 = 1;
int r3 = 1;
cl_uint k_arg = 0;
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
size_t wavesize = backend_ctx->adreno_wave_size;
local_work_size[0] = wavesize;
local_work_size[1] = 4; // reduce factor
local_work_size[2] = 1;
global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
global_work_size[1] = 4; // reduce factor
global_work_size[2] = 1;
} else {
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_mem B_image1d_trans = nullptr;
// for B transpose
cl_mem B_d = nullptr;
int padding;
// how many extra elements beyond a multiple of 8
int extra_elements = N % 8;
// how much padding to add
padding = 0;
if (extra_elements > 0){
padding = 8 - extra_elements;
}
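// e.g. N = 10 -> extra_elements = 2 -> padding = 6, so N + padding = 16 is a multiple of 8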
// Specify the starting offset (in bytes)
region.origin = 0;
// Specify the size of the sub-buffer (divide by 2 for FP16)
region.size = K * (N + padding) * sizeof(float)/2;
backend_ctx->prealloc_act_trans.allocate(context, region.size);
B_d = clCreateSubBuffer(
backend_ctx->prealloc_act_trans.buffer,
0,
CL_BUFFER_CREATE_TYPE_REGION,
&region,
&status);
CL_CHECK(status);
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
cl_image_desc image_desc_B_d_output = {
CL_MEM_OBJECT_IMAGE1D_BUFFER,
static_cast<size_t>(K * (N + padding)/4),
0, 0, 0, 0, 0, 0, 0, { B_d }
};
B_image1d_trans = clCreateImage(
context,
0,
&image_format_B_d_output,
&image_desc_B_d_output,
NULL,
&status);
CL_CHECK(status);
int height_B = N/4;
if (height_B == 0) {
height_B = 1;
}
int width_B = K/4;
int padded_height_B = (N + padding)/4;
kernel = backend_ctx->kernel_transpose_32_16;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
size_t local_size_t[2] = { 1, 16 };
size_t global_size_t[2] = {
static_cast<size_t>(width_B),
static_cast<size_t>(padded_height_B)
};
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
int N_with_padding = N + padding;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
global_work_size[0] = (size_t)(N + 7) / 8;
global_work_size[1] = (size_t)(M + 3) / 4;
global_work_size[2] = 1;
local_work_size[0] = 2;
local_work_size[1] = 128;
local_work_size[2] = 1;
}
// enqueue kernel with profiling
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
// deallocate sub buffers and images
CL_CHECK(clReleaseMemObject(A_image1d));
CL_CHECK(clReleaseMemObject(B_sub_buffer));
CL_CHECK(clReleaseMemObject(B_image1d));
CL_CHECK(clReleaseMemObject(S_image1d));
CL_CHECK(clReleaseMemObject(D_sub_buffer));
CL_CHECK(clReleaseMemObject(D_image1d));
#else
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
#endif
}
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@ -8064,6 +8517,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
int padding;
// <--------------------------------------------> //
// q8_0 x fp32
if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
enable_adreno_trans_weight(backend_ctx, src0)) {
ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
return;
}
// q4_0 x fp32
if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
// TODO: remove duplicate definitions of image description + format -- move to top

View File

@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0(
}
}
kernel void kernel_restore_block_q8_0_trans(
global uchar * src_q,
global half * src_d,
global block_q8_0 * dst,
uint ne00,
uint ne01
){
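    // Each work-item rebuilds one row of blocks from the transposed layout:
    // groups of 4 quantized bytes are stored for all ne01 rows back-to-back
    // (stride 4 * ne01 bytes), and per-block scales are stored with stride ne01.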
uint num_blk_per_row = ne00 / QK8_0;
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
global half * d = (global half *) src_d + get_global_id(0);
for (uint blk = 0; blk < num_blk_per_row; blk++) {
b->d = *d;
for (uint i = 0; i < QK8_0; i+=4) {
b->qs[i] = q[0];
b->qs[i+1] = q[1];
b->qs[i+2] = q[2];
b->qs[i+3] = q[3];
q += 4 * ne01; // M stride
}
d += ne01;
b++;
}
}
//------------------------------------------------------------------------------
// kernel_convert_block_q6_K
// Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).

View File

@ -0,0 +1,195 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
#define QK8_0 32
#define N_SIMDGROUP 4
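// Accumulate one q8_0 block per output row: each of the 8 packed uints in bits8
// holds 4 int8 weights; every weight is scaled by the per-block scale and
// multiplied by the matching activation broadcast from lanes 0-3 of the
// subgroup (the first 4 lanes each hold 8 activation values in regB).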
#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \
float shared_y; \
char elem; \
\
shared_y = sub_group_broadcast(y.s0, 0); \
elem = (char)(bits8.s0 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 0); \
elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 0); \
elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 0); \
elem = (char)((bits8.s0 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 0); \
elem = (char)(bits8.s1 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 0); \
elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 0); \
elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 0); \
elem = (char)((bits8.s1 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 1); \
elem = (char)(bits8.s2 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 1); \
elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 1); \
elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 1); \
elem = (char)((bits8.s2 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 1); \
elem = (char)(bits8.s3 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 1); \
elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 1); \
elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 1); \
elem = (char)((bits8.s3 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 2); \
elem = (char)(bits8.s4 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 2); \
elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 2); \
elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 2); \
elem = (char)((bits8.s4 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 2); \
elem = (char)(bits8.s5 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 2); \
elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 2); \
elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 2); \
elem = (char)((bits8.s5 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s0, 3); \
elem = (char)(bits8.s6 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s1, 3); \
elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s2, 3); \
elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s3, 3); \
elem = (char)((bits8.s6 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
\
shared_y = sub_group_broadcast(y.s4, 3); \
elem = (char)(bits8.s7 & 0x000000FF); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s5, 3); \
elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s6, 3); \
elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \
total_sums += convert_int(elem) * scale * shared_y; \
shared_y = sub_group_broadcast(y.s7, 3); \
elem = (char)((bits8.s7 & 0xFF000000) >> 24); \
total_sums += convert_int(elem) * scale * shared_y; \
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
__kernel void kernel_gemv_noshuffle(
__read_only image1d_buffer_t src0_q, // quantized A
global half * src0_d, // A scales
__read_only image1d_buffer_t src1, // B
ulong offset1, // offset to B (0)
global float * dst, // C
ulong offsetd, // offset to C
int ne00, // K
int ne01, // M
int ne02, // 1
int ne10, // K
int ne12, // 1
int ne0, // M
int ne1, // N
int r2, // 1
int r3)
{
uint groupId = get_local_id(1);
uint gid = get_global_id(0);
ushort slid = get_sub_group_local_id();
uint K = ne00;
uint M = ne01;
uint LINE_STRIDE_A = M;
uint BLOCK_STRIDE_A = 8 * M; // 32 / 4 = 8
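    // Transposed weight layout: one q8_0 block (32 int8 = 8 packed uints) per row
    // is spread across 8 consecutive lines of width M, so adjacent work-items
    // (adjacent output rows) read adjacent texels of src0_q.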
__private uint8 regA;
__private half regS;
__private float8 regB;
__private float totalSum = (float)(0.0f);
// loop along K in block granularity, skip 4 blocks every iter
#pragma unroll 1 /* tell compiler not to unroll */
for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) {
regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads the scale of one row
// first 4 fibers in each wave load 8 B values into their private scope
if (slid < 4) {
regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
}
// load weights for one block in consecutive rows
regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
}
// reduction in local memory, assumes #wave=4
__local float reduceLM[SIMDGROUP_WIDTH * 3];
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
// 1 output per fiber in wave 0
if (groupId == 0) {
dst = (global float*)((global char*)dst + offsetd);
dst[gid] = totalSum;
}
}

View File

@ -0,0 +1,129 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_128
#endif
kernel void kernel_mul_mm_q8_0_f32_8x4(
global const uint * src0_q,
global const half * src0_d,
__read_only image1d_buffer_t src1,
global float * dst,
int k,
int m,
int n,
int n_no_padding,
ulong offsetd
) {
int m_4 = m >> 2;
int n_4 = n >> 2;
int gy = get_global_id(0);
int gx = get_global_id(1);
int gx_2 = gx << 2;
dst = (global float *)((global char*)dst + offsetd);
half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
half8 B;
half4 deq;
__global const uint* wptr = src0_q + gx_2;
__global const half* sptr = src0_d + gx_2;
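    // Weights and scales are pre-transposed: for each group of 4 values along K,
    // the packed uints of all m rows are stored contiguously (stride m), and the
    // per-block scales are laid out the same way, so vload4 fetches 4 adjacent rows.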
for (int i = 0; i < k; i += 4) {
uint4 pack4 = vload4(0, wptr + (i / 4) * m);
half4 scale = vload4(0, sptr + (i / 32) * m);
char4 p0 = as_char4(pack4.s0);
char4 p1 = as_char4(pack4.s1);
char4 p2 = as_char4(pack4.s2);
char4 p3 = as_char4(pack4.s3);
// ------------------- j = 0 (k = i+0) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1);
half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale;
c0 += B * wj0.s0;
c1 += B * wj0.s1;
c2 += B * wj0.s2;
c3 += B * wj0.s3;
// ------------------- j = 1 (k = i+1) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1);
half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale;
c0 += B * wj1.s0;
c1 += B * wj1.s1;
c2 += B * wj1.s2;
c3 += B * wj1.s3;
// ------------------- j = 2 (k = i+2) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1);
half4 wj2 = convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale;
c0 += B * wj2.s0;
c1 += B * wj2.s1;
c2 += B * wj2.s2;
c3 += B * wj2.s3;
// ------------------- j = 3 (k = i+3) -------------------
B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4);
B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1);
half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale;
c0 += B * wj3.s0;
c1 += B * wj3.s1;
c2 += B * wj3.s2;
c3 += B * wj3.s3;
}
int idx = (gy << 3) * m + (gx << 2);
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
idx += m;
}
if(idx+3 < m*n_no_padding){
vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
}
}

View File

@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) {
return sycl::log(x);
}
template<typename T>
static __dpct_inline__ T op_softplus(T x) {
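    // Numerically stable softplus: log(1 + exp(x)) == max(x, 0) + log1p(exp(-|x|)),
    // which avoids overflow for large positive x and loss of precision for large negative x.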
const float xf = (float) x;
const float ax = sycl::fabs(xf);
const float m = sycl::fmax(xf, 0.0f);
const float y = m + sycl::log1p(sycl::exp(-ax));
return (T) y;
}
template<typename T>
static __dpct_inline__ T op_neg(T x) {
return -x;
@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
});
}
static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_softplus(x);
});
}
static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
return op_neg(x);
@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_log(ctx, dst);
}
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_softplus(ctx, dst);
}
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_neg(ctx, dst);

View File

@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

View File

@ -2263,6 +2263,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
}
static void tri_f32_sycl(
const float * src,
float * dst,
const int64_t ne0,
const int64_t ne1,
const int64_t ne2,
const int64_t ne3,
const ggml_tri_type ttype,
dpct::queue_ptr main_stream
) {
const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
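    // One work-item per element: decompose the flat index into coordinates along
    // dims 0 and 1 and zero out everything outside the requested triangle.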
main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
const int64_t idx = (int64_t) tid[0];
const int64_t i0 = idx % ne0;
const int64_t t1 = idx / ne0;
const int64_t i1 = t1 % ne1;
bool keep = false;
switch (ttype) {
case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break;
case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break;
case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
default: keep = false; break;
}
dst[idx] = keep ? src[idx] : 0.0f;
});
}
static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
dpct::queue_ptr main_stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
const float * src0_dd = static_cast<const float *>(src0->data);
float * dst_dd = static_cast<float *>(dst->data);
const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
const int64_t ne0 = src0->ne[0];
const int64_t ne1 = src0->ne[1];
const int64_t ne2 = src0->ne[2];
const int64_t ne3 = src0->ne[3];
tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream);
}
inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -3786,6 +3845,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_UNARY_OP_EXP:
ggml_sycl_exp(ctx, dst);
break;
case GGML_UNARY_OP_SOFTPLUS:
ggml_sycl_softplus(ctx, dst);
break;
case GGML_UNARY_OP_SGN:
ggml_sycl_sgn(ctx, dst);
break;
@ -3912,6 +3974,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_TRANSPOSE:
GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
break;
case GGML_OP_TRI:
ggml_sycl_op_tri(ctx, dst);
break;
case GGML_OP_DIAG_MASK_INF:
ggml_sycl_diag_mask_inf(ctx, dst);
break;
@ -4404,6 +4469,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_ELU:
return true;
case GGML_UNARY_OP_FLOOR:
@ -4616,6 +4682,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
case GGML_OP_CONT:
return op->src[0]->type != GGML_TYPE_BF16;
case GGML_OP_TRI:
{
const ggml_tensor * src0 = op->src[0];
return src0 &&
op->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0);
}
case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX:

View File

@ -11956,7 +11956,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
}
}
if (mmq) {
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it);
}
ggml_pipeline_allocate_descriptor_sets(ctx);

View File

@ -114,7 +114,7 @@ struct Params {
#define PARAMS_BINDING 4
#endif
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<f32>;
@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
// Just a very small float value.
@ -160,14 +160,21 @@ fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 {
return v;
}
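// Helpers for vectorized loads: the caller passes a scalar element index that
// must be a multiple of 4; the helpers return the corresponding vec4 element.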
fn load_f32x4(buf: ptr<storage, array<vec4<f32>>, read_write>, scalar_index: u32) -> vec4<f32> {
return (*buf)[scalar_index >> 2u];
}
fn load_kvx4(buf: ptr<storage, array<vec4<KV_TYPE>>, read_write>, scalar_index: u32) -> vec4<KV_TYPE> {
return (*buf)[scalar_index >> 2u];
}
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
@builtin(local_invocation_id) local_id: vec3<u32>,
@builtin(subgroup_id) subgroup_id: u32,
@builtin(subgroup_size) subgroup_size: u32,
@builtin(num_subgroups) num_subgroups: u32,
@builtin(subgroup_invocation_id) sg_inv_id: u32) {
// initialize row max for online softmax
for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
@ -231,9 +238,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
// clear inter_shmem to ensure zero-initialized accumulators
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
inter_shmem[elem_idx] = 0.0;
}
for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
inter_shmem[elem_idx] = 0.0;
}
// load k tile into shared memory
#if defined(KV_Q4_0)
@ -309,48 +316,77 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
// accumulate q block * k block into registers across the entire KV tile
// TODO: this loop seems to be the current largest bottleneck
for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
let inter_offset = kv_block * SG_MAT_N;
var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<
subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
// this bracket exists to scope the lifetime of variables, reducing register pressure
{
#ifdef KV_DIRECT
let k_block_row = kv_tile + kv_block * SG_MAT_N;
let k_global_offset = k_head_offset + k_block_row * params.stride_k1;
let k_block_row = kv_tile + subgroup_id * SG_MAT_N;
var k_global_offset = k_head_offset + k_block_row * params.stride_k1;
#else
let k_block_offset = kv_block * SG_MAT_N * HEAD_DIM_QK;
var k_block_offset = subgroup_id * SG_MAT_N * HEAD_DIM_QK;
#endif
for (var head_dim_block = 0u; head_dim_block < HEAD_DIM_QK; head_dim_block += SG_MAT_K) {
// load q submatrix from shared memory
var q_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
&q_shmem,
head_dim_block,
false,
HEAD_DIM_QK
);
for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
let inter_offset = kv_block * SG_MAT_N;
var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
var q_cur = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, 0u, false, HEAD_DIM_QK);
// load k submatrix from device or shared memory
#ifdef KV_DIRECT
var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
&K,
k_global_offset + head_dim_block,
true,
params.stride_k1
);
var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + 0u, true, params.stride_k1);
#else
var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
&kv_shmem,
k_block_offset + head_dim_block,
true,
HEAD_DIM_QK
);
var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + 0u, true, HEAD_DIM_QK);
#endif
acc = subgroupMatrixMultiplyAccumulate(q_sg_mat, k_sg_mat, acc);
var t: u32 = 1u;
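            // Software-pipelined K loop: the loads for the next SG_MAT_K slice are
            // issued before the multiply-accumulate of the current slice so memory
            // latency can overlap with the matrix ops; the tail below drains the pipeline.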
for (; t + 1u < HEAD_DIM_QK / SG_MAT_K; t += 2u) {
let h0 = t * SG_MAT_K;
var q0 = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h0, false, HEAD_DIM_QK);
#ifdef KV_DIRECT
var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h0, true, params.stride_k1);
#else
var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h0, true, HEAD_DIM_QK);
#endif
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
q_cur = q0;
k_cur = k0;
let h1 = (t + 1u) * SG_MAT_K;
var q1g = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h1, false, HEAD_DIM_QK);
#ifdef KV_DIRECT
var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h1, true, params.stride_k1);
#else
var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h1, true, HEAD_DIM_QK);
#endif
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
q_cur = q1g;
k_cur = k1g;
}
// handle odd tail
if (t < HEAD_DIM_QK / SG_MAT_K) {
let h = t * SG_MAT_K;
var qn = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h, false, HEAD_DIM_QK);
#ifdef KV_DIRECT
var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h, true, params.stride_k1);
#else
var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h, true, HEAD_DIM_QK);
#endif
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
q_cur = qn;
k_cur = kn;
}
acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
#ifdef KV_DIRECT
k_global_offset += num_subgroups * SG_MAT_N * params.stride_k1;
#else
k_block_offset += num_subgroups * SG_MAT_N * HEAD_DIM_QK;
#endif
subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
}
// store acc to shared memory for softmax (S matrix from paper)
subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
}
#ifdef MASK
// load mask tile into shared memory for this KV block
// TODO: optimize and skip if mask is -INF for the entire tile
@ -495,7 +531,6 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
false,
HEAD_DIM_V
);
for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) {
let p_offset = kv_block * SG_MAT_N;
var p_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
@ -527,11 +562,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
// O += P * V
o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat);
}
// store O back to shared memory
subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V);
}
workgroupBarrier();
}
@ -566,26 +599,38 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
o_shmem[idx] = f16(val);
}
}
workgroupBarrier();
#endif
// write output back to global memory
for (var q_tile_row = subgroup_id;
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) {
break;
}
q_tile_row < Q_TILE;
q_tile_row += num_subgroups) {
let exp_sum = exp_sum_shmem[q_tile_row];
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0);
let global_q_row = q_row_start + q_tile_row;
if (global_q_row >= params.seq_len_q) { break; }
for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
let o_val = o_shmem[q_tile_row * HEAD_DIM_V + elem_idx];
let scaled = f32(o_val) * scale;
dst[dst_global_offset + q_tile_row * dst2_stride + elem_idx] = scaled;
}
let exp_sum = exp_sum_shmem[q_tile_row];
let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
let row_base: u32 = dst_global_offset + q_tile_row * dst2_stride;
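        // Vectorized store: the >> 2u indexing presumes dst_global_offset and
        // dst2_stride are multiples of 4 so each vec4 write stays aligned with the
        // scalar layout of dst (assumption based on HEAD_DIM_V being a multiple of 4).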
for (var elem_base = sg_inv_id * 4u;
elem_base < HEAD_DIM_V;
elem_base += subgroup_size * 4u) {
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
let v = vec4<f32>(
f32(o_shmem[i0]) * scale,
f32(o_shmem[i1]) * scale,
f32(o_shmem[i2]) * scale,
f32(o_shmem[i3]) * scale
);
let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
dst[dst_vec_index] = v;
}
}
}

View File

@ -0,0 +1,40 @@
#!/usr/bin/env pwsh
# Basedir on device
$basedir=".\pkg-snapdragon"
$cli_opts=$args
$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
if ($null -ne $env:M) {
$model=$env:M
}
$device="HTP0"
if ($null -ne $env:D) {
$device=$env:D
}
if ($null -ne $env:V) {
$env:GGML_HEXAGON_VERBOSE=$env:V
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
}
if ($null -ne $env:NHVX) {
$env:GGML_HEXAGON_NHVX=$env:NHVX
}
if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}
$env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\llama-bench.exe" `
--mmap 0 -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--batch-size 128 -ngl 99 --device $device $cli_opts

View File

@ -0,0 +1,53 @@
#!/usr/bin/env pwsh
# Basedir on device
$basedir=".\pkg-snapdragon"
$cli_opts=$args
$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
if ($null -ne $env:M) {
$model=$env:M
}
$device="HTP0"
if ($null -ne $env:D) {
$device=$env:D
}
if ($null -ne $env:V) {
$env:GGML_HEXAGON_VERBOSE=$env:V
}
if ($null -ne $env:E) {
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
}
if ($null -ne $env:SCHED) {
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
}
if ($null -ne $env:NHVX) {
$env:GGML_HEXAGON_NHVX=$env:NHVX
}
if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}
$env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\llama-completion.exe" `
--no-mmap -no-cnv -m $basedir\..\..\gguf\$model `
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on `
-ngl 99 --device $device $cli_opts

View File

@ -0,0 +1,56 @@
#!/usr/bin/env pwsh
# Basedir on device
$basedir=".\pkg-snapdragon"
if ($args.Count -eq 0) {
Write-Host "No arguments provided.Expected the tool and argument to run."
exit -1
}
$tool=$args[0]
$cli_opts=@()
if ($args.Count -gt 1) {
$cli_opts=$args[1..($args.Count - 1)]
$remainingArgs = $args[1..($args.Count - 1)]
}
$device="HTP0"
if ($null -ne $env:D) {
$device=$env:D
}
if ($null -ne $env:V) {
$env:GGML_HEXAGON_VERBOSE=$env:V
}
if ($null -ne $env:E) {
$env:GGML_HEXAGON_EXPERIMENTAL=$env:E
}
if ($null -ne $env:SCHED) {
$env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
}
if ($null -ne $env:NHVX) {
$env:GGML_HEXAGON_NHVX=$env:NHVX
}
if ($null -ne $env:NDEV) {
$env:GGML_HEXAGON_NDEV=$env:NDEV
}
$env:ADSP_LIBRARY_PATH="$basedir\lib"
& "$basedir\bin\$tool" `
$cli_opts

View File

@ -0,0 +1,105 @@
# Running as Administrator is NOT strictly necessary for User-scope env vars,
# but it is recommended for creating directories in the C:\ root if permissions are restricted.
$ErrorActionPreference = "Stop"
# --- Configuration ---
$BaseDir = "C:\Qualcomm"
# SDK 1: Hexagon
$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
$HexagonParent = Join-Path $BaseDir "Hexagon_SDK"
$HexagonSdkVersion = "6.4.0.2"
$HexagonToolsVersion = "19.0.04"
$HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion
$HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion"
# SDK 2: OpenCL
$OpenCLUrl = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz"
$OpenCLParent = Join-Path $BaseDir "OpenCL_SDK"
$OpenCLVersion = "2.3.2"
$OpenCLTarget = Join-Path $OpenCLParent $OpenCLVersion
# --- Helper Function ---
function Install-QualcommSDK {
param (
[string]$Url,
[string]$ParentDir,
[string]$TargetDir,
[string]$Name
)
# 1. Create Parent Directory
if (-not (Test-Path -Path $ParentDir)) {
Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan
New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null
}
# 2. Check for Specific Version Directory
if (Test-Path -Path $TargetDir) {
Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green
}
else {
Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow
# Create the target directory to extract into
New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null
# Define temporary archive path
$TempFile = Join-Path $ParentDir "temp_sdk.tar.xz"
try {
# Download
Write-Host "Downloading from: $Url"
Invoke-WebRequest -Uri $Url -OutFile $TempFile
# Untar
# Note: We assume Windows includes tar.exe (Win 10 build 17063+)
Write-Host "Extracting archive to $TargetDir..."
# We use -C to extract contents INTO the target directory created above
tar -xJvf $TempFile -C $TargetDir\..
Write-Host "Extraction complete." -ForegroundColor Green
}
catch {
Write-Error "Failed to download or extract $Name. Error: $_"
# Cleanup target dir if failed so script tries again next time
Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue
}
finally {
# Cleanup Archive
if (Test-Path $TempFile) { Remove-Item $TempFile -Force }
}
}
}
# --- Execution ---
# 1. Ensure Base C:\Qualcomm exists
if (-not (Test-Path $BaseDir)) {
New-Item -Path $BaseDir -ItemType Directory -Force | Out-Null
}
# 2. Run Install Logic
Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK"
Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK"
# --- Environment Variables ---
Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan
# Set OPENCL_SDK_ROOT
[System.Environment]::SetEnvironmentVariable('OPENCL_SDK_ROOT', $OpenCLTarget, [System.EnvironmentVariableTarget]::User)
$env:OPENCL_SDK_ROOT = $OpenCLTarget # Set for current session as well
Write-Host "OPENCL_SDK_ROOT set to: $OpenCLTarget"
# Set HEXAGON_SDK_ROOT
[System.Environment]::SetEnvironmentVariable('HEXAGON_SDK_ROOT', $HexagonSdkTarget, [System.EnvironmentVariableTarget]::User)
$env:HEXAGON_SDK_ROOT = $HexagonSdkTarget # Set for current session as well
Write-Host "HEXAGON_SDK_ROOT set to: $HexagonSdkTarget"
# Set HEXAGON_TOOLS_ROOT
[System.Environment]::SetEnvironmentVariable('HEXAGON_TOOLS_ROOT', $HexagonToolsTarget, [System.EnvironmentVariableTarget]::User)
$env:HEXAGON_TOOLS_ROOT = $HexagonToolsTarget # Set for current session as well
Write-Host "HEXAGON_TOOLS_ROOT set to: $HexagonToolsTarget"

View File

@ -1 +1 @@
ebc3a0f4a56be1c9424a89fbec09962ac34fde85
a8db410a252c8c8f2d120c6f2e7133ebe032f35d

View File

@ -1772,8 +1772,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
io.write(&v_trans, sizeof(v_trans));
io.write(&n_layer, sizeof(n_layer));
std::vector<uint8_t> tmp_buf;
// Iterate and write all the keys first, each row is a cell
// Get whole range at a time
for (const auto & layer : layers) {
@ -1791,7 +1789,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
io.write(&k_size_row, sizeof(k_size_row));
// Read each range of cells of k_size length each into tmp_buf and write out
// Read each range of cells of k_size length and write out
for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * k_size_row;
@ -1818,7 +1816,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
io.write(&v_size_row, sizeof(v_size_row));
// Read each range of cells of v_size length each into tmp_buf and write out
// Read each range of cells of v_size length and write out
for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * v_size_row;
@ -1852,7 +1850,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
// For each row, we get the element values of each cell
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
// Read each range of cells of v_size_el length each into tmp_buf and write out
// Read each range of cells of v_size_el length and write out
for (const auto & range : cr.data) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;

View File

@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
io.write(&s_trans, sizeof(s_trans));
io.write(&n_layer, sizeof(n_layer));
std::vector<uint8_t> tmp_buf;
// Iterate and write all the keys first, each row is a cell
// Iterate and write all the R tensors first, each row is a cell
// Get whole range at a time
for (uint32_t il = 0; il < n_layer; ++il) {
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
if (r_l[il] == nullptr) continue;
// Write key type
// Write R tensor type
const int32_t r_type_i = (int32_t)r_l[il]->type;
io.write(&r_type_i, sizeof(r_type_i));
// Write row size of key
// Write row size of R tensor
const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
io.write(&r_size_row, sizeof(r_size_row));
// Read each range of cells of k_size length each into tmp_buf and write out
// Write each range of cells of r_size_row length
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * r_size_row;
@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
if (s_l[il] == nullptr) continue;
// Write value type
// Write S tensor type
const int32_t s_type_i = (int32_t)s_l[il]->type;
io.write(&s_type_i, sizeof(s_type_i));
// Write row size of value
// Write row size of S tensor
const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
io.write(&s_size_row, sizeof(s_size_row));
// Read each range of cells of s_size length each into tmp_buf and write out
// Write each range of S tensor rows
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t buf_size = range_size * s_size_row;
@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
}
}
} else {
// When v is transposed, we also need the element size and get the element ranges from each row
// When S tensor is transposed, we also need the element size and get the element ranges from each row
const uint32_t mem_size = size;
for (uint32_t il = 0; il < n_layer; ++il) {
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
const uint32_t n_embd_s = hparams.n_embd_s();
// Write value type
// Write S tensor type
const int32_t s_type_i = (int32_t)s_l[il]->type;
io.write(&s_type_i, sizeof(s_type_i));
@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
// For each row, we get the element values of each cell
for (uint32_t j = 0; j < n_embd_s; ++j) {
// Read each range of cells of v_size_el length each into tmp_buf and write out
// Write each range of cells of s_size_el length
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * mem_size) * s_size_el;
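For the transposed S tensor the writer walks one element row j at a time and derives the source offset from each cell range. A companion sketch of that offset arithmetic, reusing the includes and hypothetical write_bytes callback from the previous sketch:
// Minimal sketch of the transposed path: mem_size is the total cell count,
// n_embd_s the elements per cell, s_size_el the bytes per element. The src_offset
// expression matches the one shown in the diff above.
static void write_cell_ranges_transposed(const std::vector<std::pair<size_t, size_t>> & ranges,
                                         size_t n_embd_s, size_t mem_size, size_t s_size_el,
                                         const std::function<void(size_t, size_t)> & write_bytes) {
    for (size_t j = 0; j < n_embd_s; ++j) {                                     // one pass per element row
        for (const auto & range : ranges) {
            const size_t range_size = range.second - range.first;               // cells in this range
            const size_t src_offset = (range.first + j * mem_size) * s_size_el; // offset into transposed layout
            write_bytes(src_offset, range_size * s_size_el);
        }
    }
}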

View File

@ -8233,11 +8233,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
if (!mask && max_bias > 0.0f) continue;
for (float logit_softcap : {0.0f, 10.0f}) {
if (hsk != 128 && logit_softcap != 0.0f) continue;
for (int nh : { 4, }) {
for (int nh : { 1, 4 }) {
if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash
for (int nr3 : { 1, 3, }) {
if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
for (int nr2 : { 1, 4, 12 }) {
for (int nr2 : { 1, 4, 12, 20 }) {
if (nr2 == 12 && hsk != 128) continue;
if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
for (int kv : { 113, 512, 1024, }) {
if (nr2 != 1 && kv != 512) continue;

View File

@ -1005,6 +1005,8 @@ struct clip_model_loader {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 6) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 100045) {
hparams.minicpmv_query_num = 64;
} else {
hparams.minicpmv_query_num = 96;
}
@ -3209,6 +3211,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} else if (params.minicpmv_version == 6) {
// MiniCPM-V 4.5
n_patches = 64;
} else if (params.minicpmv_version == 100045) {
// MiniCPM-o 4.5
n_patches = 64;
} else {
GGML_ABORT("Unknown minicpmv version");
}

View File

@ -501,7 +501,7 @@ default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2)
# with proper
args = ap.parse_args()
@ -610,6 +610,9 @@ else:
elif minicpmv_version == 6:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 100045:
emb_dim = 4096
block_count = 27
default_vision_config = {
"hidden_size": 1152,
@ -637,6 +640,10 @@ elif minicpmv_version == 6:
default_vision_config["model_type"] = "siglip_vision_model"
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 100045:
default_vision_config["model_type"] = "siglip_vision_model"
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:

View File

@ -236,7 +236,7 @@ struct mtmd_context {
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
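The MiniCPM-V version list in this check keeps growing, and clip.cpp repeats similar per-version branches above. A hedged refactoring sketch; the helper name is hypothetical and not introduced by this patch:
// Hypothetical helper, not part of the patch: collects the MiniCPM-V versions that share
// the 2.6-style slice layout, so additions such as 100045 only need to be made in one place.
static bool minicpmv_uses_2_6_layout(int version) {
    return version == 3 || version == 4 || version == 5 || version == 6 || version == 100045;
}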

View File

@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@ -131,6 +131,8 @@ static void usage(const char * executable) {
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n");
printf(" --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
printf(" Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
printf(" Advanced option to remove all tensors from the given layers\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
return true;
}
static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) {
std::ifstream file(filename);
if (!file) {
printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno));
return false;
}
std::string arg;
while (file >> arg) {
if (!parse_tensor_type(arg.c_str(), tensor_type)) {
return false;
}
}
return true;
}
static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
if (!data) {
printf("\n%s: no layer pruning ids provided\n\n", __func__);
@ -480,6 +499,10 @@ int main(int argc, char ** argv) {
if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
usage(argv[0]);
@ -686,3 +709,4 @@ int main(int argc, char ** argv) {
return 0;
}
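Per the new help text, --tensor-type-file expects whitespace-separated tensor_name=ggml_type entries, parsed with the same rules as --tensor-type. An illustrative tensor_type_list.txt (the attn_q=q8_0 entry mirrors the existing --tensor-type example; the other tensor names and types are placeholders):
attn_q=q8_0
attn_k=q8_0
ffn_down=q4_0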

View File

@ -155,7 +155,7 @@ struct server_slot {
double t_prompt_processing; // ms
double t_token_generation; // ms
std::function<void(int /* slot_id */)> callback_on_release;
std::function<void(int /* id_slot */)> callback_on_release;
// Speculative decoding stats
int32_t n_draft_total = 0; // Total draft tokens generated
@ -705,6 +705,11 @@ private:
params_base.n_cache_reuse = 0;
SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
}
if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
}
}
if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@ -754,16 +759,16 @@ private:
SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
return false;
}
SRV_WRN("%s", "speculative decoding context initialized\n");
SLT_INF(slot, "%s", "speculative decoding context initialized\n");
} else {
SRV_WRN("%s", "speculative decoding context not initialized\n");
SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
}
}
SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
slot.callback_on_release = [this](int slot_id) {
queue_tasks.pop_deferred_task(slot_id);
slot.callback_on_release = [this](int id_slot) {
queue_tasks.pop_deferred_task(id_slot);
};
slot.reset();
@ -891,6 +896,9 @@ private:
}
server_slot * get_slot_by_id(int id_slot) {
// note: allow id_slot to be out of bounds (wrap around)
id_slot = id_slot % slots.size();
for (server_slot & slot : slots) {
if (slot.id == id_slot) {
return &slot;
@ -1760,7 +1768,7 @@ private:
break;
}
int id_slot = task.slot_action.slot_id;
const int id_slot = task.slot_action.id_slot;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1798,7 +1806,7 @@ private:
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
if (!check_no_mtmd(task.id)) break;
int id_slot = task.slot_action.slot_id;
const int id_slot = task.slot_action.id_slot;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1847,7 +1855,7 @@ private:
if (!check_no_mtmd(task.id)) {
break;
}
int id_slot = task.slot_action.slot_id;
const int id_slot = task.slot_action.id_slot;
server_slot * slot = get_slot_by_id(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -3312,7 +3320,7 @@ void server_routes::init_routes() {
}
// TODO: get rid of this dynamic_cast
auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
auto * res_task = dynamic_cast<server_task_result_metrics*>(result.get());
GGML_ASSERT(res_task != nullptr);
// optionally return "fail_on_no_slot" error
@ -3335,8 +3343,8 @@ void server_routes::init_routes() {
}
std::string id_slot_str = req.get_param("id_slot");
int id_slot;
try {
id_slot = std::stoi(id_slot_str);
} catch (const std::exception &) {
@ -3348,14 +3356,16 @@ void server_routes::init_routes() {
if (action == "save") {
return handle_slots_save(req, id_slot);
} else if (action == "restore") {
return handle_slots_restore(req, id_slot);
} else if (action == "erase") {
return handle_slots_erase(req, id_slot);
} else {
res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
return res;
}
if (action == "restore") {
return handle_slots_restore(req, id_slot);
}
if (action == "erase") {
return handle_slots_erase(req, id_slot);
}
res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
return res;
};
this->get_props = [this](const server_http_req &) {
@ -3898,7 +3908,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
{
server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
task.id = rd.get_new_id();
task.slot_action.slot_id = id_slot;
task.slot_action.id_slot = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
rd.post_task(std::move(task));
@ -3934,7 +3944,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
{
server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
task.id = rd.get_new_id();
task.slot_action.slot_id = id_slot;
task.slot_action.id_slot = id_slot;
task.slot_action.filename = filename;
task.slot_action.filepath = filepath;
rd.post_task(std::move(task));
@ -3963,7 +3973,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
{
server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
task.id = rd.get_new_id();
task.slot_action.slot_id = id_slot;
task.slot_action.id_slot = id_slot;
rd.post_task(std::move(task));
}
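The new wrap-around in get_slot_by_id reduces an out-of-range id modulo the slot count, so with 4 slots an id_slot of 6 resolves to the slot with id 2. A trivial standalone sketch of that reduction (hypothetical function, assuming non-negative ids as the server code does):
// Hypothetical standalone version of the wrap-around used in get_slot_by_id above.
// Assumes id_slot >= 0; with n_slots == 4, id_slot == 6 yields 2.
static int wrap_slot_id(int id_slot, size_t n_slots) {
    return static_cast<int>(static_cast<size_t>(id_slot) % n_slots);
}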

View File

@ -153,7 +153,7 @@ struct server_task {
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
struct slot_action {
int slot_id;
int id_slot;
std::string filename;
std::string filepath;
};