diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 551bdd3df0..f738edefc4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -21,7 +21,8 @@ on: '**/*.m', '**/*.metal', '**/*.comp', - '**/*.glsl' + '**/*.glsl', + '**/*.wgsl' ] pull_request: @@ -42,7 +43,8 @@ on: '**/*.m', '**/*.metal', '**/*.comp', - '**/*.glsl' + '**/*.glsl', + '**/*.wgsl' ] concurrency: @@ -1371,7 +1373,7 @@ jobs: id: update_presets if: ${{ matrix.build == 'arm64-snapdragon' }} run: | - cp docs/backend/hexagon/CMakeUserPresets.json . + cp docs/backend/snapdragon/CMakeUserPresets.json . - name: Build id: ndk_build diff --git a/README.md b/README.md index 0783e43e5c..dac020ad37 100644 --- a/README.md +++ b/README.md @@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) - [LARS](https://github.com/abgulati/LARS) (AGPL) - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0) - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LMStudio](https://lmstudio.ai/) (proprietary) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3bc7bc6210..295ae9ea25 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -75,6 +75,8 @@ add_library(${TARGET} STATIC ngram-cache.h ngram-map.cpp ngram-map.h + ngram-mod.cpp + ngram-mod.h peg-parser.cpp peg-parser.h preset.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 218418f070..5fbc9022c0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]", + {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", common_speculative_type_to_str(params.speculative.type).c_str()), [](common_params & params, const std::string & value) { @@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K; } else if (value == "ngram-map-k4v") { params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V; + } else if (value == "ngram-mod") { + params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD; } else { throw std::invalid_argument("unknown speculative decoding type without draft model"); } diff --git a/common/common.h b/common/common.h index fd3ab8cd18..398ebb0960 100644 --- a/common/common.h +++ b/common/common.h @@ -171,6 +171,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, // self-speculative decoding with 3-level n-gram cache COMMON_SPECULATIVE_TYPE_COUNT // number of types, unknown type }; @@ -252,6 +253,8 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; +struct common_ngram_mod; + struct common_params_speculative { common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding @@ -269,6 +272,8 @@ struct common_params_speculative { uint16_t ngram_check_rate = 1; // check rate for ngram lookup uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed + std::shared_ptr ngram_mod; + std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT diff --git a/common/jinja/value.h b/common/jinja/value.h index a2f92d2c69..1c04760a08 100644 --- a/common/jinja/value.h +++ b/common/jinja/value.h @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace jinja { diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 930e7a3c10..84fd761367 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -7,6 +7,21 @@ #include #include +// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. +static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { + std::ostringstream oss; + oss << '['; + for (size_t i = 0; i < length; ++i) { + if (i > 0) { + oss << ", "; + } + oss << inp[start + i]; + } + oss << ']'; + return oss.str(); +} + + // n-gram simple // @@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft( // maximum number of counted values of a ngram map value. #define COMMON_NGRAM_MAX_VALUE_COUNT 16380 -static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length); - void common_ngram_map_draft(common_ngram_map & map, const llama_tokens & inp, llama_token sampled, llama_tokens & draft) { @@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) { n_accepted, curr_value.n_accepted); curr_value.n_accepted = n_accepted; } - -// Helper functions. -// - -// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...]. -std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) { - std::ostringstream oss; - oss << '['; - for (size_t i = 0; i < length; ++i) { - if (i > 0) { - oss << ", "; - } - oss << inp[start + i]; - } - oss << ']'; - return oss.str(); -} - diff --git a/common/ngram-map.h b/common/ngram-map.h index bf91883f0c..b365034ac5 100644 --- a/common/ngram-map.h +++ b/common/ngram-map.h @@ -11,6 +11,7 @@ // #include "llama.h" +#include "common.h" #include diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp new file mode 100644 index 0000000000..76f7257f61 --- /dev/null +++ b/common/ngram-mod.cpp @@ -0,0 +1,60 @@ +#include "ngram-mod.h" + +// +// common_ngram_mod +// + +common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) { + entries.resize(size); + + reset(); +} + +size_t common_ngram_mod::idx(const entry_t * tokens) const { + size_t res = 0; + + for (size_t i = 0; i < n; ++i) { + res = res*6364136223846793005ULL + tokens[i]; + } + + res = res % entries.size(); + + return res; +} + +void common_ngram_mod::add(const entry_t * tokens) { + const size_t i = idx(tokens); + + if (entries[i] == EMPTY) { + used++; + } + + entries[i] = tokens[n]; +} + +common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const { + const size_t i = idx(tokens); + + return entries[i]; +} + +void common_ngram_mod::reset() { + std::fill(entries.begin(), entries.end(), EMPTY); + used = 0; +} + +size_t common_ngram_mod::get_n() const { + return n; +} + +size_t common_ngram_mod::get_used() const { + return used; +} + +size_t common_ngram_mod::size() const { + return entries.size(); +} + +size_t common_ngram_mod::size_bytes() const { + return entries.size() * sizeof(entries[0]); +} diff --git a/common/ngram-mod.h b/common/ngram-mod.h new file mode 100644 index 0000000000..7af92e9dde --- /dev/null +++ b/common/ngram-mod.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include +#include + +// +// common_ngram_mod +// ref: https://github.com/ggml-org/llama.cpp/pull/19164 +// + +// basic n-gram hasher +struct common_ngram_mod { + using entry_t = int32_t; + + static constexpr entry_t EMPTY = -1; + + common_ngram_mod(uint16_t n, size_t size); + + size_t idx(const entry_t * tokens) const; + void add(const entry_t * tokens); + entry_t get(const entry_t * tokens) const; // return -1 if not found + + void reset(); + + size_t get_n() const; + size_t get_used() const; + + size_t size() const; + size_t size_bytes() const; + +private: + size_t n; // ngram size to hash + + size_t used; + + std::vector entries; +}; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3f314b5d57..a1a3b51c13 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -6,6 +6,7 @@ #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" +#include "ngram-mod.h" #include "sampling.h" #include @@ -23,6 +24,7 @@ const std::vector common_speculative_types = { COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, + COMMON_SPECULATIVE_TYPE_NGRAM_MOD, COMMON_SPECULATIVE_TYPE_NGRAM_CACHE }; @@ -33,6 +35,7 @@ const std::map common_speculative_typ {"ngram_simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram_map_k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, + {"ngram_mod", COMMON_SPECULATIVE_TYPE_NGRAM_MOD}, {"ngram_cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; @@ -110,6 +113,8 @@ static bool common_speculative_are_compatible( struct common_speculative_state { const enum common_speculative_type type; + // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens + // TODO: add n_call_begin, n_call_accept size_t drafts_call_count = 0; // number of times this implementation was called. size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation. size_t drafts_accepted_count = 0; // number of times a draft or part was accepted by the target model. @@ -119,6 +124,8 @@ struct common_speculative_state { // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. + // TODO: rename to t_draft_us + // TODO: add t_begin_us, t_accept_us int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds. common_speculative_state(enum common_speculative_type type) : type(type) {} @@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state { } }; +struct common_speculative_state_ngram_mod : public common_speculative_state { + common_ngram_mod & mod; + + // the last position in the prompt that was added to the ngram container + size_t i_last = 0; + + // length of the last drafted n‑gram (number of tokens returned by draft) + size_t n_draft_last = 0; + + // consecutive accept rounds with low acceptance fraction (< 0.5) + int n_low = 0; + + // enable trace logging if LLAMA_TRACE is set + const bool verbose; + + common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod) + : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) { + static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); + } + + void begin(const llama_tokens & prompt) override { + i_last = 0; + + n_draft_last = 0; + + const size_t n = mod.get_n(); + + if (prompt.size() < n) { + return; + } + + for (size_t i = 0; i < prompt.size() - n; ++i) { + mod.add(prompt.data() + i); + } + + i_last = prompt.size() - n; + + const double f = (double)mod.get_used() / (double)mod.size(); + LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f); + + constexpr double f_thold = 0.25; + if (f > f_thold) { + LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold); + + mod.reset(); + } + } + + void draft( + const common_params_speculative & params, + const llama_tokens & prompt_tgt, + llama_token id_last, + llama_tokens & result) override { + GGML_UNUSED(params); + + n_draft_last = 0; + + const size_t cur_len = prompt_tgt.size(); + if (cur_len < mod.get_n()) { + return; + } + + const size_t n = mod.get_n(); + + // add new ngrams in chunks + if (i_last + 32 < cur_len) { + for (size_t i = i_last; i < cur_len - n; ++i) { + mod.add(prompt_tgt.data() + i); + } + + i_last = cur_len - n; + } + + result.resize(n + params.n_max); + for (size_t i = 0; i < n - 1; ++i) { + result[i] = prompt_tgt[cur_len - n + 1 + i]; + } + result[n - 1] = id_last; + + for (int i = 0; i < params.n_max; ++i) { + const llama_token token = mod.get(result.data() + i); + if (token == common_ngram_mod::EMPTY) { + if (i < params.n_min) { + result.clear(); + return; + } + + result.resize(n + i); + break; + } + result[n + i] = token; + } + + // only return the m tokens that were drafted + for (size_t i = 0; n + i < result.size(); ++i) { + result[i] = result[n + i]; + } + result.resize(result.size() - n); + + // store length of drafted n‑gram for later acceptance analysis + n_draft_last = result.size(); + } + + void accept(uint16_t n_accepted) override { + if (verbose) { + LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last); + } + + // compute acceptance fraction if we have a recorded draft length + if (n_draft_last > 0) { + const double f_acc = (double)n_accepted / (double)n_draft_last; + if (f_acc < 0.5) { + n_low++; + if (n_low >= 3) { + LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + + mod.reset(); + n_low = 0; + } + } else { + n_low = 0; + } + } + } +}; + struct common_speculative_state_ngram_cache : public common_speculative_state { uint16_t n_draft; bool save_dynamic; @@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram_simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram_map_k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v"; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: return "ngram_mod"; case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: return "ngram_cache"; default: return "unknown"; } @@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // initialization of the speculative decoding system // common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt) { + common_params_speculative & params, + llama_context * ctx_tgt) { llama_context * ctx_dft = nullptr; if (params.model_dft) { ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft); @@ -687,6 +821,7 @@ common_speculative * common_speculative_init( bool has_ngram_simple = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE); bool has_ngram_map_k = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K); bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + bool has_ngram_mod = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD); // In a more complex implementation we could use the same implementation but with different parameters. // This was initially used in PR-18471 but removed to simplify the code. @@ -701,6 +836,22 @@ common_speculative * common_speculative_init( // This implementation can guess tokens with high acceptance rate but is more expensive. configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params)); } + if (has_ngram_mod) { + // shared instance for all speculative decoding contexts + if (!params.ngram_mod) { + params.ngram_mod = std::make_shared(params.ngram_size_n, 4*1024*1024); + + LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__, + params.ngram_size_n, params.ngram_mod->size(), + (float)(params.ngram_mod->size_bytes())/1024/1024); + + if (params.ngram_size_n < 16) { + LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n); + } + } + + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params)); + } if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } @@ -758,6 +909,11 @@ common_speculative * common_speculative_init( )); break; } + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { + GGML_ASSERT(config.params.ngram_mod); + impls.push_back(std::make_unique(config.type, *config.params.ngram_mod)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: { auto state = create_state_ngram_cache( params.lookup_cache_static, params.lookup_cache_dynamic, config); @@ -822,8 +978,7 @@ llama_tokens common_speculative_draft( if (!result.empty()) { LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__, - common_speculative_type_to_str(impl.get()->type).c_str(), - prompt_tgt.size(), + common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(), impl.get()->drafts_call_count, result.size()); spec->curr_impl = impl.get(); // set current implementation for stats @@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } + // TODO: report time for begin() and accept() LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->drafts_call_count, diff --git a/common/speculative.h b/common/speculative.h index 9e1888e4be..76fe6bb7bc 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string std::string common_speculative_type_to_str(enum common_speculative_type type); common_speculative * common_speculative_init( - const common_params_speculative & params, - llama_context * ctx_tgt); + common_params_speculative & params, + llama_context * ctx_tgt); void common_speculative_free(common_speculative * spec); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a391717e32..eb43520f98 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel): gate, up = data_torch.split(ffn_dim, dim=-2) yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) + return has_experts = bool(self.hparams.get('num_local_experts')) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index bcb3ce6743..c0a422b3dc 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -35,9 +35,9 @@ The following releases are verified and recommended: |Commit ID|Tag|Release|Verified Platform| Update date| |-|-|-|-|-| -|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1
LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15| -|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| -|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| +|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |Arc B580/Linux/oneAPI 2025.1
LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15| +|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| +|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| ## News @@ -51,7 +51,7 @@ The following releases are verified and recommended: |-|-|-|-| |PVC 1550|39|73|+87%| |Flex 170|39|50|+28%| - |Arc770|42|55|+30%| + |Arc A770|42|55|+30%| |MTL|13|16|+23%| |ARL-H|14|17|+21%| @@ -62,7 +62,7 @@ The following releases are verified and recommended: - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs. - 2024.5 - - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770. + - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc A770. - Arch Linux is verified successfully. - 2024.4 @@ -111,7 +111,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the |-------------------------------|---------|---------------------------------------| | Intel Data Center Max Series | Support | Max 1550, 1100 | | Intel Data Center Flex Series | Support | Flex 170 | -| Intel Arc Series | Support | Arc 770, 730M, Arc A750, B580 | +| Intel Arc A-Series | Support | Arc A770, Arc A730M, Arc A750 | +| Intel Arc B-Series | Support | Arc B580 | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake | | Intel iGPU | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7 | diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json similarity index 70% rename from docs/backend/hexagon/CMakeUserPresets.json rename to docs/backend/snapdragon/CMakeUserPresets.json index 1f2676c0bc..4cf473d05f 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -1,5 +1,10 @@ { - "version": 4, + "version": 5, + "cmakeMinimumRequired": { + "major": 3, + "minor": 28, + "patch": 0 + }, "configurePresets": [ { "name": "arm64-android-snapdragon", @@ -16,7 +21,9 @@ "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", - "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}", + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}", "PREBUILT_LIB_DIR": "android_aarch64", "GGML_OPENMP": "OFF", "GGML_LLAMAFILE": "OFF", @@ -31,7 +38,15 @@ "name": "arm64-windows-snapdragon", "inherits": [ "base", "arm64-windows-llvm" ], "cacheVariables": { - "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", + "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", + "CMAKE_PREFIX_PATH": "$env{OPENCL_SDK_ROOT}", + "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}", + "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}", "PREBUILT_LIB_DIR": "windows_aarch64", "GGML_OPENMP": "OFF", "GGML_LLAMAFILE": "OFF", diff --git a/docs/backend/hexagon/README.md b/docs/backend/snapdragon/README.md similarity index 84% rename from docs/backend/hexagon/README.md rename to docs/backend/snapdragon/README.md index 3befdf7225..8e1f37b206 100644 --- a/docs/backend/hexagon/README.md +++ b/docs/backend/snapdragon/README.md @@ -1,6 +1,8 @@ -# Snapdragon-based Android devices +# Snapdragon-based devices -## How to Build +## Setup + +### Android The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain). This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. @@ -12,7 +14,24 @@ This method works on Linux, macOS, and Windows. macOS and Windows users should i [d]/> cd /workspace ``` -The rest of the Android build process assumes that you're running inside the toolchain container. +Note: The rest of the **Android** build process assumes that you're running inside the toolchain container. + +### Windows On Snapdragon + +Native Windows 11 arm64 builds has the following tools dependencies: +- MS Visual Studio 2026 (Community Edition or Pro) + - MSVC arm64 standard and runtime libraries + - UCRT and Driver Kit +- LLVM core libraries and Clang compiler (winget) +- CMake, Git, Python (winget) +- Hexagon SDK Community Edition 6.4 or later (see windows.md) +- OpenCL SDK 2.3 or later (see windows.md) + +Note: The rest of the **Windows** build process assumes that you're running natively in Powershell. +Adapt below build commands accordingly. + +## How to Build + Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets: ``` @@ -49,24 +68,26 @@ Preset CMake variables: To generate an installable "package" simply use cmake --install: ``` -[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp +[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp -- Install configuration: "Release" --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so --- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so +-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so ... --- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench --- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli +-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench +-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli ... ``` ## How to Install +### Android + For this step, your device needs to be configured for on-device development. Please see https://developer.android.com/studio/debug/dev-options for details. @@ -74,10 +95,10 @@ Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device. **Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.** ``` -~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/ -pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) -pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) -pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) +~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/ +pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s) +pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s) +pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s) 102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s) ``` @@ -92,6 +113,11 @@ At this point, you should also install some models: Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s) ``` +### Windows + +All artifacts are already installed in the `pkg-snapdragon` folder. +To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`. + ## How to Run The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables. diff --git a/docs/backend/hexagon/developer.md b/docs/backend/snapdragon/developer.md similarity index 100% rename from docs/backend/hexagon/developer.md rename to docs/backend/snapdragon/developer.md diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md new file mode 100644 index 0000000000..710ad8fdf4 --- /dev/null +++ b/docs/backend/snapdragon/windows.md @@ -0,0 +1,161 @@ +## Overview + +The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs. + + +In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g libggml-htp-v73.so) +must be included in the .cat file digitally signed with a trusted certificate. + +This document covers details on how to generate personal certificate files (.pfx) and how to configure the system +to allow for test signatures (aka test-signing). + +## Install the latest Adreno OpenCL SDK + +Either use the trimmed down version (optimized for CI) from + + https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz + +Or download the complete official version from + + https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2 + +Unzip/untar the archive into +``` +c:\Qualcomm\OpenCL_SDK\2.3.2 +``` + +## Install the latest Hexagon SDK Community Edition + +Either use the trimmed down version (optimized for CI) from + + https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz + +Or download the complete official version from + + https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2 + +Unzip/untar the archive into +``` +c:\Qualcomm\Hexagon_SDK\6.4.0.2 +``` + +## Install the latest Adreno GPU driver + +Download the driver from + + https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver + +After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under 'Display Adapters`) + +## Install the latest Qualcomm NPU driver + +Download the driver from + + https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND + +After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`). + +If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually. +The components are extracted into +``` +c:\QCDrivers\qcnspmcdm... +``` + +## Enable NPU driver test signatures + +Please note that the following steps are required only for the Hexagon NPU. +Adreno GPU backend does not require test signatures. + +### Enable testsigning + +Use `bcdedit` to enable test-signing +``` +> bcdedit /set TESTSIGNING ON +``` +(Secure Boot may need to be disabled for this to work) + +Make sure test-signing is enabled after reboot +``` +> bcdedit /enum +... +testsigning Yes +... +``` +For additional details see Microsoft guide at + + https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option + +### Create personal certificate + +The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be +installed as part of the MS Visual Studio. +They are typically located at +``` +c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0 +``` +(replace 10.0.26100.0 with correct version). + +To create personal self-signed certificate run the following commands (either from cmd or power-shell): +``` +> cd c:\Users\MyUser +> mkdir Certs +> cd Certs +> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer +> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx +``` +(replace `MyUser` with your username). + +Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores. +This can be done using `certlm` Certificate Manager tool. +Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the +PFX file you created above. + +For additional details see Microsoft guide at + + https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing + +Make sure to save the PFX file, you will need it for the build procedures. +Please note that the same certificate can be used for signing any number of builds. + +## Build Hexagon backend with signed HTP ops libraries + +The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms. +However, additional settings are required for generating and signing HTP Ops libraries. +``` +> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2" +> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2" +> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04" +> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx" +> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64" + +> cmake --preset arm64-windows-snapdragon -B build-wos +... +> cmake --install build-wos --prefix pkg-snapdragon +``` + +Once the build is complete HTP ops libraries will be installed like this +``` +> dir pkg-snapdragon/lib +... +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v73.so +-a---- 1/22/2026 6:01 PM 191752 libggml-htp-v75.so +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v79.so +-a---- 1/22/2026 6:01 PM 187656 libggml-htp-v81.so +-a---- 1/22/2026 6:01 PM 4139 libggml-htp.cat +``` + +The .cat file, the signature and proper certicate installation can be verified with + +``` +> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat +Verifying: .\pkg-snapdragon\lib\libggml-htp.cat + +Signature Index: 0 (Primary Signature) +Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF + +Signing Certificate Chain: + Issued to: GGML.HTP.v1 +... +Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat +... +``` diff --git a/docs/ops.md b/docs/ops.md index c066ab5a85..2c7c60dcca 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -97,7 +97,7 @@ Legend: | SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | -| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | +| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | @@ -114,7 +114,7 @@ Legend: | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | -| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | +| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | | UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ | | XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 091a5caed7..c1d22e65d4 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -29,8 +29,8 @@ "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -71,8 +71,8 @@ "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" @@ -113,8 +113,8 @@ "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -155,8 +155,8 @@ "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" @@ -10052,10 +10052,10 @@ "SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL" "SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL" "SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL" -"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL" +"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL" "SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL" "SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL" "SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL" diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index f54cfdd77f..aa6efa62b3 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -50,6 +50,12 @@ int main(int argc, char ** argv) { const int N = 5; // n-gram size const int G = 15; // max verification n-grams + // lookahead requires W + G + 1 sequences for parallel Jacobi decoding + params.n_parallel = W + G + 1; + + // unified KV cache is required for coupled sequences in batch splitting + params.kv_unified = true; + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -115,7 +121,7 @@ int main(int argc, char ** argv) { // seq_id == 0 : the current input token // seq_id [1, W] : tokens from the past N - 1 Jacobi iterations // seq_id [W + 1, W + G] : verification n-grams - llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1); + llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1); // target model sampling context struct common_sampler * smpl = common_sampler_init(model, params.sampling); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 8e73138a5f..c7552ddde1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -106,7 +106,7 @@ int main(int argc, char ** argv){ std::vector draft; - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1); + llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1); const auto t_dec_start = ggml_time_us(); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 260ad48f0e..265023733e 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -222,6 +222,7 @@ if (GGML_SCHED_NO_REALLOC) endif() add_library(ggml + ggml-backend-dl.cpp ggml-backend-reg.cpp) add_library(ggml::ggml ALIAS ggml) diff --git a/ggml/src/ggml-backend-dl.cpp b/ggml/src/ggml-backend-dl.cpp new file mode 100644 index 0000000000..a65cf00905 --- /dev/null +++ b/ggml/src/ggml-backend-dl.cpp @@ -0,0 +1,48 @@ +#include "ggml-backend-dl.h" + +#ifdef _WIN32 + +dl_handle * dl_load_library(const fs::path & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.wstring().c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +const char * dl_error() { + return ""; +} + +#else + +dl_handle * dl_load_library(const fs::path & path) { + dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + return handle; +} + +void * dl_get_sym(dl_handle * handle, const char * name) { + return dlsym(handle, name); +} + +const char * dl_error() { + const char *rslt = dlerror(); + return rslt != nullptr ? rslt : ""; +} + +#endif diff --git a/ggml/src/ggml-backend-dl.h b/ggml/src/ggml-backend-dl.h new file mode 100644 index 0000000000..f74b7c9489 --- /dev/null +++ b/ggml/src/ggml-backend-dl.h @@ -0,0 +1,45 @@ +#pragma once + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include + +namespace fs = std::filesystem; + +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +#endif + +using dl_handle_ptr = std::unique_ptr; + +dl_handle * dl_load_library(const fs::path & path); +void * dl_get_sym(dl_handle * handle, const char * name); +const char * dl_error(); + diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index dd991f262e..8a693f84af 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,5 +1,6 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-backend-dl.h" #include "ggml-impl.h" #include #include @@ -98,72 +99,6 @@ static std::string path_str(const fs::path & path) { } } -#ifdef _WIN32 - -using dl_handle = std::remove_pointer_t; - -struct dl_handle_deleter { - void operator()(HMODULE handle) { - FreeLibrary(handle); - } -}; - -static dl_handle * dl_load_library(const fs::path & path) { - // suppress error dialogs for missing DLLs - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - HMODULE handle = LoadLibraryW(path.wstring().c_str()); - - SetErrorMode(old_mode); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); - SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); - - void * p = (void *) GetProcAddress(handle, name); - - SetErrorMode(old_mode); - - return p; -} - -static const char * dl_error() { - return ""; -} - -#else - -using dl_handle = void; - -struct dl_handle_deleter { - void operator()(void * handle) { - dlclose(handle); - } -}; - -static void * dl_load_library(const fs::path & path) { - dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); - - return handle; -} - -static void * dl_get_sym(dl_handle * handle, const char * name) { - return dlsym(handle, name); -} - -static const char * dl_error() { - const char *rslt = dlerror(); - return rslt != nullptr ? rslt : ""; -} - -#endif - -using dl_handle_ptr = std::unique_ptr; - struct ggml_backend_reg_entry { ggml_backend_reg_t reg; dl_handle_ptr handle; diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 43280644e4..a3256d59dd 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1124,6 +1124,7 @@ struct ggml_tensor_extra_gpu { struct ggml_cuda_graph_node_properties { void * node_data; ggml_op node_op; + enum ggml_type node_type; int32_t flags; int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index bc98a0f665..796278a15e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2921,6 +2921,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); props->node_data = node->data; props->node_op = node->op; + props->node_type = node->type; props->flags = node->flags; for (int i = 0; i < GGML_MAX_DIMS; i++) { props->ne[i] = node->ne[i]; @@ -2945,6 +2946,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ return false; } + if (node->type != props->node_type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { if (node->ne[i] != props->ne[i]) { return false; @@ -3906,14 +3911,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud // Launch graph CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream())); #else + GGML_UNUSED(graph_key); graph_evaluated_or_captured = true; #endif // USE_CUDA_GRAPH } } -static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) { - #ifdef USE_CUDA_GRAPH +static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) { ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->graph == nullptr) { @@ -3926,12 +3931,8 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co } return graph->is_enabled(); -#else - GGML_UNUSED(cuda_ctx); - GGML_UNUSED(graph_key); - return false; -#endif // USE_CUDA_GRAPH } +#endif // USE_CUDA_GRAPH static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index d58e287823..f3a583543c 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -1,7 +1,29 @@ +file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) +file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) + +if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}") + message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT point to the correct Hexagon SDK installation.") +endif() + +if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}") + message("Try to read HEXAGON_TOOLS_ROOT from hexagon_sdk.json") + file(READ "${HEXAGON_SDK_ROOT}/hexagon_sdk.json" HEXAGON_SDK_CONFIG_PATH) + string(JSON HEXAGON_TOOLS_PATH GET ${HEXAGON_SDK_CONFIG_PATH} "root" "tools" "info" 0 "path") + message("Found HEXAGON_TOOLS_PATH: ${HEXAGON_TOOLS_PATH}") + set(HEXAGON_TOOLS_ROOT "${HEXAGON_SDK_ROOT}/${HEXAGON_TOOLS_PATH}") + file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) + if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}") + message(FATAL_ERROR "Make sure HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.") + endif() +endif() + +message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels") + include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake) include(ExternalProject) option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) +set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate") set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)") add_library(htp_iface OBJECT @@ -25,56 +47,71 @@ else() target_link_options(htp_iface PUBLIC -ldl) endif() -link_custom_library(htp_iface cdsprpc) -link_custom_library(htp_iface rpcmem) - set(TARGET_NAME ggml-hexagon) ggml_add_backend_library(${TARGET_NAME} - ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h) + ggml-hexagon.cpp + htp-drv.cpp + htp-drv.h + libdl.h + ../../include/ggml-hexagon.h) target_link_libraries(${TARGET_NAME} PRIVATE htp_iface) target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR}) -# Build HTP bits -set(HTP_CMAKE_ARGS - -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR} - -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT} - -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT} - -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG} - -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) +# Build HTP skels +set(HTP_SKELS) +function(build_htp_skel V) + ExternalProject_Add(htp-${V} + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON + BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so + CMAKE_ARGS + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake + -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR} + -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} + -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT} + -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG} + -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE} + -DDSP_VERSION=${V} + -DPREBUILT_LIB_DIR="toolv19_${V}") + list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so) + set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE) +endfunction() -ExternalProject_Add(htp-v68 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68") - -ExternalProject_Add(htp-v69 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69") - -ExternalProject_Add(htp-v73 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73") - -ExternalProject_Add(htp-v75 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75") - -ExternalProject_Add(htp-v79 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79") - -ExternalProject_Add(htp-v81 - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON - CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81") +build_htp_skel(v68) +build_htp_skel(v69) +build_htp_skel(v73) +build_htp_skel(v75) +build_htp_skel(v79) +build_htp_skel(v81) # Install Hexagon skels required at runtime -install(FILES - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so - ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so - TYPE LIB) +install(FILES ${HTP_SKELS} TYPE LIB) + +if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT) + file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64" WINSDK_BIN0_ARM64) + file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86" WINSDK_BIN0_X86) + file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64) + file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86" WINSDK_BIN1_X86) + + set(WINSDK_PATHS ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86} ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86}) + + find_program(INF2CAT NAMES inf2cat.exe PATHS ${WINSDK_PATHS} REQUIRED) + find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED) + + message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels") + + set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat) + add_custom_target(libggml-htp-cat + BYPRODUCTS ${LIBGGML_HTP_CAT} + DEPENDS libggml-htp.inf ${HTP_SKELS} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64 + COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT} + COMMENT "generating and signing libggml-htp.cat file" + VERBATIM + ) + + add_dependencies(${TARGET_NAME} libggml-htp-cat) + install(FILES ${LIBGGML_HTP_CAT} TYPE LIB) +endif() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5b835c11c7..4f0a1620fb 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -14,9 +14,6 @@ #ifdef _WIN32 # include -# ifndef _WINDOWS -# define _WINDOWS -# endif #else # include # include @@ -25,8 +22,6 @@ #pragma clang diagnostic ignored "-Wnested-anon-types" #pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -#include "htp-utils.h" - #include #include #include @@ -40,6 +35,7 @@ #include "op-desc.h" #include "htp-msg.h" #include "htp_iface.h" +#include "htp-drv.h" static size_t opt_ndev = 1; static size_t opt_nhvx = 0; // use all @@ -150,9 +146,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_ 0, // flags - the framework will autoset this n_bufs, // number of buffers bufs, // buffer references - sizeof(req), + sizeof(req), // Message length (const uint8_t *) &req, // Message - 1000000 // Timeout + DSPQUEUE_TIMEOUT // Timeout ); if (err != 0) { @@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() { // Read response packet from queue int err = dspqueue_read(q, &flags, - HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references - &n_bufs, // Number of buffer references - bufs, // Buffer references - sizeof(rsp), // Max message length - &rsp_size, // Message length - (uint8_t *) &rsp, - 1000000); // Timeout + HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references + &n_bufs, // Number of buffer references + bufs, // Buffer references + sizeof(rsp), // Max message length + &rsp_size, // Message length + (uint8_t *) &rsp, // Message + DSPQUEUE_TIMEOUT); // Timeout if (err == AEE_EEXPIRED) { // TODO: might need to bail out if the HTP is stuck on something @@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context { ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) { size += 4 * 1024; // extra page for padding - if (rpcmem_alloc2) { - this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); - } else { - GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str()); - this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); - } - + this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size); if (!this->base) { GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size); throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)"); @@ -2461,12 +2451,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { } static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) { - return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type)); + return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type)); } static inline bool is_compute_op(ggml_tensor *node) { - return !(ggml_op_is_empty(node->op) || ggml_is_empty(node)); + return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE); } // scan the graph and figure out last compute op index @@ -2488,7 +2478,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg const int last = last_compute_op(graph); - const struct ggml_tensor * prev_quant_op = nullptr; // prev executed op with quantizer + const struct ggml_tensor * prev_op = nullptr; // prev executed op for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; @@ -2497,17 +2487,15 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg continue; } - if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { - continue; - } - uint32_t flags = 0; // skip quantizer if src1 is reused - if (op_reuse_src1(node, prev_quant_op)) { + if (op_reuse_src1(node, prev_op)) { flags |= HTP_OPFLAGS_SKIP_QUANTIZE; } + prev_op = node; + // ask for early notification for the last Op if (i == last) { flags |= HTP_OPFLAGS_EARLY_WAKEUP; @@ -2520,7 +2508,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg } else { ggml_hexagon_dispatch_op>(sess, node, flags); } - prev_quant_op = node; break; case GGML_OP_MUL_MAT_ID: if (ggml_is_quantized(node->src[0]->type)) { @@ -2528,7 +2515,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg } else { ggml_hexagon_dispatch_op>(sess, node, flags); } - prev_quant_op = node; break; case GGML_OP_MUL: case GGML_OP_ADD: @@ -2670,7 +2656,7 @@ static std::vector ggml_hexagon_graph_optimize_reorder(const std::vectorcontext = new ggml_hexagon_registry(reg); HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req), @@ -3180,6 +3170,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) { static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { + auto nErr = htpdrv_init(); + if (nErr != AEE_SUCCESS) { + return NULL; + } + ggml_hexagon_init(®); } diff --git a/ggml/src/ggml-hexagon/htp-drv.cpp b/ggml/src/ggml-hexagon/htp-drv.cpp new file mode 100644 index 0000000000..2530bb06d6 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-drv.cpp @@ -0,0 +1,418 @@ +// sample drv interface + +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wmissing-prototypes" +#pragma clang diagnostic ignored "-Wsign-compare" + +#include +#include +#include +#include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include "ggml-impl.h" +#include "htp-drv.h" +#include "libdl.h" + +#include + +// +// Driver API types +// + +typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size); +typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size); +typedef void (*rpcmem_free_pfn_t)(void * po); +typedef int (*rpcmem_to_fd_pfn_t)(void * po); + +typedef AEEResult (*dspqueue_create_pfn_t)(int domain, + uint32_t flags, + uint32_t req_queue_size, + uint32_t resp_queue_size, + dspqueue_callback_t packet_callback, + dspqueue_callback_t error_callback, + void * callback_context, + dspqueue_t * queue); +typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue); +typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id); +typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags, + uint32_t num_buffers, + struct dspqueue_buffer *buffers, + uint32_t message_length, + const uint8_t *message, + uint32_t timeout_us); +typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags, + uint32_t max_buffers, uint32_t *num_buffers, + struct dspqueue_buffer *buffers, + uint32_t max_message_length, + uint32_t *message_length, uint8_t *message, + uint32_t timeout_us); + +typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags); +typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length); + +typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph); +typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra); +typedef int (*remote_handle64_close_pfn_t)(remote_handle h); +typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen); +typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen); +typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen); + +// +// Driver API pfns +// + +rpcmem_alloc_pfn_t rpcmem_alloc_pfn = nullptr; +rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr; +rpcmem_free_pfn_t rpcmem_free_pfn = nullptr; +rpcmem_to_fd_pfn_t rpcmem_to_fd_pfn = nullptr; + +fastrpc_mmap_pfn_t fastrpc_mmap_pfn = nullptr; +fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr; + +dspqueue_create_pfn_t dspqueue_create_pfn = nullptr; +dspqueue_close_pfn_t dspqueue_close_pfn = nullptr; +dspqueue_export_pfn_t dspqueue_export_pfn = nullptr; +dspqueue_write_pfn_t dspqueue_write_pfn = nullptr; +dspqueue_read_pfn_t dspqueue_read_pfn = nullptr; + +remote_handle64_open_pfn_t remote_handle64_open_pfn = nullptr; +remote_handle64_invoke_pfn_t remote_handle64_invoke_pfn = nullptr; +remote_handle64_close_pfn_t remote_handle64_close_pfn = nullptr; +remote_handle_control_pfn_t remote_handle_control_pfn = nullptr; +remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr; +remote_session_control_pfn_t remote_session_control_pfn = nullptr; + +// +// Driver API +// + +void * rpcmem_alloc(int heapid, uint32_t flags, int size) { + return rpcmem_alloc_pfn(heapid, flags, size); +} + +void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) { + if (rpcmem_alloc2_pfn) { + return rpcmem_alloc2_pfn(heapid, flags, size); + } else { + GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n"); + return rpcmem_alloc_pfn(heapid, flags, size); + } +} + +void rpcmem_free(void * po) { + return rpcmem_free_pfn(po); +} + +int rpcmem_to_fd(void * po) { + return rpcmem_to_fd_pfn(po); +} + +HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) { + return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags); +} + +HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) { + return fastrpc_munmap_pfn(domain, fd, addr, length); +} + +AEEResult dspqueue_create(int domain, + uint32_t flags, + uint32_t req_queue_size, + uint32_t resp_queue_size, + dspqueue_callback_t packet_callback, + dspqueue_callback_t error_callback, + void * callback_context, + dspqueue_t * queue) { + return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback, + callback_context, queue); +} + +AEEResult dspqueue_close(dspqueue_t queue) { + return dspqueue_close_pfn(queue); +} + +AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) { + return dspqueue_export_pfn(queue, queue_id); +} + +AEEResult dspqueue_write(dspqueue_t queue, + uint32_t flags, + uint32_t num_buffers, + struct dspqueue_buffer * buffers, + uint32_t message_length, + const uint8_t * message, + uint32_t timeout_us) { + return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us); +} + +AEEResult dspqueue_read(dspqueue_t queue, + uint32_t * flags, + uint32_t max_buffers, + uint32_t * num_buffers, + struct dspqueue_buffer * buffers, + uint32_t max_message_length, + uint32_t * message_length, + uint8_t * message, + uint32_t timeout_us) { + return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length, + message, timeout_us); +} + +HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) { + return remote_handle64_open_pfn(name, ph); +} + +HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) { + return remote_handle64_invoke_pfn(h, dwScalars, pra); +} + +HTPDRV_API int remote_handle64_close(remote_handle64 h) { + return remote_handle64_close_pfn(h); +} + +HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) { + return remote_handle_control_pfn(req, data, datalen); +} + +HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) { + return remote_handle64_control_pfn(h, req, data, datalen); +} + +HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) { + return remote_session_control_pfn(req, data, datalen); +} + +#ifdef _WIN32 + +static std::string wstr_to_str(std::wstring_view wstr) { + std::string result; + if (wstr.empty()) { + return result; + } + auto bytes_needed = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, + wstr.data(), (int) wstr.size(), + nullptr, 0, nullptr, nullptr); + if (bytes_needed == 0) { + GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError()); + throw std::runtime_error("Invalid wstring input"); + } + + result.resize(bytes_needed, '\0'); + int bytes_written = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, + wstr.data(), (int) wstr.size(), + result.data(), bytes_needed, + nullptr, nullptr); + if (bytes_written == 0) { + GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError()); + throw std::runtime_error("Wstring conversion failed"); + } + return result; +} + +static std::string get_driver_path() { + std::wstring serviceName = L"qcnspmcdm"; + std::string result; + + // Get a handle to the SCM database. + SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ); + if (nullptr == schSCManager) { + GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError()); + return result; + } + + // Get a handle to the service. + SC_HANDLE schService = OpenServiceW(schSCManager, // SCM database + serviceName.c_str(), // name of service + SERVICE_QUERY_CONFIG); // need query config access + + if (nullptr == schService) { + GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError()); + CloseServiceHandle(schSCManager); + return result; + } + + // Store the size of buffer used as an output. + DWORD bufferSize; + if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) && + (GetLastError() != ERROR_INSUFFICIENT_BUFFER)) { + GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError()); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + return result; + } + // Get the configuration of the service. + LPQUERY_SERVICE_CONFIGW serviceConfig = + static_cast(LocalAlloc(LMEM_FIXED, bufferSize)); + if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) { + fprintf(stderr, "ggml-hex: Failed to query service config. Error: %lu\n", GetLastError()); + LocalFree(serviceConfig); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + return result; + } + + // Read the driver file path get its parent directory + std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName); + driverPath = driverPath.substr(0, driverPath.find_last_of(L"\\")); + + // Clean up resources + LocalFree(serviceConfig); + CloseServiceHandle(schService); + CloseServiceHandle(schSCManager); + + // Driver path would contain invalid path string, like: + // \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37 + // "\SystemRoot" should be replace with a correct one (e.g. C:\Windows) + const std::wstring systemRootPlaceholder = L"\\SystemRoot"; + if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) { + GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n"); + return result; + } + + // Replace \SystemRoot with an absolute path from system ENV windir + const std::wstring systemRootEnv = L"windir"; + + // Query the number of wide charactors this variable requires + DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0); + if (numWords == 0) { + GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n"); + return result; + } + + // Query the actual system root name from environment variable + std::vector systemRoot(numWords + 1); + numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1); + if (numWords == 0) { + GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n"); + return result; + } + driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data())); + + return wstr_to_str(driverPath); +} + +#endif + +using dl_handle_ptr = std::unique_ptr; + +int htpdrv_init() { + static dl_handle_ptr lib_cdsp_rpc_handle = nullptr; + static bool initialized = false; +#ifdef _WIN32 + std::string drv_path = get_driver_path() + "\\" + "libcdsprpc.dll"; +#else + std::string drv_path = "libcdsprpc.so"; +#endif + if (initialized) { + GGML_LOG_INFO("ggml-hex: Driver already loaded\n"); + return AEE_SUCCESS; + } + GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str()); + + fs::path path{ drv_path.c_str() }; + dl_handle_ptr handle { dl_load_library(path) }; + if (!handle) { + GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error()); + return AEE_EUNABLETOLOAD; + } + +#define dlsym(drv, type, pfn, symbol, ignore) \ + do { \ + pfn = (type) dl_get_sym(drv, #symbol); \ + if (!ignore && nullptr == pfn) { \ + GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol); \ + return AEE_EUNABLETOLOAD; \ + } \ + } while (0) + + dlsym(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false); + dlsym(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true); + dlsym(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false); + dlsym(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false); + dlsym(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false); + dlsym(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false); + dlsym(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false); + dlsym(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false); + dlsym(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false); + dlsym(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false); + dlsym(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false); + dlsym(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false); + dlsym(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false); + dlsym(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false); + dlsym(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false); + dlsym(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false); + dlsym(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false); + + lib_cdsp_rpc_handle = std::move(handle); + initialized = true; + + return AEE_SUCCESS; +} + +domain * get_domain(int domain_id) { + int i = 0; + int size = sizeof(supported_domains) / sizeof(domain); + + for (i = 0; i < size; i++) { + if (supported_domains[i].id == domain_id) { + return &supported_domains[i]; + } + } + + return NULL; +} + +int get_hex_arch_ver(int domain, int * arch) { + if (!remote_handle_control_pfn) { + GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + struct remote_dsp_capability arch_ver; + arch_ver.domain = (uint32_t) domain; + arch_ver.attribute_ID = ARCH_VER; + arch_ver.capability = (uint32_t) 0; + + int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); + if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { + GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n"); + return AEE_EUNSUPPORTEDAPI; + } + + if (err != AEE_SUCCESS) { + GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err); + return err; + } + + switch (arch_ver.capability & 0xff) { + case 0x68: + *arch = 68; + return 0; + case 0x69: + *arch = 69; + return 0; + case 0x73: + *arch = 73; + return 0; + case 0x75: + *arch = 75; + return 0; + case 0x79: + *arch = 79; + return 0; + case 0x81: + *arch = 81; + return 0; + } + return -1; +} diff --git a/ggml/src/ggml-hexagon/htp-drv.h b/ggml/src/ggml-hexagon/htp-drv.h new file mode 100644 index 0000000000..6eba7ba17d --- /dev/null +++ b/ggml/src/ggml-hexagon/htp-drv.h @@ -0,0 +1,121 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 +# pragma clang diagnostic ignored "-Wignored-attributes" +#endif + +#include +#include +#include +#include + +#if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BACKEND_BUILD +# define HTPDRV_API __declspec(dllexport) extern +# else +# define HTPDRV_API __declspec(dllimport) extern +# endif +#else +# define HTPDRV_API __attribute__ ((visibility ("default"))) extern +#endif + +/* Offset to differentiate HLOS and Hexagon error codes. + Stores the value of AEE_EOFFSET for Hexagon. */ +#ifndef DSP_OFFSET +# define DSP_OFFSET 0x80000400 +#endif + +/* Errno for connection reset by peer. */ +#ifndef ECONNRESET +# ifdef __hexagon__ +# define ECONNRESET 104 +# endif +#endif + +/* Abstraction of different OS specific sleep APIs. + SLEEP accepts input in seconds. */ +#ifndef SLEEP +# ifdef __hexagon__ +# define SLEEP(x) \ + { /* Do nothing for simulator. */ \ + } +# else +# ifdef _WIN32 +# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */ +# else +# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */ +# endif +# endif +#endif + +/* Include windows specific header files. */ +#ifdef _WIN32 +# include +# include +# define _CRT_SECURE_NO_WARNINGS 1 +# define _WINSOCK_DEPRECATED_NO_WARNINGS 1 +#endif + +/* Includes and defines for all HLOS except windows */ +#if !defined(__hexagon__) && !defined(_WIN32) +# include "unistd.h" + +# include +#endif + +/* Includes and defines for Hexagon and all HLOS except Windows. */ +#if !defined(_WIN32) +/* Weak reference to remote symbol for compilation. */ +# pragma weak remote_session_control +# pragma weak remote_handle_control +# pragma weak remote_handle64_control +# pragma weak fastrpc_mmap +# pragma weak fastrpc_munmap +# pragma weak rpcmem_alloc2 +#endif + +#if !defined(_WIN32) +# pragma weak remote_system_request +#endif + +#ifdef _WIN32 +# define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE +#else +# define DSPQUEUE_TIMEOUT 1000000 +#endif + +/** + * htpdrv_init API: driver interface entry point + * + * @return Return AEE error codes as defined in Hexagon SDK. + */ +HTPDRV_API int htpdrv_init(void); + +/** + * get_domain API: get domain struct from domain value. + * + * @param[in] domain value of a domain + * @return Returns domain struct of the domain if it is supported or else + * returns NULL. + * + */ +HTPDRV_API domain * get_domain(int domain_id); + +/** + * get_hex_arch_ver API: query the Hexagon processor architecture version information + * + * @param[in] domain_id value of a domain + * @param[out] Arch version (73, 75, ...) + * @return 0 if query is successful. + * non-zero if error, return value points to the error. + * + */ +HTPDRV_API int get_hex_arch_ver(int domain, int * arch); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c deleted file mode 100644 index 3f335bf71c..0000000000 --- a/ggml/src/ggml-hexagon/htp-utils.c +++ /dev/null @@ -1,454 +0,0 @@ - -#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -#pragma clang diagnostic ignored "-Wmissing-prototypes" -#pragma clang diagnostic ignored "-Wsign-compare" - -#define GGML_COMMON_IMPL_C -#include "ggml-backend-impl.h" -#include "ggml-common.h" -#include "ggml-hexagon.h" -#include "ggml-impl.h" - -#include "htp-utils.h" - -#include -#include -#include -#include -#include -#include -#include - -domain * get_domain(int domain_id) { - int i = 0; - int size = sizeof(supported_domains) / sizeof(domain); - - for (i = 0; i < size; i++) { - if (supported_domains[i].id == domain_id) { - return &supported_domains[i]; - } - } - - return NULL; -} - -bool is_valid_domain_id(int domain_id, int compute_only) { - int i = 0; - int size = sizeof(supported_domains) / sizeof(domain); - - if (compute_only) { - return is_CDSP(domain_id); - } - - for (i = 0; i < size; i++) { - if (supported_domains[i].id == domain_id) { - return true; - } - } - - return false; -} - -int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { - int nErr = AEE_SUCCESS; - int ss_info = 0; - if (domain_type != NULL) { - if (strcmp(domain_type, "LPASS") == 0) { - ss_info = FASTRPC_LPASS; - } else if (strcmp(domain_type, "HPASS") == 0) { - ss_info = FASTRPC_HPASS; - } else { - ss_info = FASTRPC_NSP; - } - } - system_req_payload req = { 0 }; - req.id = FASTRPC_GET_DOMAINS; - req.sys.domains = NULL; - fastrpc_domain * domain = NULL; - if (ss_info != 0) { - req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); - } else { - req.sys.flags = 0; - } -#ifdef _WIN32 - nErr = AEE_EUNSUPPORTED; - goto bail; -#endif - if (remote_system_request) { - nErr = remote_system_request(&req); - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); - goto bail; - } - // Allocate memory for domain-info array - req.sys.max_domains = req.sys.num_domains; - if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) { - nErr = AEE_ENOMEMORY; - GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains"); - goto bail; - } - - nErr = remote_system_request(&req); - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr); - goto bail; - } - - for (int i = 0; i < req.sys.num_domains; i++) { - // Verify that only requested type domains were returned - domain = &req.sys.domains[i]; - if (domain->type != ss_info && domain_type != NULL) { - nErr = -1; - GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n"); - goto bail; - } - } - *domains_info = req.sys.domains; - *num_domains = req.sys.num_domains; - } else { - nErr = AEE_EUNSUPPORTED; - goto bail; - } -bail: - if (nErr && !req.sys.domains) { - free(req.sys.domains); - } - return nErr; -} - -int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) { - int err = 0; - remote_rpc_effective_domain_id_t sess = { 0 }; - - sess.domain_name = domain_name; - sess.domain_name_len = strlen(domain_name); - sess.session_id = session_id; - - err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess)); - if (err) { - GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name, - session_id); - return err; - } - - *effec_domain_id = sess.effective_domain_id; - return err; -} - -int get_dsp_support(int * domain) { - int nErr = AEE_SUCCESS; - *domain = CDSP_DOMAIN_ID; // DSP domain default value is CDSP_DOMAIN_ID - - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 }; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - goto bail; - } - - if (dsp_capability_domain.capability == 0) { - dsp_capability_domain.domain = ADSP_DOMAIN_ID; // Check for ADSP support. - dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; - dsp_capability_domain.capability = 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, - sizeof(struct remote_dsp_capability)); - if (dsp_capability_domain.capability) { - *domain = ADSP_DOMAIN_ID; // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID - } - } - - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (attr == VTCM_PAGE || attr == VTCM_COUNT) { - } else { - nErr = AEE_EBADPARM; - GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n"); - goto bail; - } - if (remote_handle_control) { - if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for VTCM information - * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0 - */ - struct remote_dsp_capability dsp_capability_vtcm_dsp; - dsp_capability_vtcm_dsp.domain = (uint32_t) domain; - dsp_capability_vtcm_dsp.attribute_ID = attr; - dsp_capability_vtcm_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_vtcm_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("Unsupported domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -bool is_unsignedpd_supported(int domain_id) { - int nErr = AEE_SUCCESS; - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 }; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n"); - return false; - } - if (nErr) { - GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr); - return false; - } - if (dsp_capability_domain.capability == 1) { - return true; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n"); - return false; - } - return false; -} - -bool get_unsignedpd_support(void) { - return is_unsignedpd_supported(CDSP_DOMAIN_ID); -} - -bool is_async_fastrpc_supported(int domain) { - int nErr = AEE_SUCCESS; - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for ASYNC_FASTRPC_SUPPORT information - * Async fastrpc is supported only on CDSP - */ - struct remote_dsp_capability dsp_capability_async_support; - dsp_capability_async_support.domain = (uint32_t) domain; - dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; - dsp_capability_async_support.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (dsp_capability_async_support.capability == 1) { - return true; - } - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return false; -} - -bool is_status_notification_supported(int domain) { - int nErr = AEE_SUCCESS; - - if (remote_handle_control) { - /* - * Query the DSP for STATUS_NOTIFICATION_SUPPORT information - * DSP User PD status notification Support - */ - struct remote_dsp_capability dsp_capability_status_notification_support; - dsp_capability_status_notification_support.domain = (uint32_t) domain; - dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; - dsp_capability_status_notification_support.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (dsp_capability_status_notification_support.capability == 1) { - return true; - } - if (nErr != AEE_SUCCESS) { - GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return false; -} - -int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { - nErr = AEE_EBADPARM; - GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n"); - goto bail; - } - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HMX SUPPORT information - * HMX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hmx_dsp; - dsp_capability_hmx_dsp.domain = (uint32_t) domain; - dsp_capability_hmx_dsp.attribute_ID = attr; - dsp_capability_hmx_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_hmx_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} - -int get_hex_arch_ver(int domain, int * arch) { - if (!remote_handle_control) { - GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n"); - return AEE_EUNSUPPORTEDAPI; - } - - struct remote_dsp_capability arch_ver; - arch_ver.domain = (uint32_t) domain; - arch_ver.attribute_ID = ARCH_VER; - arch_ver.capability = (uint32_t) 0; - - int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver)); - if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) { - GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n"); - return AEE_EUNSUPPORTEDAPI; - } - - if (err != AEE_SUCCESS) { - GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err); - return err; - } - - switch (arch_ver.capability & 0xff) { - case 0x68: - *arch = 68; - return 0; - case 0x69: - *arch = 69; - return 0; - case 0x73: - *arch = 73; - return 0; - case 0x75: - *arch = 75; - return 0; - case 0x79: - *arch = 79; - return 0; - case 0x81: - *arch = 81; - return 0; - } - return -1; -} - -int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) { - int nErr = AEE_SUCCESS; - *capability = 0; - - if (remote_handle_control) { - if (domain == CDSP_DOMAIN_ID) { - /* - * Query the DSP for HVX SUPPORT information - * HVX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hvx_dsp; - dsp_capability_hvx_dsp.domain = (uint32_t) domain; - dsp_capability_hvx_dsp.attribute_ID = attr; - dsp_capability_hvx_dsp.capability = (uint32_t) 0; - nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, - sizeof(struct remote_dsp_capability)); - if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n"); - GGML_LOG_ERROR("Running the usecase without checking the capability\n"); - nErr = AEE_SUCCESS; - goto bail; - } else if (nErr == AEE_SUCCESS) { - *capability = dsp_capability_hvx_dsp.capability; - } else { - GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTED; - GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain); - goto bail; - } - } else { - nErr = AEE_EUNSUPPORTEDAPI; - GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n"); - } - -bail: - return nErr; -} diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h deleted file mode 100644 index 7bbae3a0b7..0000000000 --- a/ggml/src/ggml-hexagon/htp-utils.h +++ /dev/null @@ -1,221 +0,0 @@ -#ifndef HTP_UTILS_H -#define HTP_UTILS_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include -#include - -/* Offset to differentiate HLOS and Hexagon error codes. - Stores the value of AEE_EOFFSET for Hexagon. */ -#ifndef DSP_OFFSET -# define DSP_OFFSET 0x80000400 -#endif - -/* Errno for connection reset by peer. */ -#ifndef ECONNRESET -# ifdef __hexagon__ -# define ECONNRESET 104 -# endif -#endif - -/* Abstraction of different OS specific sleep APIs. - SLEEP accepts input in seconds. */ -#ifndef SLEEP -# ifdef __hexagon__ -# define SLEEP(x) \ - { /* Do nothing for simulator. */ \ - } -# else -# ifdef _WINDOWS -# define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */ -# else -# define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */ -# endif -# endif -#endif - -/* Include windows specific header files. */ -#ifdef _WINDOWS -# include -# include -# define _CRT_SECURE_NO_WARNINGS 1 -# define _WINSOCK_DEPRECATED_NO_WARNINGS 1 -/* Including this file for custom implementation of getopt function. */ -# include "getopt_custom.h" -#endif - -/* Includes and defines for all HLOS except windows */ -#if !defined(__hexagon__) && !defined(_WINDOWS) -# include "unistd.h" - -# include -#endif - -/* Includes and defines for Hexagon and all HLOS except Windows. */ -#if !defined(_WINDOWS) -/* Weak reference to remote symbol for compilation. */ -# pragma weak remote_session_control -# pragma weak remote_handle_control -# pragma weak remote_handle64_control -# pragma weak fastrpc_mmap -# pragma weak fastrpc_munmap -# pragma weak rpcmem_alloc2 -#endif - -#if !defined(_WINDOWS) -# pragma weak remote_system_request -#endif -/** - * Wrapper for FastRPC Capability API: query DSP support. - * - * @param[out] domain pointer to supported domain. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - */ -int get_dsp_support(int * domain); - -/** - * Wrapper for FastRPC Capability API: query VTCM information. - * - * @param[in] domain value of domain in the queried. - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - */ -int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr); - -/** - * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain. - * - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. - */ - -bool get_unsignedpd_support(void); - -/** - * Wrapper for FastRPC Capability API: query unsigned pd support. - * - * @param[in] domain value of domain in the queried. - * @return true if unsigned pd is supported. - * false if unsigned pd is not supported, capability query failed. - */ - -bool is_unsignedpd_supported(int domain_id); - -/** - * is_valid_domain_id API: query a domain id is valid. - * - * @param[in] domain value of domain in the queried. - * @param[in] compute_only value of domain is only compared with CDSP domains supported by the target when enabled. - * @return true if value of domain is valid. - * false if value of domain is not valid. - */ - -bool is_valid_domain_id(int domain_id, int compute_only); - -/** - * get_domain API: get domain struct from domain value. - * - * @param[in] domain value of a domain - * @return Returns domain struct of the domain if it is supported or else - * returns NULL. - * - */ - -domain * get_domain(int domain_id); - -/** - * get_domains_info API: get information for all the domains available on the device - * - * @param[in] domain_type pointer to domain type - * @param[in] num_domains pointer to number of domains - * @param[in] domains_info pointer to save discovered domains information. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application. - * - */ - -int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info); - -/** - * get_effective_domain_id API: get effective domain id for given session id - * - * @param[in] domain_name pointer to domain name - * @param[in] session_id - * @param[in] effec_domain_id pointer to save obtained effective domain id. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ - -int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id); - -/** - * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating support of Async FastRPC - * - */ - -bool is_async_fastrpc_supported(int domain_id); - -/** - * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information - * - * @param[in] domain_id value of a domain - * @return Returns true or false stating status notification support information - * - */ -bool is_status_notification_supported(int domain_id); - -/** - * get_hmx_support_info API: query the DSP for HMX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr); - -/** - * get_hex_arch_ver API: query the Hexagon processor architecture version information - * - * @param[in] domain_id value of a domain - * @param[out] Arch version (73, 75, ...) - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hex_arch_ver(int domain, int * arch); - -/** - * get_hvx_support_info API: query the DSP for HVX SUPPORT information - * - * @param[in] domain_id value of a domain - * @param[out] capability capability value of the attribute queried. - * @param[in] attr value of the attribute to the queried. - * @return 0 if query is successful. - * non-zero if error, return value points to the error. - * - */ -int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr); - -#ifdef __cplusplus -} -#endif - -#endif //DSP_CAPABILITIES_UTILS_H diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c index c7cb2a4e0b..c184637443 100644 --- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c @@ -17,6 +17,12 @@ #include "htp-msg.h" #include "htp-ops.h" +static inline HVX_Vector hvx_load_f32_to_f16(const HVX_Vector * restrict src, const HVX_Vector zero) { + HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(src[0], zero); // 32 elements + HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(src[1], zero); // 32 elements + return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf))); +} + // Dot product of FP32 and FP16 vectors, accumulating to float static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) { const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32 @@ -33,23 +39,19 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict #pragma unroll(4) for (i = 0; i < nvec; i++) { // Load y (fp32) and convert into fp16 - HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements - HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements - HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf))); + HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero); // Load x (fp16) HVX_Vector x_hf = vx[i]; HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum)); } if (nloe) { // Load y (fp32) and convert into fp16 - HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero); // 32 elements - HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero); // 32 elements - HVX_Vector y_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf))); + HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero); // Load x (fp16) HVX_Vector x_hf = vx[i]; @@ -62,13 +64,72 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum)); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); + rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)); + hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum)); +} - hvx_vec_store_u(r, 4, rsum); +// Dot product of FP32 and FP16 vectors, accumulating to float +static inline void hvx_dot_f32_f16_aa_rx2(float * restrict r, + const void * restrict y, + const void * restrict x0, + const void * restrict x1, + unsigned int n, + float s) { + const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32 + const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16 + const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16 + + uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors + uint32_t nloe = n % VLEN_FP16; // leftover elements + + const HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector rsum0 = Q6_V_vsplat_R(0); + HVX_Vector rsum1 = Q6_V_vsplat_R(0); + + uint32_t i = 0; + + #pragma unroll(2) + for (i = 0; i < nvec; i++) { + // Load y (fp32) and convert into fp16 + HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero); + // Load x (fp16) + HVX_Vector x0_hf = vx0[i]; + HVX_Vector x1_hf = vx1[i]; + + HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf); + HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf); + + rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1)); + } + + if (nloe) { + // Load y (fp32) and convert into fp16 + HVX_Vector y_hf = hvx_load_f32_to_f16(&vy[i*2], zero); + + // Load x (fp16) + HVX_Vector x0_hf = vx0[i]; + HVX_Vector x1_hf = vx1[i]; + + // Zero-out unused elements + // Note that we need to clear both x and y because they may contain NANs + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2); + x0_hf = Q6_V_vand_QV(bmask, x0_hf); + x1_hf = Q6_V_vand_QV(bmask, x1_hf); + y_hf = Q6_V_vand_QV(bmask, y_hf); + + HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf); + HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf); + + rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1)); + } + + HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1)); + hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum)); } // Dot product of two F16 vectors, accumulating to float @@ -91,7 +152,7 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum)); } if (nloe) { @@ -103,12 +164,62 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf); - rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); + rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum)); } - rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s)); - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); - hvx_vec_store_u(r, 4, rsum); + rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum)); + hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum)); +} + +static inline void hvx_dot_f16_f16_aa_rx2(float * restrict r, + const void * restrict y, + const void * restrict x0, + const void * restrict x1, + unsigned int n, + float s) { + const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0; // fp16 + const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1; // fp16 + const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp16 + + uint32_t nvec = n / VLEN_FP16; // num full fp16 hvx vectors + uint32_t nloe = n % VLEN_FP16; // leftover elements + + const HVX_Vector zero = Q6_V_vsplat_R(0); + HVX_Vector rsum0 = Q6_V_vsplat_R(0); + HVX_Vector rsum1 = Q6_V_vsplat_R(0); + + uint32_t i = 0; + + #pragma unroll(4) + for (i = 0; i < nvec; i++) { + HVX_Vector y_hf = vy[i]; + HVX_Vector x0_hf = vx0[i]; + HVX_Vector x1_hf = vx1[i]; + + HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf); + HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf); + + rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1)); + } + + if (nloe) { + HVX_Vector y_hf = vy[i]; + + // Load x (fp16) and zero-out unused elements + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2); + HVX_Vector x0_hf = Q6_V_vand_QV(bmask, vx0[i]); + HVX_Vector x1_hf = Q6_V_vand_QV(bmask, vx1[i]); + + HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf); + HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf); + + rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0)); + rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1)); + } + + HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1)); + hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum)); } // MAD: y (F32) += x (F16) * s (float) @@ -317,20 +428,22 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in // Inner loop processing the block from VTCM uint32_t ic = 0; + const bool is_q_fp32 = (q->type == HTP_TYPE_F32); + // Process in blocks of 32 (VLEN_FP32) - static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 == 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage"); + static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage"); HVX_Vector_x4 scores_x4; HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY); for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) { // 1. Compute scores - float __attribute__((aligned(VLEN))) scores_arr[FLASH_ATTN_BLOCK_SIZE]; - for (int j = 0; j < VLEN_FP32; ++j) { + float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32]; + for (int j = 0; j < VLEN_FP32; j += 2) { const uint32_t cur_ic = ic + j; const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded; - if (q->type == HTP_TYPE_F32) { - hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale); + if (is_q_fp32) { + hvx_dot_f32_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale); } else { - hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale); + hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale); } } @@ -403,7 +516,7 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in float s_val; const uint8_t * k_ptr = k_base + ic * size_k_row_padded; - if (q->type == HTP_TYPE_F32) { + if (is_q_fp32) { hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale); } else { hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale); diff --git a/ggml/src/ggml-hexagon/htp/hvx-dump.h b/ggml/src/ggml-hexagon/htp/hvx-dump.h index e882227893..85201fc345 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-dump.h +++ b/ggml/src/ggml-hexagon/htp/hvx-dump.h @@ -28,19 +28,16 @@ static void hvx_vec_dump_f16(char * pref, HVX_Vector v) { } static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) { - union { - HVX_Vector v; - float d[32]; - } u = { .v = v }; + HVX_VectorAlias u = { .v = v }; const uint32_t n0 = n / 16; const uint32_t n1 = n % 16; int i = 0; for (; i < n0; i++) { - hex_dump_f32_line(pref, u.d + (16 * i), 16); + hex_dump_f32_line(pref, u.fp32 + (16 * i), 16); } if (n1) { - hex_dump_f32_line(pref, u.d + (16 * i), n1); + hex_dump_f32_line(pref, u.fp32 + (16 * i), n1); } } diff --git a/ggml/src/ggml-hexagon/htp/hvx-reduce.h b/ggml/src/ggml-hexagon/htp/hvx-reduce.h index 8845fe73ea..1ca7c05d98 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-reduce.h +++ b/ggml/src/ggml-hexagon/htp/hvx-reduce.h @@ -44,6 +44,45 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) { return hvx_vec_reduce_sum_n_qf32(in, 32); } +#if __HVX_ARCH__ > 75 + +static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) { + HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4); + HVX_Vector sum_sf = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump)); + + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2)); + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4)); + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8)); + sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 16)); + return sum_sf; +} + +static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) { + unsigned int total = n * 4; // total vec nbytes + unsigned int width = 4; // fp32 nbytes + + HVX_Vector sum = in, sum_t; + while (width < total) { + sum_t = Q6_V_vror_VR(sum, width); // rotate right + sum = Q6_Vsf_vadd_VsfVsf(sum, sum_t); // elementwise sum + width = width << 1; + } + return sum; +} + +#else + +static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) { + HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4); + HVX_Vector sum_qf = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump)); + + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2)); + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4)); + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8)); + sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 16)); + return Q6_Vsf_equals_Vqf32(sum_qf); +} + static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) { unsigned int total = n * 4; // total vec nbytes unsigned int width = 4; // fp32 nbytes @@ -57,6 +96,8 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) return sum; } +#endif + static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) { return hvx_vec_reduce_sum_n_f32(in, 32); } diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 1603ff2b3b..d251eeed33 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -11,6 +11,7 @@ #include "hex-dma.h" #include "hvx-utils.h" +#include "hvx-dump.h" #define GGML_COMMON_DECL_C #include "ggml-common.h" @@ -320,7 +321,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales - // Row sum (qf32) + // Row sum (sf) HVX_Vector r0_sum = Q6_V_vsplat_R(0); // Multiply and accumulate into int32. @@ -344,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -362,14 +363,14 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(r0_sum); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -402,7 +403,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales - // Row sum (qf32) + // Row sum (sf) HVX_Vector r0_sum = Q6_V_vsplat_R(0); HVX_Vector r1_sum = Q6_V_vsplat_R(0); @@ -432,8 +433,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -456,20 +457,18 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - // Convert into fp32 and reduce - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); - HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - - hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(&s[0], 8, rsum); } static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -493,7 +492,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales - // Row sum (qf32) + // Row sum (sf) HVX_Vector r0_sum = Q6_V_vsplat_R(0); // Multiply and accumulate into int32. @@ -517,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -535,14 +534,14 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(r0_sum); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -605,8 +604,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -629,20 +628,18 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - // Convert into fp32 and reduce - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); - HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - - hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(&s[0], 8, rsum); } static void vec_dot_mxfp4x4x2_q8x4x2(const int n, @@ -669,7 +666,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales - // Row sum (qf32) + // Row sum (sf) HVX_Vector r0_sum = Q6_V_vsplat_R(0); // Multiply and accumulate into int32. @@ -708,7 +705,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } // Process leftovers @@ -741,14 +738,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, // Zero-out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); } - // Reduce and convert into fp32 - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); + r0_sum = hvx_vec_reduce_sum_f32(r0_sum); hvx_vec_store_u(&s[0], 4, r0_sum); } @@ -781,13 +778,13 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, const uint8_t * restrict y_q = ((const uint8_t *) vy + 0); // quants first const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size); // then scales - // Row sum (qf32) + // Row sum (sf) HVX_Vector r0_sum = Q6_V_vsplat_R(0); HVX_Vector r1_sum = Q6_V_vsplat_R(0); // Multiply and accumulate into int32. // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). + // Apply scale to acc and accumulate into the row sum (f32). const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) @@ -829,8 +826,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } // Process leftovers @@ -867,24 +864,22 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - // Zero-out unused scales + // Zero-out unused values HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); r0_dd = Q6_V_vand_QV(bmask, r0_dd); r1_dd = Q6_V_vand_QV(bmask, r1_dd); + r0_ia = Q6_V_vand_QV(bmask, r0_ia); + r1_ia = Q6_V_vand_QV(bmask, r1_ia); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); - r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); + r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); } - // Convert into fp32 and reduce - r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum)); - HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); - - hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); + hvx_vec_store_u(&s[0], 8, rsum); } static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -913,7 +908,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); + rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -957,11 +952,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n, rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf))); } - rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0)); - rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1)); - HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4); - - hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); + HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1)); + hvx_vec_store_u(&s[0], 8, rsum); } static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { @@ -990,7 +982,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); + rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } @@ -1042,7 +1034,8 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf))); } - rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum)); + // Convert into fp32 and reduce + rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum)); hvx_vec_store_u(&s[0], 4, rsum); } diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index 1b6b2eba4a..e91a16d947 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -154,8 +154,8 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src, v_pad[i] = v3; } - v = hvx_vec_reduce_sum_qf32(sum_vec); - sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v)); + v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec)); + sum_vec = hvx_vec_repl4(v); HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v); HVX_Vector v4 = hvx_vec_inverse_f32(sum_vec); diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index be8be8c4e6..1a27cb6e63 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -57,8 +57,8 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2); } - HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v); - sum_v = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum)); + HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); + sum_v = hvx_vec_repl4(reduced_sum); HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); diff --git a/ggml/src/ggml-hexagon/libdl.h b/ggml/src/ggml-hexagon/libdl.h new file mode 100644 index 0000000000..8ca5016f03 --- /dev/null +++ b/ggml/src/ggml-hexagon/libdl.h @@ -0,0 +1,79 @@ +#pragma once + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +# include +#else +# include +# include +#endif +#include + +namespace fs = std::filesystem; + +#ifdef _WIN32 + +using dl_handle = std::remove_pointer_t; + +struct dl_handle_deleter { + void operator()(HMODULE handle) { + FreeLibrary(handle); + } +}; + +static inline dl_handle * dl_load_library(const fs::path & path) { + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryW(path.wstring().c_str()); + + SetErrorMode(old_mode); + + return handle; +} + +static inline void * dl_get_sym(dl_handle * handle, const char * name) { + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + void * p = (void *) GetProcAddress(handle, name); + + SetErrorMode(old_mode); + + return p; +} + +static inline const char * dl_error() { + return ""; +} + +#else + +using dl_handle = void; + +struct dl_handle_deleter { + void operator()(void * handle) { + dlclose(handle); + } +}; + +static inline dl_handle * dl_load_library(const fs::path & path) { + dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + return handle; +} + +static inline void * dl_get_sym(dl_handle * handle, const char * name) { + return dlsym(handle, name); +} + +static inline const char * dl_error() { + const char *rslt = dlerror(); + return rslt != nullptr ? rslt : ""; +} + +#endif diff --git a/ggml/src/ggml-hexagon/libggml-htp.inf b/ggml/src/ggml-hexagon/libggml-htp.inf new file mode 100644 index 0000000000..656d2d9ab2 --- /dev/null +++ b/ggml/src/ggml-hexagon/libggml-htp.inf @@ -0,0 +1,38 @@ +[Version] +Signature = "$WINDOWS NT$" +Class = ComputeAccelerator +ClassGuid = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C} +Provider = %GGML% +DriverVer = 01/01/2026,1.0.0.0 +CatalogFile = libggml-htp.cat +PnpLockDown = 1 + +[DestinationDirs] +Drivers_Dir = 6 + +[SourceDisksNames] +1 = %DiskId% + +[SourceDisksFiles] +libggml-htp-v68.so = 1 +libggml-htp-v69.so = 1 +libggml-htp-v73.so = 1 +libggml-htp-v75.so = 1 +libggml-htp-v81.so = 1 + +[ControlFlags] +ExcludeFromSelect = * + +[DefaultInstall.NTarm64] +CopyFiles=Drivers_Dir + +[Drivers_Dir] +libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE +libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE + +[Strings] +GGML = 'GGML' +DiskId = 'GGML HTP library' diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index 0259474b6e..fa5fadd112 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS mul_mm_f32_f32_l4_lm mul_mm_f16_f32_l4_lm mul_mm_q8_0_f32_l4_lm + mul_mm_q8_0_f32_8x4 + gemv_noshuffle_general_q8_0_f32 mul norm relu diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 678e40965a..4850c11d14 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { return ADRENO_GPU_GEN::A7X; } - if (strstr(device_name, "830")) { + if (strstr(device_name, "830") || + strstr(device_name, "840")) { return ADRENO_GPU_GEN::A8X; } @@ -529,7 +530,7 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; - cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0; + cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; cl_kernel kernel_convert_block_q4_0_noshuffle; cl_kernel kernel_restore_block_q4_0_noshuffle; @@ -696,6 +697,8 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; + cl_kernel kernel_mul_mm_q8_0_f32_8x4; + cl_kernel CL_mul_mat_vec_q8_0_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS void free() { @@ -894,6 +897,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err)); GGML_LOG_CONT("."); @@ -2290,6 +2294,46 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // mul_mm_q8_0_f32_8x4 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_q8_8x4_gemm { + #include "mul_mm_q8_0_f32_8x4.cl.h" + }; +#else + const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl"); +#endif + backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err)); + GGML_LOG_CONT("."); + } + + // gemv_noshuffle_general_q8_0_f32 + { + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable " + " -DSIMDGROUP_WIDTH=" + + std::to_string(backend_ctx->adreno_wave_size); + if (backend_ctx->has_vector_subgroup_broadcast) { + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; + } + +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src_CL_gemv_general { + #include "gemv_noshuffle_general_q8_0_f32.cl.h" + }; +#else + const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl"); +#endif + + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); + + CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std + " -cl-mad-enable " " -cl-fast-relaxed-math"; @@ -3745,6 +3789,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0); } +inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) { + + bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor); + + size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; + + return ((elem_num < 128 * 1024 * 1024) && adreno_kernel); // max element num: 2**27 +} + static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); @@ -4159,6 +4212,130 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, tensor->extra = extra; + // Transpose the weights and scales +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (enable_adreno_trans_weight(backend_ctx, tensor)) { + + int M = tensor->ne[1]; // ne01 + int K = tensor->ne[0]; // ne00 + + GGML_ASSERT(K % 32 == 0); + GGML_ASSERT(M % 4 == 0); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); + + // Transpose weights + size_t q_size_bytes = K * M / 4 * sizeof(float); + cl_buffer_region region; + region.origin = 0; + region.size = q_size_bytes; + cl_mem qT_d = clCreateSubBuffer( + backend_ctx->prealloc_quant_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &err); + CL_CHECK(err); + + cl_mem q_d_image1D; + cl_mem qT_d_image1D; + + cl_image_format img_fmt_1d; + cl_image_desc img_desc_1d; + + img_fmt_1d = { CL_RGBA, CL_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4 / 4; + img_desc_1d.buffer = extra->q; + q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + img_fmt_1d = { CL_RGBA, CL_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4 / 4; + img_desc_1d.buffer = qT_d; + qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + int height_q = M / 4; + int width_q = K / 4 / 4; + kernel = backend_ctx->kernel_transpose_32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q)); + + size_t local_size_q[3] = {4, 16, 1}; + size_t global_size_q[3] = {static_cast(width_q), static_cast(height_q), 1}; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + // Transpose scales + size_t d_size_bytes = M * (K / 32) * 2; + region.origin = 0; + region.size = d_size_bytes; + cl_mem dT_d = clCreateSubBuffer( + backend_ctx->prealloc_scales_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &err); + CL_CHECK(err); + + cl_mem d_d_image1D; + cl_mem dT_d_image1D; + + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_fmt_1d = { CL_R, CL_HALF_FLOAT }; + img_desc_1d.image_width = M * K / 32; + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.buffer = extra->d; + d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 32 / 4; + img_desc_1d.buffer = dT_d; + dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); + CL_CHECK(err); + + int height_s = M / 4; + int width_s = K / 32; + + kernel = backend_ctx->kernel_transpose_16_4x1; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s)); + + size_t local_size_s[3] = {4, 16, 1}; + size_t global_size_s[3] = {static_cast(width_s), static_cast(height_s), 1}; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + // copy transposed buffer contents to original buffers + CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clReleaseMemObject(qT_d)); + CL_CHECK(clReleaseMemObject(dT_d)); + + CL_CHECK(clReleaseMemObject(q_d_image1D)); + CL_CHECK(clReleaseMemObject(d_d_image1D)); + CL_CHECK(clReleaseMemObject(qT_d_image1D)); + CL_CHECK(clReleaseMemObject(dT_d_image1D)); + } // end transpose +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + return; } if (tensor->type == GGML_TYPE_Q6_K) { @@ -4448,6 +4625,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, ggml_nbytes(tensor), NULL, &err); CL_CHECK(err); +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (enable_adreno_trans_weight(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + GGML_ASSERT(tensor->ne[2] == 1); // ??? + GGML_ASSERT(tensor->ne[3] == 1); // ??? + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01)); + + size_t global_work_size[3] = {static_cast(((ne01 + 63) / 64) * 64), 1, 1}; + size_t local_work_size[3] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } +#endif cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); @@ -7947,6 +8154,252 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten CL_CHECK(clReleaseMemObject(D_sub_buffer)); } +static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + const enum ggml_type src0t = src0->type; + const enum ggml_type src1t = src1->type; + + GGML_ASSERT(src0t == GGML_TYPE_Q8_0); + GGML_ASSERT(src1t == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + + ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + + GGML_ASSERT(src1->view_offs == 0); + GGML_ASSERT(dst->view_offs == 0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + + const int ne10 = src1->ne[0]; + const int ne12 = src1->ne[2]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 == ne10); + GGML_ASSERT((ne00 % 32) == 0); + GGML_ASSERT(ne0 == ne01); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + // init CL objects + cl_int status; + cl_image_format img_fmt_1d; + cl_image_desc img_desc_1d; + cl_buffer_region region; + cl_mem A_image1d; + cl_mem B_image1d; + cl_mem B_sub_buffer; + cl_mem S_image1d; + + cl_mem D_image1d; + cl_mem D_sub_buffer; + + int M = ne01; + int N = ne1; + int K = ne00; + + // create an image for A + img_fmt_1d = { CL_R, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float + img_desc_1d.buffer = extra0_q8_0->q; + A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // create an image for Scale + img_fmt_1d = { CL_R, CL_HALF_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * K / 32; // Block size is 32 + img_desc_1d.buffer = extra0_q8_0->d; + S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // create a sub_buffer for B + region.origin = (extra1->offset); // + src1->view_offs); + region.size = K * N * sizeof(float); + B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create an image for B from sub_buffer: RGBA (OCL) + img_fmt_1d = {CL_RGBA, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = K * N / 4; + img_desc_1d.buffer = B_sub_buffer; + B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + // Create subbuffer and image1d_buffer for dst + region.origin = (extrad->offset); // + dst->view_offs; + region.size = M * N * sizeof(float); + D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + img_fmt_1d = {CL_R, CL_FLOAT}; + memset(&img_desc_1d, 0, sizeof(img_desc_1d)); + img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc_1d.image_width = M * N; + img_desc_1d.buffer = D_sub_buffer; + D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status); + CL_CHECK(status); + + size_t local_work_size[3] = {1, 1, 1}; + size_t global_work_size[3] = {1, 1, 1}; + + if (N == 1) { + kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32; + + int r2 = 1; + int r3 = 1; + cl_uint k_arg = 0; + + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3)); + + size_t wavesize = backend_ctx->adreno_wave_size; + local_work_size[0] = wavesize; + local_work_size[1] = 4; // reduce factor + local_work_size[2] = 1; + + global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize; + global_work_size[1] = 4; // reduce factor + global_work_size[2] = 1; + } else { + cl_ulong offsetd = extrad->offset + dst->view_offs; + cl_mem B_image1d_trans = nullptr; + // for B transpose + cl_mem B_d = nullptr; + int padding; + + //how many extra elements beyond multiple of 8 + int extra_elements = N % 8; + + //how much padding to add + padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // Specify the starting offset (in bytes) + region.origin = 0; + // Specify the size of the sub-buffer (divide by 2 for FP16) + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + B_d = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + + cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16) + cl_image_desc image_desc_B_d_output = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(K * (N + padding)/4), + 0, 0, 0, 0, 0, 0, 0, { B_d } + }; + B_image1d_trans = clCreateImage( + context, + 0, + &image_format_B_d_output, + &image_desc_B_d_output, + NULL, + &status); + CL_CHECK(status); + + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_size_t[2] = { 1, 16 }; + size_t global_size_t[2] = { + static_cast(width_B), + static_cast(padded_height_B) + }; + + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); + + kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4; + + int N_with_padding = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd)); + + global_work_size[0] = (size_t)(N + 7) / 8; + global_work_size[1] = (size_t)(M + 3) / 4; + global_work_size[2] = 1; + + local_work_size[0] = 2; + local_work_size[1] = 128; + local_work_size[2] = 1; + } + + // enqueue kernel with profiling + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(A_image1d)); + CL_CHECK(clReleaseMemObject(B_sub_buffer)); + CL_CHECK(clReleaseMemObject(B_image1d)); + CL_CHECK(clReleaseMemObject(S_image1d)); + CL_CHECK(clReleaseMemObject(D_sub_buffer)); + CL_CHECK(clReleaseMemObject(D_image1d)); +#else + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -8064,6 +8517,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int padding; // <--------------------------------------------> // + // q8_0 x fp32 + if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 && + enable_adreno_trans_weight(backend_ctx, src0)) { + ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst); + return; + } + // q4_0 x fp32 if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { // TODO: remove duplicate definitions of image description + format -- move to top diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index adf576a839..9fb434713d 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0( } } +kernel void kernel_restore_block_q8_0_trans( + global uchar * src_q, + global half * src_d, + global block_q8_0 * dst, + uint ne00, + uint ne01 +){ + uint num_blk_per_row = ne00 / QK8_0; + + global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row; + global uchar * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed + global half * d = (global half *) src_d + get_global_id(0); + + for (uint blk = 0; blk < num_blk_per_row; blk++) { + b->d = *d; + + for (uint i = 0; i < QK8_0; i+=4) { + b->qs[i] = q[0]; + b->qs[i+1] = q[1]; + b->qs[i+2] = q[2]; + b->qs[i+3] = q[3]; + + q += 4 * ne01; // M stride + } + + d += ne01; + + b++; + } +} + //------------------------------------------------------------------------------ // kernel_convert_block_q6_K // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA). diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl new file mode 100644 index 0000000000..f944ef3a99 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl @@ -0,0 +1,195 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#endif + +#define QK8_0 32 +#define N_SIMDGROUP 4 + +#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \ + float shared_y; \ + char elem; \ + \ + shared_y = sub_group_broadcast(y.s0, 0); \ + elem = (char)(bits8.s0 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 0); \ + elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 0); \ + elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 0); \ + elem = (char)((bits8.s0 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 0); \ + elem = (char)(bits8.s1 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 0); \ + elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 0); \ + elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 0); \ + elem = (char)((bits8.s1 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 1); \ + elem = (char)(bits8.s2 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 1); \ + elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 1); \ + elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 1); \ + elem = (char)((bits8.s2 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 1); \ + elem = (char)(bits8.s3 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 1); \ + elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 1); \ + elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 1); \ + elem = (char)((bits8.s3 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 2); \ + elem = (char)(bits8.s4 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 2); \ + elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 2); \ + elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 2); \ + elem = (char)((bits8.s4 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 2); \ + elem = (char)(bits8.s5 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 2); \ + elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 2); \ + elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 2); \ + elem = (char)((bits8.s5 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s0, 3); \ + elem = (char)(bits8.s6 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 3); \ + elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 3); \ + elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 3); \ + elem = (char)((bits8.s6 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + \ + shared_y = sub_group_broadcast(y.s4, 3); \ + elem = (char)(bits8.s7 & 0x000000FF); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 3); \ + elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 3); \ + elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \ + total_sums += convert_int(elem) * scale * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 3); \ + elem = (char)((bits8.s7 & 0xFF000000) >> 24); \ + total_sums += convert_int(elem) * scale * shared_y; \ + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +__kernel void kernel_gemv_noshuffle( + __read_only image1d_buffer_t src0_q, // quantized A + global half * src0_d, // A scales + __read_only image1d_buffer_t src1, // B + ulong offset1, // offset to B (0) + global float * dst, // C + ulong offsetd, // offset to C + int ne00, // K + int ne01, // M + int ne02, // 1 + int ne10, // K + int ne12, // 1 + int ne0, // M + int ne1, // N + int r2, // 1 + int r3) +{ + uint groupId = get_local_id(1); + uint gid = get_global_id(0); + ushort slid = get_sub_group_local_id(); + + uint K = ne00; + uint M = ne01; + + uint LINE_STRIDE_A = M; + uint BLOCK_STRIDE_A = 8 * M; // 32 / 4 = 8 + + __private uint8 regA; + __private half regS; + __private float8 regB; + + __private float totalSum = (float)(0.0f); + + // loop along K in block granularity, skip 4 blocks every iter + #pragma unroll 1 /* tell compiler not to unroll */ + for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) { + regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of one rows + // first 4 fibers in each wave load 8 B values to its private scope + if (slid < 4) { + regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); + regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); + } + + // load weights for one block in consecutive rows + regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; + regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; + regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; + regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; + regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; + regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; + regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; + regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; + + dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB); + } + + // reduction in local memory, assumes #wave=4 + __local float reduceLM[SIMDGROUP_WIDTH * 3]; + if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum; + if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum; + if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum; + barrier(CLK_LOCAL_MEM_FENCE); + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 outputs per fiber in wave 0 + if (groupId == 0) { + dst = (global float*)((global char*)dst + offsetd); + dst[gid] = totalSum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl new file mode 100644 index 0000000000..51ce2121ce --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl @@ -0,0 +1,129 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_128 +#endif + +kernel void kernel_mul_mm_q8_0_f32_8x4( + global const uint * src0_q, + global const half * src0_d, + __read_only image1d_buffer_t src1, + global float * dst, + int k, + int m, + int n, + int n_no_padding, + ulong offsetd +) { + + int m_4 = m >> 2; + int n_4 = n >> 2; + + int gy = get_global_id(0); + int gx = get_global_id(1); + int gx_2 = gx << 2; + dst = (global float *)((global char*)dst + offsetd); + + + half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + half8 B; + half4 deq; + + __global const uint* wptr = src0_q + gx_2; + __global const half* sptr = src0_d + gx_2; + + for (int i = 0; i < k; i += 4) { + uint4 pack4 = vload4(0, wptr + (i / 4) * m); + half4 scale = vload4(0, sptr + (i / 32) * m); + + char4 p0 = as_char4(pack4.s0); + char4 p1 = as_char4(pack4.s1); + char4 p2 = as_char4(pack4.s2); + char4 p3 = as_char4(pack4.s3); + + // ------------------- j = 0 (k = i+0) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1); + + half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale; + + c0 += B * wj0.s0; + c1 += B * wj0.s1; + c2 += B * wj0.s2; + c3 += B * wj0.s3; + + // ------------------- j = 1 (k = i+1) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1); + + half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale; + + c0 += B * wj1.s0; + c1 += B * wj1.s1; + c2 += B * wj1.s2; + c3 += B * wj1.s3; + + // ------------------- j = 2 (k = i+2) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1); + + half4 wj2 = convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale; + + c0 += B * wj2.s0; + c1 += B * wj2.s1; + c2 += B * wj2.s2; + c3 += B * wj2.s3; + + // ------------------- j = 3 (k = i+3) ------------------- + B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4); + B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1); + + half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale; + + c0 += B * wj3.s0; + c1 += B * wj3.s1; + c2 += B * wj3.s2; + c3 += B * wj3.s3; + } + + int idx = (gy << 3) * m + (gx << 2); + + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); + } +} diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 8d83b2446b..651b875b63 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) { return sycl::log(x); } +template +static __dpct_inline__ T op_softplus(T x) { + const float xf = (float) x; + const float ax = sycl::fabs(xf); + const float m = sycl::fmax(xf, 0.0f); + const float y = m + sycl::log1p(sycl::exp(-ax)); + return (T) y; +} + template static __dpct_inline__ T op_neg(T x) { return -x; @@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor }); } +static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_softplus(x); + }); +} + static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { return op_neg(x); @@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_log(ctx, dst); } +void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_softplus(ctx, dst); +} + void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_neg(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 0913a2e529..7c71974687 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3a4c092af5..74b4ed91cc 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2263,6 +2263,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); } +static void tri_f32_sycl( + const float * src, + float * dst, + const int64_t ne0, + const int64_t ne1, + const int64_t ne2, + const int64_t ne3, + const ggml_tri_type ttype, + dpct::queue_ptr main_stream +) { + const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3; + + main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) { + const int64_t idx = (int64_t) tid[0]; + + const int64_t i0 = idx % ne0; + const int64_t t1 = idx / ne0; + const int64_t i1 = t1 % ne1; + + bool keep = false; + switch (ttype) { + case GGML_TRI_TYPE_LOWER: keep = (i0 < i1); break; + case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break; + case GGML_TRI_TYPE_UPPER: keep = (i0 > i1); break; + case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break; + default: keep = false; break; + } + + dst[idx] = keep ? src[idx] : 0.0f; + }); +} + +static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(src0); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + const float * src0_dd = static_cast(src0->data); + float * dst_dd = static_cast(dst->data); + + const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0); + + const int64_t ne0 = src0->ne[0]; + const int64_t ne1 = src0->ne[1]; + const int64_t ne2 = src0->ne[2]; + const int64_t ne3 = src0->ne[3]; + + tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream); +} + + inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -3786,6 +3845,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_sycl_exp(ctx, dst); break; + case GGML_UNARY_OP_SOFTPLUS: + ggml_sycl_softplus(ctx, dst); + break; case GGML_UNARY_OP_SGN: ggml_sycl_sgn(ctx, dst); break; @@ -3912,6 +3974,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_TRANSPOSE: GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__); break; + case GGML_OP_TRI: + ggml_sycl_op_tri(ctx, dst); + break; case GGML_OP_DIAG_MASK_INF: ggml_sycl_diag_mask_inf(ctx, dst); break; @@ -4404,6 +4469,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_SOFTPLUS: case GGML_UNARY_OP_ELU: return true; case GGML_UNARY_OP_FLOOR: @@ -4616,6 +4682,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; + case GGML_OP_TRI: + { + const ggml_tensor * src0 = op->src[0]; + return src0 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(src0); + } case GGML_OP_DIAG_MASK_INF: return true; case GGML_OP_SOFT_MAX: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 3852867c29..a99375c088 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -11956,7 +11956,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, } } if (mmq) { - ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it); + vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it); } ggml_pipeline_allocate_descriptor_sets(ctx); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl index de7c132a62..b682216146 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl @@ -114,7 +114,7 @@ struct Params { #define PARAMS_BINDING 4 #endif -@group(0) @binding(DST_BINDING) var dst: array; +@group(0) @binding(DST_BINDING) var dst: array>; @group(0) @binding(PARAMS_BINDING) var params: Params; // Just a very small float value. @@ -160,14 +160,21 @@ fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 { return v; } +fn load_f32x4(buf: ptr>, read_write>, scalar_index: u32) -> vec4 { + return (*buf)[scalar_index >> 2u]; +} + +fn load_kvx4(buf: ptr>, read_write>, scalar_index: u32) -> vec4 { + return (*buf)[scalar_index >> 2u]; +} @compute @workgroup_size(WG_SIZE) fn main(@builtin(workgroup_id) wg_id: vec3, - @builtin(local_invocation_id) local_id: vec3, - @builtin(subgroup_id) subgroup_id: u32, - @builtin(subgroup_size) subgroup_size: u32, - @builtin(num_subgroups) num_subgroups: u32, - @builtin(subgroup_invocation_id) sg_inv_id: u32) { + @builtin(local_invocation_id) local_id: vec3, + @builtin(subgroup_id) subgroup_id: u32, + @builtin(subgroup_size) subgroup_size: u32, + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_invocation_id) sg_inv_id: u32) { // initialize row max for online softmax for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) { @@ -231,9 +238,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3, for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) { // clear inter_shmem to ensure zero-initialized accumulators - for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { - inter_shmem[elem_idx] = 0.0; - } + for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { + inter_shmem[elem_idx] = 0.0; + } // load k tile into shared memory #if defined(KV_Q4_0) @@ -309,48 +316,77 @@ fn main(@builtin(workgroup_id) wg_id: vec3, // accumulate q block * k block into registers across the entire KV tile // TODO: this loop seems to be the current largest bottleneck - for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) { - let inter_offset = kv_block * SG_MAT_N; - var acc: subgroup_matrix_result = subgroupMatrixLoad< - subgroup_matrix_result>(&inter_shmem, inter_offset, false, KV_TILE); + // this bracket exists to scope the lifetime of variables, reducing register pressure + { #ifdef KV_DIRECT - let k_block_row = kv_tile + kv_block * SG_MAT_N; - let k_global_offset = k_head_offset + k_block_row * params.stride_k1; + let k_block_row = kv_tile + subgroup_id * SG_MAT_N; + var k_global_offset = k_head_offset + k_block_row * params.stride_k1; #else - let k_block_offset = kv_block * SG_MAT_N * HEAD_DIM_QK; + var k_block_offset = subgroup_id * SG_MAT_N * HEAD_DIM_QK; #endif - for (var head_dim_block = 0u; head_dim_block < HEAD_DIM_QK; head_dim_block += SG_MAT_K) { - // load q submatrix from shared memory - var q_sg_mat: subgroup_matrix_left = subgroupMatrixLoad>( - &q_shmem, - head_dim_block, - false, - HEAD_DIM_QK - ); + for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) { + let inter_offset = kv_block * SG_MAT_N; + var acc: subgroup_matrix_result = subgroupMatrixLoad>(&inter_shmem, inter_offset, false, KV_TILE); + + var q_cur = subgroupMatrixLoad>(&q_shmem, 0u, false, HEAD_DIM_QK); - // load k submatrix from device or shared memory #ifdef KV_DIRECT - var k_sg_mat: subgroup_matrix_right = subgroupMatrixLoad>( - &K, - k_global_offset + head_dim_block, - true, - params.stride_k1 - ); + var k_cur = subgroupMatrixLoad>(&K, k_global_offset + 0u, true, params.stride_k1); #else - var k_sg_mat: subgroup_matrix_right = subgroupMatrixLoad>( - &kv_shmem, - k_block_offset + head_dim_block, - true, - HEAD_DIM_QK - ); + var k_cur = subgroupMatrixLoad>(&kv_shmem, k_block_offset + 0u, true, HEAD_DIM_QK); #endif - acc = subgroupMatrixMultiplyAccumulate(q_sg_mat, k_sg_mat, acc); + + var t: u32 = 1u; + for (; t + 1u < HEAD_DIM_QK / SG_MAT_K; t += 2u) { + let h0 = t * SG_MAT_K; + var q0 = subgroupMatrixLoad>(&q_shmem, h0, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var k0 = subgroupMatrixLoad>(&K, k_global_offset + h0, true, params.stride_k1); +#else + var k0 = subgroupMatrixLoad>(&kv_shmem, k_block_offset + h0, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = q0; + k_cur = k0; + + let h1 = (t + 1u) * SG_MAT_K; + var q1g = subgroupMatrixLoad>(&q_shmem, h1, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var k1g = subgroupMatrixLoad>(&K, k_global_offset + h1, true, params.stride_k1); +#else + var k1g = subgroupMatrixLoad>(&kv_shmem, k_block_offset + h1, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = q1g; + k_cur = k1g; + } + + // handle odd tail + if (t < HEAD_DIM_QK / SG_MAT_K) { + let h = t * SG_MAT_K; + var qn = subgroupMatrixLoad>(&q_shmem, h, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var kn = subgroupMatrixLoad>(&K, k_global_offset + h, true, params.stride_k1); +#else + var kn = subgroupMatrixLoad>(&kv_shmem, k_block_offset + h, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = qn; + k_cur = kn; + } + + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + +#ifdef KV_DIRECT + k_global_offset += num_subgroups * SG_MAT_N * params.stride_k1; +#else + k_block_offset += num_subgroups * SG_MAT_N * HEAD_DIM_QK; +#endif + subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE); } - - // store acc to shared memory for softmax (S matrix from paper) - subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE); } + #ifdef MASK // load mask tile into shared memory for this KV block // TODO: optimize and skip if mask is -INF for the entire tile @@ -495,7 +531,6 @@ fn main(@builtin(workgroup_id) wg_id: vec3, false, HEAD_DIM_V ); - for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) { let p_offset = kv_block * SG_MAT_N; var p_sg_mat: subgroup_matrix_left = subgroupMatrixLoad>( @@ -527,11 +562,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3, // O += P * V o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat); } - // store O back to shared memory subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V); } - workgroupBarrier(); } @@ -566,26 +599,38 @@ fn main(@builtin(workgroup_id) wg_id: vec3, o_shmem[idx] = f16(val); } } - workgroupBarrier(); #endif - - // write output back to global memory for (var q_tile_row = subgroup_id; - q_tile_row < Q_TILE; - q_tile_row += num_subgroups) { - let global_q_row = q_row_start + q_tile_row; - if (global_q_row >= params.seq_len_q) { - break; - } + q_tile_row < Q_TILE; + q_tile_row += num_subgroups) { - let exp_sum = exp_sum_shmem[q_tile_row]; - let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0); + let global_q_row = q_row_start + q_tile_row; + if (global_q_row >= params.seq_len_q) { break; } - for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { - let o_val = o_shmem[q_tile_row * HEAD_DIM_V + elem_idx]; - let scaled = f32(o_val) * scale; - dst[dst_global_offset + q_tile_row * dst2_stride + elem_idx] = scaled; - } + let exp_sum = exp_sum_shmem[q_tile_row]; + let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0); + + let row_base: u32 = dst_global_offset + q_tile_row * dst2_stride; + + for (var elem_base = sg_inv_id * 4u; + elem_base < HEAD_DIM_V; + elem_base += subgroup_size * 4u) { + + let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u); + let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u); + let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u); + let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u); + + let v = vec4( + f32(o_shmem[i0]) * scale, + f32(o_shmem[i1]) * scale, + f32(o_shmem[i2]) * scale, + f32(o_shmem[i3]) * scale + ); + + let dst_vec_index: u32 = (row_base + elem_base) >> 2u; + dst[dst_vec_index] = v; + } } } diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 new file mode 100644 index 0000000000..21fd063ebe --- /dev/null +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -0,0 +1,40 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="Llama-3.2-3B-Instruct-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-bench.exe" ` + --mmap 0 -m $basedir\..\..\gguf\$model ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --batch-size 128 -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 new file mode 100644 index 0000000000..b13161aa63 --- /dev/null +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -0,0 +1,53 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +$cli_opts=$args + +$model="Llama-3.2-3B-Instruct-Q4_0.gguf" +if ($null -ne $env:M) { + $model=$env:M +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\llama-completion.exe" ` + --no-mmap -no-cnv -m $basedir\..\..\gguf\$model ` + --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 ` + --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on ` + -ngl 99 --device $device $cli_opts diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1 new file mode 100644 index 0000000000..70094af9bc --- /dev/null +++ b/scripts/snapdragon/windows/run-tool.ps1 @@ -0,0 +1,56 @@ + +#!/usr/bin/env pwsh + +# Basedir on device +$basedir=".\pkg-snapdragon" + +if ($args.Count -eq 0) { + Write-Host "No arguments provided.Expected the tool and argument to run." + exit -1 +} + +$tool=$args[0] +$cli_opts=@() + +if ($args.Count -gt 1) { + $cli_opts=$args[1..($args.Count - 1)] + $remainingArgs = $args[1..($args.Count - 1)] +} + +$device="HTP0" +if ($null -ne $env:D) { + $device=$env:D +} + +if ($null -ne $env:V) { + $env:GGML_HEXAGON_VERBOSE=$env:V +} + +if ($null -ne $env:E) { + $env:GGML_HEXAGON_EXPERIMENTAL=$env:E +} + +if ($null -ne $env:SCHED) { + $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v" +} + +if ($null -ne $env:PROF) { + $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 +} + +if ($null -ne $env:OPMASK) { + $env:GGML_HEXAGON_OPMASK=$env:OPMASK +} + +if ($null -ne $env:NHVX) { + $env:GGML_HEXAGON_NHVX=$env:NHVX +} + +if ($null -ne $env:NDEV) { + $env:GGML_HEXAGON_NDEV=$env:NDEV +} + +$env:ADSP_LIBRARY_PATH="$basedir\lib" + +& "$basedir\bin\$tool" ` + $cli_opts diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1 new file mode 100644 index 0000000000..0f3244cc9d --- /dev/null +++ b/scripts/snapdragon/windows/setup-build.ps1 @@ -0,0 +1,105 @@ +# Requires Run as Administrator is NOT strictly necessary for User-scope env vars, +# but recommended for creating directories in C:\ root if permissions are restricted. + +$ErrorActionPreference = "Stop" + +# --- Configuration --- +$BaseDir = "C:\Qualcomm" + +# SDK 1: Hexagon +$HexagonUrl = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz" +$HexagonParent = Join-Path $BaseDir "Hexagon_SDK" +$HexagonSdkVersion = "6.4.0.2" +$HexagonToolsVersion = "19.0.04" +$HexagonSdkTarget = Join-Path $HexagonParent $HexagonSdkVersion +$HexagonToolsTarget = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion" + +# SDK 2: OpenCL +$OpenCLUrl = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz" +$OpenCLParent = Join-Path $BaseDir "OpenCL_SDK" +$OpenCLVersion = "2.3.2" +$OpenCLTarget = Join-Path $OpenCLParent $OpenCLVersion + +# --- Helper Function --- +function Install-QualcommSDK { + param ( + [string]$Url, + [string]$ParentDir, + [string]$TargetDir, + [string]$Name + ) + + # 1. Create Parent Directory + if (-not (Test-Path -Path $ParentDir)) { + Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan + New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null + } + + # 2. Check for Specific Version Directory + if (Test-Path -Path $TargetDir) { + Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green + } + else { + Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow + + # Create the target directory to extract into + New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null + + # Define temporary archive path + $TempFile = Join-Path $ParentDir "temp_sdk.tar.xz" + + try { + # Download + Write-Host "Downloading from: $Url" + Invoke-WebRequest -Uri $Url -OutFile $TempFile + + # Untar + # Note: We assume Windows includes tar.exe (Win 10 build 17063+) + Write-Host "Extracting archive to $TargetDir..." + + # We use -C to extract contents INTO the target directory created above + tar -xJvf $TempFile -C $TargetDir\.. + + Write-Host "Extraction complete." -ForegroundColor Green + } + catch { + Write-Error "Failed to download or extract $Name. Error: $_" + # Cleanup target dir if failed so script tries again next time + Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue + } + finally { + # Cleanup Archive + if (Test-Path $TempFile) { Remove-Item $TempFile -Force } + } + } +} + +# --- Execution --- + +# 1. Ensure Base C:\Qualcomm exists +if (-not (Test-Path $BaseDir)) { + New-Item -Path $BaseDir -ItemType Directory -Force | Out-Null +} + +# 2. Run Install Logic +Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK" +Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK" + +# --- Environment Variables --- + +Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan + +# Set OPENCL_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('OPENCL_SDK_ROOT', $OpenCLTarget, [System.EnvironmentVariableTarget]::User) +$env:OPENCL_SDK_ROOT = $OpenCLTarget # Set for current session as well +Write-Host "OPENCL_SDK_ROOT set to: $OpenCLTarget" + +# Set HEXAGON_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('HEXAGON_SDK_ROOT', $HexagonSdkTarget, [System.EnvironmentVariableTarget]::User) +$env:HEXAGON_SDK_ROOT = $HexagonSdkTarget # Set for current session as well +Write-Host "HEXAGON_SDK_ROOT set to: $HexagonSdkTarget" + +# Set HEXAGON_SDK_ROOT +[System.Environment]::SetEnvironmentVariable('HEXAGON_TOOLS_ROOT', $HexagonToolsTarget, [System.EnvironmentVariableTarget]::User) +$env:HEXAGON_TOOLS_ROOT = $HexagonToolsTarget # Set for current session as well +Write-Host "HEXAGON_TOOLS_ROOT set to: $HexagonToolsTarget" diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index c838276158..81e79a9470 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -ebc3a0f4a56be1c9424a89fbec09962ac34fde85 +a8db410a252c8c8f2d120c6f2e7133ebe032f35d diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index f3c9b49f30..c35cd6761b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1772,8 +1772,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t io.write(&v_trans, sizeof(v_trans)); io.write(&n_layer, sizeof(n_layer)); - std::vector tmp_buf; - // Iterate and write all the keys first, each row is a cell // Get whole range at a time for (const auto & layer : layers) { @@ -1791,7 +1789,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa); io.write(&k_size_row, sizeof(k_size_row)); - // Read each range of cells of k_size length each into tmp_buf and write out + // Read each range of cells of k_size length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * k_size_row; @@ -1818,7 +1816,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa); io.write(&v_size_row, sizeof(v_size_row)); - // Read each range of cells of v_size length each into tmp_buf and write out + // Read each range of cells of v_size length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * v_size_row; @@ -1852,7 +1850,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out + // Read each range of cells of v_size_el length and write out for (const auto & range : cr.data) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * kv_size) * v_size_el; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 812bf25304..f0038036dc 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); - std::vector tmp_buf; - - // Iterate and write all the keys first, each row is a cell + // Iterate and write all the R tensors first, each row is a cell // Get whole range at a time for (uint32_t il = 0; il < n_layer; ++il) { // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) if (r_l[il] == nullptr) continue; - // Write key type + // Write R tensor type const int32_t r_type_i = (int32_t)r_l[il]->type; io.write(&r_type_i, sizeof(r_type_i)); - // Write row size of key + // Write row size of R tensor const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r()); io.write(&r_size_row, sizeof(r_size_row)); - // Read each range of cells of k_size length each into tmp_buf and write out + // Write each range of cells of r_size_row length for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * r_size_row; @@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) if (s_l[il] == nullptr) continue; - // Write value type + // Write S tensor type const int32_t s_type_i = (int32_t)s_l[il]->type; io.write(&s_type_i, sizeof(s_type_i)); - // Write row size of value + // Write row size of S tensor const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); io.write(&s_size_row, sizeof(s_size_row)); - // Read each range of cells of s_size length each into tmp_buf and write out + // Write each range of S tensor rows for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t buf_size = range_size * s_size_row; @@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: } } } else { - // When v is transposed, we also need the element size and get the element ranges from each row + // When S tensor is transposed, we also need the element size and get the element ranges from each row const uint32_t mem_size = size; for (uint32_t il = 0; il < n_layer; ++il) { // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) @@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: const uint32_t n_embd_s = hparams.n_embd_s(); - // Write value type + // Write S tensor type const int32_t s_type_i = (int32_t)s_l[il]->type; io.write(&s_type_i, sizeof(s_type_i)); @@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: // For each row, we get the element values of each cell for (uint32_t j = 0; j < n_embd_s; ++j) { - // Read each range of cells of v_size_el length each into tmp_buf and write out + // Write each range of cells of s_size_el length for (const auto & range : cell_ranges) { const size_t range_size = range.second - range.first; const size_t src_offset = (range.first + j * mem_size) * s_size_el; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 58ba6e8b40..8ce1a13c68 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8233,11 +8233,13 @@ static std::vector> make_test_cases_eval() { if (!mask && max_bias > 0.0f) continue; for (float logit_softcap : {0.0f, 10.0f}) { if (hsk != 128 && logit_softcap != 0.0f) continue; - for (int nh : { 4, }) { + for (int nh : { 1, 4 }) { + if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash for (int nr3 : { 1, 3, }) { if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes - for (int nr2 : { 1, 4, 12 }) { + for (int nr2 : { 1, 4, 12, 20 }) { if (nr2 == 12 && hsk != 128) continue; + if (nr2 == 20 && (nh != 1 || hsk != 576)) continue; //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) { for (int kv : { 113, 512, 1024, }) { if (nr2 != 1 && kv != 512) continue; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9b076e0c56..9fa5afc390 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1005,6 +1005,8 @@ struct clip_model_loader { hparams.minicpmv_query_num = 64; } else if (hparams.minicpmv_version == 6) { hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 100045) { + hparams.minicpmv_query_num = 64; } else { hparams.minicpmv_query_num = 96; } @@ -3209,6 +3211,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } else if (params.minicpmv_version == 6) { // MiniCPM-V 4.5 n_patches = 64; + } else if (params.minicpmv_version == 100045) { + // MiniCPM-o 4.5 + n_patches = 64; } else { GGML_ABORT("Unknown minicpmv version"); } diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index bb2cc4e4ea..944037e703 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -501,7 +501,7 @@ default_image_mean = [0.5, 0.5, 0.5] default_image_std = [0.5, 0.5, 0.5] ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) -ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2) +ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2) # with proper args = ap.parse_args() @@ -610,6 +610,9 @@ else: elif minicpmv_version == 6: emb_dim = 4096 block_count = 27 + elif minicpmv_version == 100045: + emb_dim = 4096 + block_count = 27 default_vision_config = { "hidden_size": 1152, @@ -637,6 +640,10 @@ elif minicpmv_version == 6: default_vision_config["model_type"] = "siglip_vision_model" vision_config = SiglipVisionConfig(**default_vision_config) model = SiglipVisionTransformer(vision_config) +elif minicpmv_version == 100045: + default_vision_config["model_type"] = "siglip_vision_model" + vision_config = SiglipVisionConfig(**default_vision_config) + model = SiglipVisionTransformer(vision_config) processor = None # if model.attn_pool is not None: diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 32a24bfcea..d037e834f3 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -236,7 +236,7 @@ struct mtmd_context { tok_row_end_trail = false; // no trailing end-of-row token ov_img_first = true; - } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) { + } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) { // minicpmv 2.6 format: // (overview) (slice) (slice) \n ... slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 881f4b3dd9..0709e0bda0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); - printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -131,6 +131,8 @@ static void usage(const char * executable) { printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); + printf(" --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n"); + printf(" Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector & tensor_type) { + std::ifstream file(filename); + if (!file) { + printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno)); + return false; + } + + std::string arg; + while (file >> arg) { + if (!parse_tensor_type(arg.c_str(), tensor_type)) { + return false; + } + } + + return true; +} + static bool parse_layer_prune(const char * data, std::vector & prune_layers) { if (!data) { printf("\n%s: no layer pruning ids provided\n\n", __func__); @@ -480,6 +499,10 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) { + if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); @@ -686,3 +709,4 @@ int main(int argc, char ** argv) { return 0; } + diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 1ca4e3cc0e..7f9c3c566b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -155,7 +155,7 @@ struct server_slot { double t_prompt_processing; // ms double t_token_generation; // ms - std::function callback_on_release; + std::function callback_on_release; // Speculative decoding stats int32_t n_draft_total = 0; // Total draft tokens generated @@ -705,6 +705,11 @@ private: params_base.n_cache_reuse = 0; SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } + + if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) { + params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE; + SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled"); + } } if (!llama_memory_can_shift(llama_get_memory(ctx))) { @@ -754,16 +759,16 @@ private: SRV_ERR("%s\n", "speculative decoding is not supported with multimodal"); return false; } - SRV_WRN("%s", "speculative decoding context initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context initialized\n"); } else { - SRV_WRN("%s", "speculative decoding context not initialized\n"); + SLT_INF(slot, "%s", "speculative decoding context not initialized\n"); } } SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); - slot.callback_on_release = [this](int slot_id) { - queue_tasks.pop_deferred_task(slot_id); + slot.callback_on_release = [this](int id_slot) { + queue_tasks.pop_deferred_task(id_slot); }; slot.reset(); @@ -891,6 +896,9 @@ private: } server_slot * get_slot_by_id(int id_slot) { + // note: allow id_slot to be out of bounds (wrap around) + id_slot = id_slot % slots.size(); + for (server_slot & slot : slots) { if (slot.id == id_slot) { return &slot; @@ -1760,7 +1768,7 @@ private: break; } - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -1798,7 +1806,7 @@ private: case SERVER_TASK_TYPE_SLOT_RESTORE: { if (!check_no_mtmd(task.id)) break; - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -1847,7 +1855,7 @@ private: if (!check_no_mtmd(task.id)) { break; } - int id_slot = task.slot_action.slot_id; + const int id_slot = task.slot_action.id_slot; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -3312,7 +3320,7 @@ void server_routes::init_routes() { } // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); + auto * res_task = dynamic_cast(result.get()); GGML_ASSERT(res_task != nullptr); // optionally return "fail_on_no_slot" error @@ -3335,8 +3343,8 @@ void server_routes::init_routes() { } std::string id_slot_str = req.get_param("id_slot"); - int id_slot; + int id_slot; try { id_slot = std::stoi(id_slot_str); } catch (const std::exception &) { @@ -3348,14 +3356,16 @@ void server_routes::init_routes() { if (action == "save") { return handle_slots_save(req, id_slot); - } else if (action == "restore") { - return handle_slots_restore(req, id_slot); - } else if (action == "erase") { - return handle_slots_erase(req, id_slot); - } else { - res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); - return res; } + if (action == "restore") { + return handle_slots_restore(req, id_slot); + } + if (action == "erase") { + return handle_slots_erase(req, id_slot); + } + + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; }; this->get_props = [this](const server_http_req &) { @@ -3898,7 +3908,7 @@ std::unique_ptr server_routes::handle_slots_save(const ser { server_task task(SERVER_TASK_TYPE_SLOT_SAVE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; task.slot_action.filename = filename; task.slot_action.filepath = filepath; rd.post_task(std::move(task)); @@ -3934,7 +3944,7 @@ std::unique_ptr server_routes::handle_slots_restore(const { server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; task.slot_action.filename = filename; task.slot_action.filepath = filepath; rd.post_task(std::move(task)); @@ -3963,7 +3973,7 @@ std::unique_ptr server_routes::handle_slots_erase(const se { server_task task(SERVER_TASK_TYPE_SLOT_ERASE); task.id = rd.get_new_id(); - task.slot_action.slot_id = id_slot; + task.slot_action.id_slot = id_slot; rd.post_task(std::move(task)); } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 244470596b..a69e8f1a3d 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -153,7 +153,7 @@ struct server_task { // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE struct slot_action { - int slot_id; + int id_slot; std::string filename; std::string filepath; };