diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 551bdd3df0..f738edefc4 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -21,7 +21,8 @@ on:
       '**/*.m',
       '**/*.metal',
       '**/*.comp',
-      '**/*.glsl'
+      '**/*.glsl',
+      '**/*.wgsl'
     ]
 
   pull_request:
@@ -42,7 +43,8 @@ on:
       '**/*.m',
       '**/*.metal',
       '**/*.comp',
-      '**/*.glsl'
+      '**/*.glsl',
+      '**/*.wgsl'
     ]
 
 concurrency:
@@ -1371,7 +1373,7 @@ jobs:
         id: update_presets
         if: ${{ matrix.build == 'arm64-snapdragon' }}
         run: |
-          cp docs/backend/hexagon/CMakeUserPresets.json .
+          cp docs/backend/snapdragon/CMakeUserPresets.json .
 
       - name: Build
         id: ndk_build
diff --git a/README.md b/README.md
index 0783e43e5c..dac020ad37 100644
--- a/README.md
+++ b/README.md
@@ -213,6 +213,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT)
 - [LARS](https://github.com/abgulati/LARS) (AGPL)
 - [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
+- [LlamaLib](https://github.com/undreamai/LlamaLib) (Apache-2.0)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3bc7bc6210..295ae9ea25 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     ngram-map.cpp
     ngram-map.h
+    ngram-mod.cpp
+    ngram-mod.h
     peg-parser.cpp
     peg-parser.h
     preset.cpp
diff --git a/common/arg.cpp b/common/arg.cpp
index 218418f070..5fbc9022c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
+        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
         string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
             common_speculative_type_to_str(params.speculative.type).c_str()),
         [](common_params & params, const std::string & value) {
@@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
             } else if (value == "ngram-map-k4v") {
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
+            } else if (value == "ngram-mod") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
             } else {
                 throw std::invalid_argument("unknown speculative decoding type without draft model");
             }
diff --git a/common/common.h b/common/common.h
index fd3ab8cd18..398ebb0960 100644
--- a/common/common.h
+++ b/common/common.h
@@ -171,6 +171,7 @@ enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
+    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
     COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
     COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
 };
@@ -252,6 +253,8 @@ struct common_params_model {
     std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };
 
+struct common_ngram_mod;
+
 struct common_params_speculative {
     common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
 
@@ -269,6 +272,8 @@ struct common_params_speculative {
     uint16_t ngram_check_rate =  1; // check rate for ngram lookup
     uint16_t ngram_min_hits   =  1; // minimum hits at ngram/mgram lookup for mgram to be proposed
 
+    std::shared_ptr<common_ngram_mod> ngram_mod;
+
     std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
     std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
 
diff --git a/common/jinja/value.h b/common/jinja/value.h
index a2f92d2c69..1c04760a08 100644
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -12,6 +12,7 @@
 #include <set>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace jinja {
diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index 930e7a3c10..84fd761367 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -7,6 +7,21 @@
 #include <cstdio>
 #include <sstream>
 
+// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
+static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
+    std::ostringstream oss;
+    oss << '[';
+    for (size_t i = 0; i < length; ++i) {
+        if (i > 0) {
+            oss << ", ";
+        }
+        oss << inp[start + i];
+    }
+    oss << ']';
+    return oss.str();
+}
+
+
 // n-gram simple
 //
 
@@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of counted values of a ngram map value.
 #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
 
-static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
-
 void common_ngram_map_draft(common_ngram_map & map,
         const llama_tokens & inp, llama_token sampled,
         llama_tokens & draft) {
@@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
             n_accepted, curr_value.n_accepted);
     curr_value.n_accepted = n_accepted;
 }
-
-// Helper functions.
-//
-
-// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
-std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
-    std::ostringstream oss;
-    oss << '[';
-    for (size_t i = 0; i < length; ++i) {
-        if (i > 0) {
-            oss << ", ";
-        }
-        oss << inp[start + i];
-    }
-    oss << ']';
-    return oss.str();
-}
-
diff --git a/common/ngram-map.h b/common/ngram-map.h
index bf91883f0c..b365034ac5 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -11,6 +11,7 @@
 //
 
 #include "llama.h"
+#include "common.h"
 
 #include <vector>
 
diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp
new file mode 100644
index 0000000000..76f7257f61
--- /dev/null
+++ b/common/ngram-mod.cpp
@@ -0,0 +1,60 @@
+#include "ngram-mod.h"
+
+//
+// common_ngram_mod
+//
+
+common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
+    entries.resize(size);
+
+    reset();
+}
+
+size_t common_ngram_mod::idx(const entry_t * tokens) const {
+    size_t res = 0;
+
+    for (size_t i = 0; i < n; ++i) {
+        res = res*6364136223846793005ULL + tokens[i];
+    }
+
+    res = res % entries.size();
+
+    return res;
+}
+
+void common_ngram_mod::add(const entry_t * tokens) {
+    const size_t i = idx(tokens);
+
+    if (entries[i] == EMPTY) {
+        used++;
+    }
+
+    entries[i] = tokens[n];
+}
+
+common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
+    const size_t i = idx(tokens);
+
+    return entries[i];
+}
+
+void common_ngram_mod::reset() {
+    std::fill(entries.begin(), entries.end(), EMPTY);
+    used = 0;
+}
+
+size_t common_ngram_mod::get_n() const {
+    return n;
+}
+
+size_t common_ngram_mod::get_used() const {
+    return used;
+}
+
+size_t common_ngram_mod::size() const {
+    return entries.size();
+}
+
+size_t common_ngram_mod::size_bytes() const {
+    return entries.size() * sizeof(entries[0]);
+}
diff --git a/common/ngram-mod.h b/common/ngram-mod.h
new file mode 100644
index 0000000000..7af92e9dde
--- /dev/null
+++ b/common/ngram-mod.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+#include <cstddef>
+
+//
+// common_ngram_mod
+// ref: https://github.com/ggml-org/llama.cpp/pull/19164
+//
+
+// basic n-gram hasher
+struct common_ngram_mod {
+    using entry_t = int32_t;
+
+    static constexpr entry_t EMPTY = -1;
+
+    common_ngram_mod(uint16_t n, size_t size);
+
+    size_t  idx(const entry_t * tokens) const;
+    void    add(const entry_t * tokens);
+    entry_t get(const entry_t * tokens) const; // return -1 if not found
+
+    void reset();
+
+    size_t get_n()    const;
+    size_t get_used() const;
+
+    size_t size()       const;
+    size_t size_bytes() const;
+
+private:
+    size_t n; // ngram size to hash
+
+    size_t used;
+
+    std::vector<entry_t> entries;
+};
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 3f314b5d57..a1a3b51c13 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -6,6 +6,7 @@
 #include "log.h"
 #include "ngram-cache.h"
 #include "ngram-map.h"
+#include "ngram-mod.h"
 #include "sampling.h"
 
 #include <algorithm>
@@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
+    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
     COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
 };
 
@@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
     {"ngram_simple",  COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
     {"ngram_map_k",   COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
     {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
+    {"ngram_mod",     COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
     {"ngram_cache",   COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
 };
 
@@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
 struct common_speculative_state {
     const enum common_speculative_type type;
 
+    // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
+    // TODO: add n_call_begin, n_call_accept
     size_t drafts_call_count       = 0; // number of times this implementation was called.
     size_t drafts_generated_count  = 0; // number of times a draft or part was generated by this implementation.
     size_t drafts_accepted_count   = 0; // number of times a draft or part was accepted by the target model.
@@ -119,6 +124,8 @@ struct common_speculative_state {
     // TODO: track performance of most recent calls
     const bool gen_perf = true; // whether to generate performance stats.
 
+    // TODO: rename to t_draft_us
+    // TODO: add t_begin_us, t_accept_us
     int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
 
     common_speculative_state(enum common_speculative_type type) : type(type) {}
@@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
     }
 };
 
+struct common_speculative_state_ngram_mod : public common_speculative_state {
+    common_ngram_mod & mod;
+
+    // the last position in the prompt that was added to the ngram container
+    size_t i_last = 0;
+
+    // length of the last drafted n‑gram (number of tokens returned by draft)
+    size_t n_draft_last = 0;
+
+    // consecutive accept rounds with low acceptance fraction (< 0.5)
+    int n_low = 0;
+
+    // enable trace logging if LLAMA_TRACE is set
+    const bool verbose;
+
+    common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
+        : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
+        static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
+    }
+
+    void begin(const llama_tokens & prompt) override {
+        i_last = 0;
+
+        n_draft_last = 0;
+
+        const size_t n = mod.get_n();
+
+        if (prompt.size() < n) {
+            return;
+        }
+
+        for (size_t i = 0; i < prompt.size() - n; ++i) {
+            mod.add(prompt.data() + i);
+        }
+
+        i_last = prompt.size() - n;
+
+        const double f = (double)mod.get_used() / (double)mod.size();
+        LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
+
+        constexpr double f_thold = 0.25;
+        if (f > f_thold) {
+            LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
+
+            mod.reset();
+        }
+    }
+
+    void draft(
+            const common_params_speculative & params,
+            const llama_tokens & prompt_tgt,
+            llama_token id_last,
+            llama_tokens & result) override {
+        GGML_UNUSED(params);
+
+        n_draft_last = 0;
+
+        const size_t cur_len = prompt_tgt.size();
+        if (cur_len < mod.get_n()) {
+            return;
+        }
+
+        const size_t n = mod.get_n();
+
+        // add new ngrams in chunks
+        if (i_last + 32 < cur_len) {
+            for (size_t i = i_last; i < cur_len - n; ++i) {
+                mod.add(prompt_tgt.data() + i);
+            }
+
+            i_last = cur_len - n;
+        }
+
+        result.resize(n + params.n_max);
+        for (size_t i = 0; i < n - 1; ++i) {
+            result[i] = prompt_tgt[cur_len - n + 1 + i];
+        }
+        result[n - 1] = id_last;
+
+        for (int i = 0; i < params.n_max; ++i) {
+            const llama_token token = mod.get(result.data() + i);
+            if (token == common_ngram_mod::EMPTY) {
+                if (i < params.n_min) {
+                    result.clear();
+                    return;
+                }
+
+                result.resize(n + i);
+                break;
+            }
+            result[n + i] = token;
+        }
+
+        // only return the m tokens that were drafted
+        for (size_t i = 0; n + i < result.size(); ++i) {
+            result[i] = result[n + i];
+        }
+        result.resize(result.size() - n);
+
+        // store length of drafted n‑gram for later acceptance analysis
+        n_draft_last = result.size();
+    }
+
+    void accept(uint16_t n_accepted) override {
+        if (verbose) {
+            LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
+        }
+
+        // compute acceptance fraction if we have a recorded draft length
+        if (n_draft_last > 0) {
+            const double f_acc = (double)n_accepted / (double)n_draft_last;
+            if (f_acc < 0.5) {
+                n_low++;
+                if (n_low >= 3) {
+                    LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
+
+                    mod.reset();
+                    n_low = 0;
+                }
+            } else {
+                n_low = 0;
+            }
+        }
+    }
+};
+
 struct common_speculative_state_ngram_cache : public common_speculative_state {
     uint16_t n_draft;
     bool save_dynamic;
@@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
         case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram_simple";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram_map_k";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
+        case COMMON_SPECULATIVE_TYPE_NGRAM_MOD:     return "ngram_mod";
         case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE:   return "ngram_cache";
         default:                                    return "unknown";
     }
@@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // initialization of the speculative decoding system
 //
 common_speculative * common_speculative_init(
-        const common_params_speculative & params,
-              llama_context             * ctx_tgt) {
+        common_params_speculative & params,
+        llama_context             * ctx_tgt) {
     llama_context * ctx_dft = nullptr;
     if (params.model_dft) {
         ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
@@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
         bool has_ngram_simple  = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
         bool has_ngram_map_k   = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
         bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
+        bool has_ngram_mod     = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
 
         // In a more complex implementation we could use the same implementation but with different parameters.
         // This was initially used in PR-18471 but removed to simplify the code.
@@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
             // This implementation can guess tokens with high acceptance rate but is more expensive.
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
         }
+        if (has_ngram_mod) {
+            // shared instance for all speculative decoding contexts
+            if (!params.ngram_mod) {
+                params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
+
+                LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
+                        params.ngram_size_n, params.ngram_mod->size(),
+                        (float)(params.ngram_mod->size_bytes())/1024/1024);
+
+                if (params.ngram_size_n < 16) {
+                    LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
+                }
+            }
+
+            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
+        }
         if (has_ngram_cache) {
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
         }
@@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
                 ));
                 break;
             }
+            case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
+                GGML_ASSERT(config.params.ngram_mod);
+                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
+                break;
+            }
             case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
                 auto state = create_state_ngram_cache(
                         params.lookup_cache_static, params.lookup_cache_dynamic, config);
@@ -822,8 +978,7 @@ llama_tokens common_speculative_draft(
 
         if (!result.empty()) {
             LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
-                    common_speculative_type_to_str(impl.get()->type).c_str(),
-                    prompt_tgt.size(),
+                    common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
                     impl.get()->drafts_call_count, result.size());
 
             spec->curr_impl = impl.get(); // set current implementation for stats
@@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
             str_perf = "";
         }
 
+        // TODO: report time for begin() and accept()
         LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
                 common_speculative_type_to_str(impl->type).c_str(),
                 impl->drafts_call_count,
diff --git a/common/speculative.h b/common/speculative.h
index 9e1888e4be..76fe6bb7bc 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 std::string common_speculative_type_to_str(enum common_speculative_type type);
 
 common_speculative * common_speculative_init(
-        const common_params_speculative & params,
-              llama_context             * ctx_tgt);
+        common_params_speculative & params,
+        llama_context             * ctx_tgt);
 
 void common_speculative_free(common_speculative * spec);
 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a391717e32..eb43520f98 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -8806,6 +8806,7 @@ class GraniteMoeModel(GraniteModel):
             gate, up = data_torch.split(ffn_dim, dim=-2)
             yield from ModelBase.modify_tensors(self, gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid)
             yield from ModelBase.modify_tensors(self, up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid)
+            return
 
         has_experts = bool(self.hparams.get('num_local_experts'))
 
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index bcb3ce6743..c0a422b3dc 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -35,9 +35,9 @@ The following releases are verified and recommended:
 
 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
-|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |ArcB580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
-|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|24e86cae7219b0f3ede1d5abdf5bf3ad515cccb8|b5377 |[llama-b5377-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b5377/llama-b5377-bin-win-sycl-x64.zip) |Arc B580/Linux/oneAPI 2025.1<br>LNL Arc GPU/Windows 11/oneAPI 2025.1.1|2025-05-15|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
 
 
 ## News
@@ -51,7 +51,7 @@ The following releases are verified and recommended:
     |-|-|-|-|
     |PVC 1550|39|73|+87%|
     |Flex 170|39|50|+28%|
-    |Arc770|42|55|+30%|
+    |Arc A770|42|55|+30%|
     |MTL|13|16|+23%|
     |ARL-H|14|17|+21%|
 
@@ -62,7 +62,7 @@ The following releases are verified and recommended:
   - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
 
 - 2024.5
-  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
+  - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc A770.
   - Arch Linux is verified successfully.
 
 - 2024.4
@@ -111,7 +111,8 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 |-------------------------------|---------|---------------------------------------|
 | Intel Data Center Max Series  | Support | Max 1550, 1100                        |
 | Intel Data Center Flex Series | Support | Flex 170                              |
-| Intel Arc Series              | Support | Arc 770, 730M, Arc A750, B580         |
+| Intel Arc A-Series              | Support | Arc A770, Arc A730M, Arc A750         |
+| Intel Arc B-Series              | Support | Arc B580         |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake, Arrow Lake, Lunar Lake |
 | Intel iGPU                    | Support | iGPU in 13700k, 13400, i5-1250P, i7-1260P, i7-1165G7  |
 
diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json
similarity index 70%
rename from docs/backend/hexagon/CMakeUserPresets.json
rename to docs/backend/snapdragon/CMakeUserPresets.json
index 1f2676c0bc..4cf473d05f 100644
--- a/docs/backend/hexagon/CMakeUserPresets.json
+++ b/docs/backend/snapdragon/CMakeUserPresets.json
@@ -1,5 +1,10 @@
 {
-  "version": 4,
+  "version": 5,
+  "cmakeMinimumRequired": {
+      "major": 3,
+      "minor": 28,
+      "patch": 0
+  },
   "configurePresets": [
     {
         "name": "arm64-android-snapdragon",
@@ -16,7 +21,9 @@
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
             "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
-            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "CMAKE_PREFIX_PATH":  "$env{OPENCL_SDK_ROOT}",
+            "HEXAGON_SDK_ROOT":   "$env{HEXAGON_SDK_ROOT}",
+            "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
             "PREBUILT_LIB_DIR": "android_aarch64",
             "GGML_OPENMP":      "OFF",
             "GGML_LLAMAFILE":   "OFF",
@@ -31,7 +38,15 @@
         "name": "arm64-windows-snapdragon",
         "inherits": [ "base", "arm64-windows-llvm" ],
         "cacheVariables": {
-            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
+            "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
+            "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
+            "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+            "CMAKE_PREFIX_PATH":  "$env{OPENCL_SDK_ROOT}",
+            "HEXAGON_SDK_ROOT":   "$env{HEXAGON_SDK_ROOT}",
+            "HEXAGON_TOOLS_ROOT": "$env{HEXAGON_TOOLS_ROOT}",
             "PREBUILT_LIB_DIR": "windows_aarch64",
             "GGML_OPENMP":      "OFF",
             "GGML_LLAMAFILE":   "OFF",
diff --git a/docs/backend/hexagon/README.md b/docs/backend/snapdragon/README.md
similarity index 84%
rename from docs/backend/hexagon/README.md
rename to docs/backend/snapdragon/README.md
index 3befdf7225..8e1f37b206 100644
--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -1,6 +1,8 @@
-# Snapdragon-based Android devices
+# Snapdragon-based devices
 
-## How to Build
+## Setup
+
+### Android
 
 The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
 This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
@@ -12,7 +14,24 @@ This method works on Linux, macOS, and Windows. macOS and Windows users should i
 [d]/> cd /workspace
 ```
 
-The rest of the Android build process assumes that you're running inside the toolchain container.
+Note: The rest of the **Android** build process assumes that you're running inside the toolchain container.
+
+### Windows On Snapdragon
+
+Native Windows 11 arm64 builds has the following tools dependencies:
+- MS Visual Studio 2026 (Community Edition or Pro)
+  - MSVC arm64 standard and runtime libraries
+  - UCRT and Driver Kit
+- LLVM core libraries and Clang compiler (winget)
+- CMake, Git, Python (winget)
+- Hexagon SDK Community Edition 6.4 or later (see windows.md)
+- OpenCL SDK 2.3 or later (see windows.md)
+
+Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
+Adapt below build commands accordingly.
+
+## How to Build
+
 Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
 
 ```
@@ -49,24 +68,26 @@ Preset CMake variables:
 To generate an installable "package" simply use cmake --install:
 
 ```
-[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
+[d]/workspace> cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
 -- Install configuration: "Release"
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
--- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-cpu.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-opencl.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-hexagon.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v73.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v75.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v79.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml-htp-v81.so
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/lib/libggml.so
 ...
--- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
--- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-bench
+-- Installing: /workspace/pkg-snapdragon/llama.cpp/bin/llama-cli
 ...
 ```
 
 ## How to Install
 
+### Android
+
 For this step, your device needs to be configured for on-device development.
 Please see https://developer.android.com/studio/debug/dev-options for details.
 
@@ -74,10 +95,10 @@ Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
 **Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
 
 ```
-~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
-pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
-pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
-pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
+~/src/llama.cpp$ adb push pkg-snapdragon/llama.cpp /data/local/tmp/
+pkg-snapdragon/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
+pkg-snapdragon/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
+pkg-snapdragon/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
 102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
 ```
 
@@ -92,6 +113,11 @@ At this point, you should also install some models:
 Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
 ```
 
+### Windows
+
+All artifacts are already installed in the `pkg-snapdragon` folder.
+To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`.
+
 ## How to Run
 
 The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
diff --git a/docs/backend/hexagon/developer.md b/docs/backend/snapdragon/developer.md
similarity index 100%
rename from docs/backend/hexagon/developer.md
rename to docs/backend/snapdragon/developer.md
diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md
new file mode 100644
index 0000000000..710ad8fdf4
--- /dev/null
+++ b/docs/backend/snapdragon/windows.md
@@ -0,0 +1,161 @@
+## Overview
+
+The document covers procedures for installing the latest GPU and NPU drivers, and OpenCL and Hexagon SDKs.
+
+
+In order to use Hexagon NPU on Snapdragon Windows devices the underlying HTP Ops libraries (e.g libggml-htp-v73.so)
+must be included in the .cat file digitally signed with a trusted certificate.
+
+This document covers details on how to generate personal certificate files (.pfx) and how to configure the system
+to allow for test signatures (aka test-signing).
+
+## Install the latest Adreno OpenCL SDK
+
+Either use the trimmed down version (optimized for CI) from
+
+    https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz
+
+Or download the complete official version from
+
+    https://softwarecenter.qualcomm.com/catalog/item/Adreno_OpenCL_SDK?version=2.3.2
+
+Unzip/untar the archive into
+```
+c:\Qualcomm\OpenCL_SDK\2.3.2
+```
+
+## Install the latest Hexagon SDK Community Edition
+
+Either use the trimmed down version (optimized for CI) from
+
+    https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
+
+Or download the complete official version from
+
+    https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
+
+Unzip/untar the archive into
+```
+c:\Qualcomm\Hexagon_SDK\6.4.0.2
+```
+
+## Install the latest Adreno GPU driver
+
+Download the driver from
+
+    https://softwarecenter.qualcomm.com/catalog/item/Windows_Graphics_Driver
+
+After the automated installation and reboot please make sure that the GPU device shows up in the `Device Manager` (under 'Display Adapters`)
+
+## Install the latest Qualcomm NPU driver
+
+Download the driver from
+
+    https://softwarecenter.qualcomm.com/catalog/item/Qualcomm_HND
+
+After the automated installation and reboot please make sure that the Hexagon NPU device shows up in the `Device Manager` (under `Neural Processors`).
+
+If the device is not available you can try installing all components (`qcnspmcdm8380`, `qcnspmcdm8380_ext`) manually.
+The components are extracted into
+```
+c:\QCDrivers\qcnspmcdm...
+```
+
+## Enable NPU driver test signatures
+
+Please note that the following steps are required only for the Hexagon NPU.
+Adreno GPU backend does not require test signatures.
+
+### Enable testsigning
+
+Use `bcdedit` to enable test-signing
+```
+> bcdedit /set TESTSIGNING ON
+```
+(Secure Boot may need to be disabled for this to work)
+
+Make sure test-signing is enabled after reboot
+```
+> bcdedit /enum
+...
+testsigning             Yes
+...
+```
+For additional details see Microsoft guide at
+
+   https://learn.microsoft.com/en-us/windows-hardware/drivers/install/the-testsigning-boot-configuration-option
+
+### Create personal certificate
+
+The tools required for this procedure are available as part of Windows SDK and Windows Driver Kit which should be
+installed as part of the MS Visual Studio.
+They are typically located at
+```
+c:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0
+```
+(replace 10.0.26100.0 with correct version).
+
+To create personal self-signed certificate run the following commands (either from cmd or power-shell):
+```
+> cd c:\Users\MyUser
+> mkdir Certs
+> cd Certs
+> makecert -r -pe -ss PrivateCertStore -n CN=GGML.HTP.v1 -eku 1.3.6.1.5.5.7.3.3 -sv ggml-htp-v1.pvk ggml-htp-v1.cer
+> pvk2pfx.exe -pvk ggml-htp-v1.pvk -spc ggml-htp-v1.cer -pfx ggml-htp-v1.pfx
+```
+(replace `MyUser` with your username).
+
+Add this certificate to `Trusted Root Certification Authorities` and `Trusted Publishers` stores.
+This can be done using `certlm` Certificate Manager tool.
+Right click on the certificate store, select `All Tasks -> Import` and follow the prompts to import the certificate from the
+PFX file you created above.
+
+For additional details see Microsoft guide at
+
+    https://learn.microsoft.com/en-us/windows-hardware/drivers/install/introduction-to-test-signing
+
+Make sure to save the PFX file, you will need it for the build procedures.
+Please note that the same certificate can be used for signing any number of builds.
+
+## Build Hexagon backend with signed HTP ops libraries
+
+The overall Hexagon backend build procedure for Windows on Snapdragon is the same as for other platforms.
+However, additional settings are required for generating and signing HTP Ops libraries.
+```
+> $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
+> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
+> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
+> $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
+> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
+
+> cmake --preset arm64-windows-snapdragon -B build-wos
+...
+> cmake --install build-wos --prefix pkg-snapdragon
+```
+
+Once the build is complete HTP ops libraries will be installed like this
+```
+> dir pkg-snapdragon/lib
+...
+-a----         1/22/2026   6:01 PM         187656 libggml-htp-v73.so
+-a----         1/22/2026   6:01 PM         191752 libggml-htp-v75.so
+-a----         1/22/2026   6:01 PM         187656 libggml-htp-v79.so
+-a----         1/22/2026   6:01 PM         187656 libggml-htp-v81.so
+-a----         1/22/2026   6:01 PM           4139 libggml-htp.cat
+```
+
+The .cat file, the signature and proper certicate installation can be verified with
+
+```
+> signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
+Verifying: .\pkg-snapdragon\lib\libggml-htp.cat
+
+Signature Index: 0 (Primary Signature)
+Hash of file (sha256): 9820C664DA59D5EAE31DBB664127FCDAEF59CDC31502496BC567544EC2F401CF
+
+Signing Certificate Chain:
+        Issued to: GGML.HTP.v1
+...
+Successfully verified: .\pkg-snapdragon\lib\libggml-htp.cat
+...
+```
diff --git a/docs/ops.md b/docs/ops.md
index c066ab5a85..2c7c60dcca 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -97,7 +97,7 @@ Legend:
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
@@ -114,7 +114,7 @@ Legend:
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
-|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
 |                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv
index 091a5caed7..c1d22e65d4 100644
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
@@ -29,8 +29,8 @@
 "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
 "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -71,8 +71,8 @@
 "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
 "SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@@ -113,8 +113,8 @@
 "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
 "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL"
+"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -155,8 +155,8 @@
 "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
 "SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
-"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
+"SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
+"SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL"
 "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL"
 "SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL"
@@ -10052,10 +10052,10 @@
 "SYCL0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","0","no","SYCL"
 "SYCL0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","0","no","SYCL"
 "SYCL0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","SYCL"
-"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","SYCL"
-"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","SYCL"
-"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","SYCL"
-"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","SYCL"
+"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","SYCL"
+"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","SYCL"
+"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","SYCL"
+"SYCL0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","SYCL"
 "SYCL0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","0","no","SYCL"
 "SYCL0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","0","no","SYCL"
 "SYCL0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","0","no","SYCL"
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index f54cfdd77f..aa6efa62b3 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -50,6 +50,12 @@ int main(int argc, char ** argv) {
     const int N = 5;  // n-gram size
     const int G = 15; // max verification n-grams
 
+    // lookahead requires W + G + 1 sequences for parallel Jacobi decoding
+    params.n_parallel = W + G + 1;
+
+    // unified KV cache is required for coupled sequences in batch splitting
+    params.kv_unified = true;
+
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -115,7 +121,7 @@ int main(int argc, char ** argv) {
     // seq_id == 0           : the current input token
     // seq_id [1, W]         : tokens from the past N - 1 Jacobi iterations
     // seq_id [W + 1, W + G] : verification n-grams
-    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
+    llama_batch batch = llama_batch_init(llama_n_ctx(ctx), 0, W + G + 1);
 
     // target model sampling context
     struct common_sampler * smpl = common_sampler_init(model, params.sampling);
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 8e73138a5f..c7552ddde1 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -106,7 +106,7 @@ int main(int argc, char ** argv){
 
     std::vector<llama_token> draft;
 
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
+    llama_batch batch_tgt = llama_batch_init(llama_n_ctx(ctx), 0, 1);
 
     const auto t_dec_start = ggml_time_us();
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 260ad48f0e..265023733e 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -222,6 +222,7 @@ if (GGML_SCHED_NO_REALLOC)
 endif()
 
 add_library(ggml
+            ggml-backend-dl.cpp
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
 
diff --git a/ggml/src/ggml-backend-dl.cpp b/ggml/src/ggml-backend-dl.cpp
new file mode 100644
index 0000000000..a65cf00905
--- /dev/null
+++ b/ggml/src/ggml-backend-dl.cpp
@@ -0,0 +1,48 @@
+#include "ggml-backend-dl.h"
+
+#ifdef _WIN32
+
+dl_handle * dl_load_library(const fs::path & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+void * dl_get_sym(dl_handle * handle, const char * name) {
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+const char * dl_error() {
+    return "";
+}
+
+#else
+
+dl_handle * dl_load_library(const fs::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+    return handle;
+}
+
+void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+const char * dl_error() {
+    const char *rslt = dlerror();
+    return rslt != nullptr ? rslt : "";
+}
+
+#endif
diff --git a/ggml/src/ggml-backend-dl.h b/ggml/src/ggml-backend-dl.h
new file mode 100644
index 0000000000..f74b7c9489
--- /dev/null
+++ b/ggml/src/ggml-backend-dl.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#ifdef _WIN32
+#   define WIN32_LEAN_AND_MEAN
+#   ifndef NOMINMAX
+#       define NOMINMAX
+#   endif
+#   include <windows.h>
+#   include <winevt.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+#include <filesystem>
+
+namespace fs = std::filesystem;
+
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+#else
+
+using dl_handle = void;
+
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
+dl_handle * dl_load_library(const fs::path & path);
+void * dl_get_sym(dl_handle * handle, const char * name);
+const char * dl_error();
+
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index dd991f262e..8a693f84af 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -1,5 +1,6 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-backend-dl.h"
 #include "ggml-impl.h"
 #include <algorithm>
 #include <cstring>
@@ -98,72 +99,6 @@ static std::string path_str(const fs::path & path) {
     }
 }
 
-#ifdef _WIN32
-
-using dl_handle = std::remove_pointer_t<HMODULE>;
-
-struct dl_handle_deleter {
-    void operator()(HMODULE handle) {
-        FreeLibrary(handle);
-    }
-};
-
-static dl_handle * dl_load_library(const fs::path & path) {
-    // suppress error dialogs for missing DLLs
-    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
-    HMODULE handle = LoadLibraryW(path.wstring().c_str());
-
-    SetErrorMode(old_mode);
-
-    return handle;
-}
-
-static void * dl_get_sym(dl_handle * handle, const char * name) {
-    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
-    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
-
-    void * p = (void *) GetProcAddress(handle, name);
-
-    SetErrorMode(old_mode);
-
-    return p;
-}
-
-static const char * dl_error() {
-    return "";
-}
-
-#else
-
-using dl_handle = void;
-
-struct dl_handle_deleter {
-    void operator()(void * handle) {
-        dlclose(handle);
-    }
-};
-
-static void * dl_load_library(const fs::path & path) {
-    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
-
-    return handle;
-}
-
-static void * dl_get_sym(dl_handle * handle, const char * name) {
-    return dlsym(handle, name);
-}
-
-static const char * dl_error() {
-    const char *rslt = dlerror();
-    return rslt != nullptr ? rslt : "";
-}
-
-#endif
-
-using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
-
 struct ggml_backend_reg_entry {
     ggml_backend_reg_t reg;
     dl_handle_ptr handle;
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 43280644e4..a3256d59dd 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1124,6 +1124,7 @@ struct ggml_tensor_extra_gpu {
 struct ggml_cuda_graph_node_properties {
     void * node_data;
     ggml_op node_op;
+    enum ggml_type node_type;
     int32_t flags;
     int64_t ne[GGML_MAX_DIMS];
     size_t nb[GGML_MAX_DIMS];
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index bc98a0f665..796278a15e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2921,6 +2921,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties
     memset(props, 0, sizeof(ggml_cuda_graph_node_properties));
     props->node_data = node->data;
     props->node_op = node->op;
+    props->node_type = node->type;
     props->flags = node->flags;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         props->ne[i] = node->ne[i];
@@ -2945,6 +2946,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
         return false;
     }
 
+    if (node->type != props->node_type) {
+        return false;
+    }
+
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         if (node->ne[i] != props->ne[i]) {
             return false;
@@ -3906,14 +3911,14 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
         // Launch graph
         CUDA_CHECK(cudaGraphLaunch(graph->instance, cuda_ctx->stream()));
 #else
+        GGML_UNUSED(graph_key);
         graph_evaluated_or_captured = true;
 #endif  // USE_CUDA_GRAPH
     }
 }
 
-static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
-
 #ifdef USE_CUDA_GRAPH
+static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, const void * graph_key) {
     ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key);
 
     if (graph->graph == nullptr) {
@@ -3926,12 +3931,8 @@ static bool ggml_cuda_graph_set_enabled(ggml_backend_cuda_context * cuda_ctx, co
     }
 
     return graph->is_enabled();
-#else
-    GGML_UNUSED(cuda_ctx);
-    GGML_UNUSED(graph_key);
-    return false;
-#endif // USE_CUDA_GRAPH
 }
+#endif // USE_CUDA_GRAPH
 
 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt
index d58e287823..f3a583543c 100644
--- a/ggml/src/ggml-hexagon/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/CMakeLists.txt
@@ -1,7 +1,29 @@
+file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}"   HEXAGON_SDK_ROOT)
+file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
+
+if (NOT IS_DIRECTORY "${HEXAGON_SDK_ROOT}")
+    message(FATAL_ERROR "Make sure HEXAGON_SDK_ROOT point to the correct Hexagon SDK installation.")
+endif()
+
+if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
+    message("Try to read HEXAGON_TOOLS_ROOT from hexagon_sdk.json")
+    file(READ "${HEXAGON_SDK_ROOT}/hexagon_sdk.json" HEXAGON_SDK_CONFIG_PATH)
+    string(JSON HEXAGON_TOOLS_PATH GET ${HEXAGON_SDK_CONFIG_PATH} "root" "tools" "info" 0 "path")
+    message("Found HEXAGON_TOOLS_PATH: ${HEXAGON_TOOLS_PATH}")
+    set(HEXAGON_TOOLS_ROOT "${HEXAGON_SDK_ROOT}/${HEXAGON_TOOLS_PATH}")
+    file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
+    if (NOT IS_DIRECTORY "${HEXAGON_TOOLS_ROOT}")
+        message(FATAL_ERROR "Make sure HEXAGON_TOOLS_ROOT point to the correct Hexagon SDK installation.")
+    endif()
+endif()
+
+message(STATUS "hexagon: using ${HEXAGON_SDK_ROOT} and ${HEXAGON_TOOLS_ROOT} for building libggml-htp skels")
+
 include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
 include(ExternalProject)
 
 option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)
+set(GGML_HEXAGON_HTP_CERT  "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate")
 set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)")
 
 add_library(htp_iface OBJECT
@@ -25,56 +47,71 @@ else()
     target_link_options(htp_iface PUBLIC -ldl)
 endif()
 
-link_custom_library(htp_iface cdsprpc)
-link_custom_library(htp_iface rpcmem)
-
 set(TARGET_NAME ggml-hexagon)
 ggml_add_backend_library(${TARGET_NAME}
-    ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)
+    ggml-hexagon.cpp
+    htp-drv.cpp
+    htp-drv.h
+    libdl.h
+    ../../include/ggml-hexagon.h)
 
 target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
 target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})
 
-# Build HTP bits
-set(HTP_CMAKE_ARGS
-    -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
-    -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
-    -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
-    -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
-    -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE})
+# Build HTP skels
+set(HTP_SKELS)
+function(build_htp_skel V)
+    ExternalProject_Add(htp-${V}
+        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
+        BUILD_BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so
+        CMAKE_ARGS
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
+            -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
+            -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT}
+            -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT}
+            -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG}
+            -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}
+            -DDSP_VERSION=${V}
+            -DPREBUILT_LIB_DIR="toolv19_${V}")
+    list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so)
+    set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE)
+endfunction()
 
-ExternalProject_Add(htp-v68
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v68 -DPREBUILT_LIB_DIR="toolv19_v68")
-
-ExternalProject_Add(htp-v69
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v69 -DPREBUILT_LIB_DIR="toolv19_v69")
-
-ExternalProject_Add(htp-v73
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")
-
-ExternalProject_Add(htp-v75
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")
-
-ExternalProject_Add(htp-v79
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")
-
-ExternalProject_Add(htp-v81
-    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
-    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")
+build_htp_skel(v68)
+build_htp_skel(v69)
+build_htp_skel(v73)
+build_htp_skel(v75)
+build_htp_skel(v79)
+build_htp_skel(v81)
 
 # Install Hexagon skels required at runtime
-install(FILES
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v68.so
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v69.so
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
-    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
-    TYPE LIB)
+install(FILES ${HTP_SKELS} TYPE LIB)
+
+if (CMAKE_SYSTEM_NAME MATCHES Windows AND GGML_HEXAGON_HTP_CERT)
+    file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/arm64"      WINSDK_BIN0_ARM64)
+    file(TO_CMAKE_PATH "$ENV{WINDOWS_SDK_BIN}/x86"        WINSDK_BIN0_X86)
+    file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/arm64" WINSDK_BIN1_ARM64)
+    file(TO_CMAKE_PATH "$ENV{WindowsSdkVerBinPath}/x86"   WINSDK_BIN1_X86)
+
+    set(WINSDK_PATHS ${WINSDK_BIN0_ARM64} ${WINSDK_BIN0_X86} ${WINSDK_BIN1_ARM64} ${WINSDK_BIN1_X86})
+
+    find_program(INF2CAT  NAMES inf2cat.exe  PATHS ${WINSDK_PATHS} REQUIRED)
+    find_program(SIGNTOOL NAMES signtool.exe PATHS ${WINSDK_PATHS} REQUIRED)
+
+    message(STATUS "hexagon: using ${GGML_HEXAGON_HTP_CERT} to sign libggml-htp skels")
+
+    set(LIBGGML_HTP_CAT ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp.cat)
+    add_custom_target(libggml-htp-cat
+        BYPRODUCTS ${LIBGGML_HTP_CAT}
+        DEPENDS libggml-htp.inf ${HTP_SKELS}
+        COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/libggml-htp.inf ${CMAKE_CURRENT_BINARY_DIR}
+        COMMAND ${INF2CAT} /driver:${CMAKE_CURRENT_BINARY_DIR} /os:10_25H2_ARM64
+        COMMAND ${SIGNTOOL} sign /fd sha256 /f ${GGML_HEXAGON_HTP_CERT} ${LIBGGML_HTP_CAT}
+        COMMENT "generating and signing libggml-htp.cat file"
+        VERBATIM
+    )
+
+    add_dependencies(${TARGET_NAME} libggml-htp-cat)
+    install(FILES ${LIBGGML_HTP_CAT} TYPE LIB)
+endif()
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 5b835c11c7..4f0a1620fb 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -14,9 +14,6 @@
 
 #ifdef _WIN32
 #    include <sal.h>
-#    ifndef _WINDOWS
-#        define _WINDOWS
-#    endif
 #else
 #    include <semaphore.h>
 #    include <unistd.h>
@@ -25,8 +22,6 @@
 #pragma clang diagnostic ignored "-Wnested-anon-types"
 #pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
 
-#include "htp-utils.h"
-
 #include <AEEStdErr.h>
 #include <dspqueue.h>
 #include <rpcmem.h>
@@ -40,6 +35,7 @@
 #include "op-desc.h"
 #include "htp-msg.h"
 #include "htp_iface.h"
+#include "htp-drv.h"
 
 static size_t opt_ndev         = 1;
 static size_t opt_nhvx         = 0; // use all
@@ -150,9 +146,9 @@ void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_
                              0,                       // flags - the framework will autoset this
                              n_bufs,                  // number of buffers
                              bufs,                    // buffer references
-                             sizeof(req),
+                             sizeof(req),             // Message length
                              (const uint8_t *) &req,  // Message
-                             1000000                  // Timeout
+                             DSPQUEUE_TIMEOUT         // Timeout
     );
 
     if (err != 0) {
@@ -182,13 +178,13 @@ void ggml_hexagon_session::flush() {
 
         // Read response packet from queue
         int err = dspqueue_read(q, &flags,
-                                   HTP_MAX_PACKET_BUFFERS,  // Maximum number of buffer references
-                                   &n_bufs,                 // Number of buffer references
-                                   bufs,                    // Buffer references
-                                   sizeof(rsp),             // Max message length
-                                   &rsp_size,               // Message length
-                                   (uint8_t *) &rsp,
-                                   1000000);                // Timeout
+                                HTP_MAX_PACKET_BUFFERS,  // Maximum number of buffer references
+                                &n_bufs,                 // Number of buffer references
+                                bufs,                    // Buffer references
+                                sizeof(rsp),             // Max message length
+                                &rsp_size,               // Message length
+                                (uint8_t *) &rsp,        // Message
+                                DSPQUEUE_TIMEOUT);       // Timeout
 
         if (err == AEE_EEXPIRED) {
             // TODO: might need to bail out if the HTP is stuck on something
@@ -269,13 +265,7 @@ struct ggml_backend_hexagon_buffer_context {
     ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
         size += 4 * 1024;  // extra page for padding
 
-        if (rpcmem_alloc2) {
-            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
-        } else {
-            GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
-            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
-        }
-
+        this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
         if (!this->base) {
             GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
             throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
@@ -2461,12 +2451,12 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
 }
 
 static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
-    return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type) && ggml_is_quantized(op1->src[1]->type));
+    return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
 }
 
 static inline bool is_compute_op(ggml_tensor *node)
 {
-    return !(ggml_op_is_empty(node->op) || ggml_is_empty(node));
+    return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE);
 }
 
 // scan the graph and figure out last compute op index
@@ -2488,7 +2478,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
 
     const int last = last_compute_op(graph);
 
-    const struct ggml_tensor * prev_quant_op = nullptr;  // prev executed op with quantizer
+    const struct ggml_tensor * prev_op = nullptr;  // prev executed op
 
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * node = graph->nodes[i];
@@ -2497,17 +2487,15 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             continue;
         }
 
-        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
-            continue;
-        }
-
         uint32_t flags = 0;
 
         // skip quantizer if src1 is reused
-        if (op_reuse_src1(node, prev_quant_op)) {
+        if (op_reuse_src1(node, prev_op)) {
             flags |= HTP_OPFLAGS_SKIP_QUANTIZE;
         }
 
+        prev_op = node;
+
         // ask for early notification for the last Op
         if (i == last) {
             flags |= HTP_OPFLAGS_EARLY_WAKEUP;
@@ -2520,7 +2508,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
                 } else {
                     ggml_hexagon_dispatch_op<init_binary_req<false>>(sess, node, flags);
                 }
-                prev_quant_op = node;
                 break;
             case GGML_OP_MUL_MAT_ID:
                 if (ggml_is_quantized(node->src[0]->type)) {
@@ -2528,7 +2515,6 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
                 } else {
                     ggml_hexagon_dispatch_op<init_binary_id_req<false>>(sess, node, flags);
                 }
-                prev_quant_op = node;
                 break;
             case GGML_OP_MUL:
             case GGML_OP_ADD:
@@ -2670,7 +2656,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
         }
 
         // that many nodes forward to search for stackable nodes that can reuse VTCM
-        constexpr int N_FORWARD = 8;
+        constexpr int N_FORWARD = 16;
 
         for (int i1 = i0 + 1; i1 < i0 + N_FORWARD && i1 < n; i1++) {
             if (used[i1]) {
@@ -3056,10 +3042,12 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
         }
     }
 
+#if defined(__ANDROID__)
     if (opt_arch < 75) {
         opt_ndev = 1;
         GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
     }
+#endif
 
     GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
@@ -3156,6 +3144,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
         opt_arch = strtoul(str_arch, NULL, 0);
     }
 
+    opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : 1;
+
     reg->context = new ggml_hexagon_registry(reg);
 
     HEX_VERBOSE("ggml-hex: size-of-general-req %zu size-of-general-rsp %zu\n", sizeof(struct htp_general_req),
@@ -3180,6 +3170,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg(void) {
         static std::mutex           mutex;
         std::lock_guard<std::mutex> lock(mutex);
         if (!initialized) {
+            auto nErr = htpdrv_init();
+            if (nErr != AEE_SUCCESS) {
+                return NULL;
+            }
+
             ggml_hexagon_init(&reg);
         }
 
diff --git a/ggml/src/ggml-hexagon/htp-drv.cpp b/ggml/src/ggml-hexagon/htp-drv.cpp
new file mode 100644
index 0000000000..2530bb06d6
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp-drv.cpp
@@ -0,0 +1,418 @@
+// sample drv interface
+
+#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#pragma clang diagnostic ignored "-Wsign-compare"
+
+#include <filesystem>
+#include <set>
+#include <sstream>
+#include <string>
+#ifdef _WIN32
+#   define WIN32_LEAN_AND_MEAN
+#   ifndef NOMINMAX
+#       define NOMINMAX
+#   endif
+#   include <windows.h>
+#   include <winevt.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+#include "ggml-impl.h"
+#include "htp-drv.h"
+#include "libdl.h"
+
+#include <domain.h>
+
+//
+// Driver API types
+//
+
+typedef void * (*rpcmem_alloc_pfn_t)(int heapid, uint32_t flags, int size);
+typedef void * (*rpcmem_alloc2_pfn_t)(int heapid, uint32_t flags, size_t size);
+typedef void   (*rpcmem_free_pfn_t)(void * po);
+typedef int    (*rpcmem_to_fd_pfn_t)(void * po);
+
+typedef AEEResult (*dspqueue_create_pfn_t)(int                 domain,
+                                           uint32_t            flags,
+                                           uint32_t            req_queue_size,
+                                           uint32_t            resp_queue_size,
+                                           dspqueue_callback_t packet_callback,
+                                           dspqueue_callback_t error_callback,
+                                           void *              callback_context,
+                                           dspqueue_t *        queue);
+typedef AEEResult (*dspqueue_close_pfn_t)(dspqueue_t queue);
+typedef AEEResult (*dspqueue_export_pfn_t)(dspqueue_t queue, uint64_t *queue_id);
+typedef AEEResult (*dspqueue_write_pfn_t)(dspqueue_t queue, uint32_t flags,
+                                          uint32_t num_buffers,
+                                          struct dspqueue_buffer *buffers,
+                                          uint32_t message_length,
+                                          const uint8_t *message,
+                                          uint32_t timeout_us);
+typedef AEEResult (*dspqueue_read_pfn_t)(dspqueue_t queue, uint32_t *flags,
+                                         uint32_t max_buffers, uint32_t *num_buffers,
+                                         struct dspqueue_buffer *buffers,
+                                         uint32_t max_message_length,
+                                         uint32_t *message_length, uint8_t *message,
+                                         uint32_t timeout_us);
+
+typedef int (*fastrpc_mmap_pfn_t)(int domain, int fd, void *addr, int offset, size_t length, enum fastrpc_map_flags flags);
+typedef int (*fastrpc_munmap_pfn_t)(int domain, int fd, void *addr, size_t length);
+
+typedef int (*remote_handle64_open_pfn_t)(const char* name, remote_handle64 *ph);
+typedef int (*remote_handle64_invoke_pfn_t)(remote_handle64 h, uint32_t dwScalars, remote_arg *pra);
+typedef int (*remote_handle64_close_pfn_t)(remote_handle h);
+typedef int (*remote_handle_control_pfn_t)(uint32_t req, void* data, uint32_t datalen);
+typedef int (*remote_handle64_control_pfn_t)(remote_handle64 h, uint32_t req, void* data, uint32_t datalen);
+typedef int (*remote_session_control_pfn_t)(uint32_t req, void *data, uint32_t datalen);
+
+//
+// Driver API pfns
+//
+
+rpcmem_alloc_pfn_t  rpcmem_alloc_pfn  = nullptr;
+rpcmem_alloc2_pfn_t rpcmem_alloc2_pfn = nullptr;
+rpcmem_free_pfn_t   rpcmem_free_pfn   = nullptr;
+rpcmem_to_fd_pfn_t  rpcmem_to_fd_pfn  = nullptr;
+
+fastrpc_mmap_pfn_t   fastrpc_mmap_pfn   = nullptr;
+fastrpc_munmap_pfn_t fastrpc_munmap_pfn = nullptr;
+
+dspqueue_create_pfn_t dspqueue_create_pfn = nullptr;
+dspqueue_close_pfn_t  dspqueue_close_pfn  = nullptr;
+dspqueue_export_pfn_t dspqueue_export_pfn = nullptr;
+dspqueue_write_pfn_t  dspqueue_write_pfn  = nullptr;
+dspqueue_read_pfn_t   dspqueue_read_pfn   = nullptr;
+
+remote_handle64_open_pfn_t    remote_handle64_open_pfn    = nullptr;
+remote_handle64_invoke_pfn_t  remote_handle64_invoke_pfn  = nullptr;
+remote_handle64_close_pfn_t   remote_handle64_close_pfn   = nullptr;
+remote_handle_control_pfn_t   remote_handle_control_pfn   = nullptr;
+remote_handle64_control_pfn_t remote_handle64_control_pfn = nullptr;
+remote_session_control_pfn_t  remote_session_control_pfn  = nullptr;
+
+//
+// Driver API
+//
+
+void * rpcmem_alloc(int heapid, uint32_t flags, int size) {
+    return rpcmem_alloc_pfn(heapid, flags, size);
+}
+
+void * rpcmem_alloc2(int heapid, uint32_t flags, size_t size) {
+    if (rpcmem_alloc2_pfn) {
+        return rpcmem_alloc2_pfn(heapid, flags, size);
+    } else {
+        GGML_LOG_INFO("ggml-hex: rpcmem_alloc2 not found, falling back to rpcmem_alloc\n");
+        return rpcmem_alloc_pfn(heapid, flags, size);
+    }
+}
+
+void rpcmem_free(void * po) {
+    return rpcmem_free_pfn(po);
+}
+
+int rpcmem_to_fd(void * po) {
+    return rpcmem_to_fd_pfn(po);
+}
+
+HTPDRV_API int fastrpc_mmap(int domain, int fd, void * addr, int offset, size_t length, enum fastrpc_map_flags flags) {
+    return fastrpc_mmap_pfn(domain, fd, addr, offset, length, flags);
+}
+
+HTPDRV_API int fastrpc_munmap(int domain, int fd, void * addr, size_t length) {
+    return fastrpc_munmap_pfn(domain, fd, addr, length);
+}
+
+AEEResult dspqueue_create(int                 domain,
+                          uint32_t            flags,
+                          uint32_t            req_queue_size,
+                          uint32_t            resp_queue_size,
+                          dspqueue_callback_t packet_callback,
+                          dspqueue_callback_t error_callback,
+                          void *              callback_context,
+                          dspqueue_t *        queue) {
+    return dspqueue_create_pfn(domain, flags, req_queue_size, resp_queue_size, packet_callback, error_callback,
+                               callback_context, queue);
+}
+
+AEEResult dspqueue_close(dspqueue_t queue) {
+    return dspqueue_close_pfn(queue);
+}
+
+AEEResult dspqueue_export(dspqueue_t queue, uint64_t * queue_id) {
+    return dspqueue_export_pfn(queue, queue_id);
+}
+
+AEEResult dspqueue_write(dspqueue_t               queue,
+                         uint32_t                 flags,
+                         uint32_t                 num_buffers,
+                         struct dspqueue_buffer * buffers,
+                         uint32_t                 message_length,
+                         const uint8_t *          message,
+                         uint32_t                 timeout_us) {
+    return dspqueue_write_pfn(queue, flags, num_buffers, buffers, message_length, message, timeout_us);
+}
+
+AEEResult dspqueue_read(dspqueue_t               queue,
+                        uint32_t *               flags,
+                        uint32_t                 max_buffers,
+                        uint32_t *               num_buffers,
+                        struct dspqueue_buffer * buffers,
+                        uint32_t                 max_message_length,
+                        uint32_t *               message_length,
+                        uint8_t *                message,
+                        uint32_t                 timeout_us) {
+    return dspqueue_read_pfn(queue, flags, max_buffers, num_buffers, buffers, max_message_length, message_length,
+                             message, timeout_us);
+}
+
+HTPDRV_API int remote_handle64_open(const char * name, remote_handle64 * ph) {
+    return remote_handle64_open_pfn(name, ph);
+}
+
+HTPDRV_API int remote_handle64_invoke(remote_handle64 h, uint32_t dwScalars, remote_arg * pra) {
+    return remote_handle64_invoke_pfn(h, dwScalars, pra);
+}
+
+HTPDRV_API int remote_handle64_close(remote_handle64 h) {
+    return remote_handle64_close_pfn(h);
+}
+
+HTPDRV_API int remote_handle_control(uint32_t req, void * data, uint32_t datalen) {
+    return remote_handle_control_pfn(req, data, datalen);
+}
+
+HTPDRV_API int remote_handle64_control(remote_handle64 h, uint32_t req, void * data, uint32_t datalen) {
+    return remote_handle64_control_pfn(h, req, data, datalen);
+}
+
+HTPDRV_API int remote_session_control(uint32_t req, void * data, uint32_t datalen) {
+    return remote_session_control_pfn(req, data, datalen);
+}
+
+#ifdef _WIN32
+
+static std::string wstr_to_str(std::wstring_view wstr) {
+    std::string result;
+    if (wstr.empty()) {
+        return result;
+    }
+    auto bytes_needed = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
+                                            wstr.data(), (int) wstr.size(),
+                                            nullptr, 0, nullptr, nullptr);
+    if (bytes_needed == 0) {
+        GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
+        throw std::runtime_error("Invalid wstring input");
+    }
+
+    result.resize(bytes_needed, '\0');
+    int bytes_written = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS,
+                                            wstr.data(), (int) wstr.size(),
+                                            result.data(), bytes_needed,
+                                            nullptr, nullptr);
+    if (bytes_written == 0) {
+        GGML_LOG_ERROR("ggml-hex: WideCharToMultiByte failed. Error %lu\n", GetLastError());
+        throw std::runtime_error("Wstring conversion failed");
+    }
+    return result;
+}
+
+static std::string get_driver_path() {
+    std::wstring serviceName = L"qcnspmcdm";
+    std::string result;
+
+    // Get a handle to the SCM database.
+    SC_HANDLE schSCManager = OpenSCManagerW(NULL, NULL, STANDARD_RIGHTS_READ);
+    if (nullptr == schSCManager) {
+        GGML_LOG_ERROR("ggml-hex: Failed to open SCManager. Error: %lu\n", GetLastError());
+        return result;
+    }
+
+    // Get a handle to the service.
+    SC_HANDLE schService = OpenServiceW(schSCManager,           // SCM database
+                                        serviceName.c_str(),    // name of service
+                                        SERVICE_QUERY_CONFIG);  // need query config access
+
+    if (nullptr == schService) {
+        GGML_LOG_ERROR("ggml-hex: Failed to open qcnspmcdm service. Error: %lu\n", GetLastError());
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+
+    // Store the size of buffer used as an output.
+    DWORD bufferSize;
+    if (!QueryServiceConfigW(schService, NULL, 0, &bufferSize) &&
+        (GetLastError() != ERROR_INSUFFICIENT_BUFFER)) {
+        GGML_LOG_ERROR("ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
+        CloseServiceHandle(schService);
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+    // Get the configuration of the service.
+    LPQUERY_SERVICE_CONFIGW serviceConfig =
+        static_cast<LPQUERY_SERVICE_CONFIGW>(LocalAlloc(LMEM_FIXED, bufferSize));
+    if (!QueryServiceConfigW(schService, serviceConfig, bufferSize, &bufferSize)) {
+        fprintf(stderr, "ggml-hex: Failed to query service config. Error: %lu\n", GetLastError());
+        LocalFree(serviceConfig);
+        CloseServiceHandle(schService);
+        CloseServiceHandle(schSCManager);
+        return result;
+    }
+
+    // Read the driver file path get its parent directory
+    std::wstring driverPath = std::wstring(serviceConfig->lpBinaryPathName);
+    driverPath = driverPath.substr(0, driverPath.find_last_of(L"\\"));
+
+    // Clean up resources
+    LocalFree(serviceConfig);
+    CloseServiceHandle(schService);
+    CloseServiceHandle(schSCManager);
+
+    // Driver path would contain invalid path string, like:
+    // \SystemRoot\System32\DriverStore\FileRepository\qcadsprpc8280.inf_arm64_c2b9460c9a072f37
+    // "\SystemRoot" should be replace with a correct one (e.g. C:\Windows)
+    const std::wstring systemRootPlaceholder = L"\\SystemRoot";
+    if (0 != driverPath.compare(0, systemRootPlaceholder.length(), systemRootPlaceholder)) {
+        GGML_LOG_ERROR("ggml-hex: String pattern not found in driver path.\n");
+        return result;
+    }
+
+    // Replace \SystemRoot with an absolute path from system ENV windir
+    const std::wstring systemRootEnv = L"windir";
+
+    // Query the number of wide charactors this variable requires
+    DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
+    if (numWords == 0) {
+        GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
+        return result;
+    }
+
+    // Query the actual system root name from environment variable
+    std::vector<wchar_t> systemRoot(numWords + 1);
+    numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), systemRoot.data(), numWords + 1);
+    if (numWords == 0) {
+        GGML_LOG_ERROR("ggml-hex: Failed to read windir environment variable\n");
+        return result;
+    }
+    driverPath.replace(0, systemRootPlaceholder.length(), std::wstring(systemRoot.data()));
+
+    return wstr_to_str(driverPath);
+}
+
+#endif
+
+using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
+
+int htpdrv_init() {
+    static dl_handle_ptr lib_cdsp_rpc_handle = nullptr;
+    static bool initialized = false;
+#ifdef _WIN32
+    std::string drv_path = get_driver_path() + "\\" + "libcdsprpc.dll";
+#else
+    std::string drv_path = "libcdsprpc.so";
+#endif
+    if (initialized) {
+        GGML_LOG_INFO("ggml-hex: Driver already loaded\n");
+        return AEE_SUCCESS;
+    }
+    GGML_LOG_INFO("ggml-hex: Loading driver %s\n", drv_path.c_str());
+
+    fs::path path{ drv_path.c_str() };
+    dl_handle_ptr handle { dl_load_library(path) };
+    if (!handle) {
+        GGML_LOG_ERROR("ggml-hex: failed to load %s: %s\n", path.u8string().c_str(), dl_error());
+        return AEE_EUNABLETOLOAD;
+    }
+
+#define dlsym(drv, type, pfn, symbol, ignore)                               \
+    do {                                                                    \
+        pfn = (type) dl_get_sym(drv, #symbol);                              \
+        if (!ignore && nullptr == pfn) {                                    \
+            GGML_LOG_ERROR("ggml-hex: failed to dlsym %s\n", #symbol);      \
+            return AEE_EUNABLETOLOAD;                                       \
+        }                                                                   \
+    } while (0)
+
+    dlsym(handle.get(), rpcmem_alloc_pfn_t, rpcmem_alloc_pfn, rpcmem_alloc, false);
+    dlsym(handle.get(), rpcmem_alloc2_pfn_t, rpcmem_alloc2_pfn, rpcmem_alloc2, true);
+    dlsym(handle.get(), rpcmem_free_pfn_t, rpcmem_free_pfn, rpcmem_free, false);
+    dlsym(handle.get(), rpcmem_to_fd_pfn_t, rpcmem_to_fd_pfn, rpcmem_to_fd, false);
+    dlsym(handle.get(), fastrpc_mmap_pfn_t, fastrpc_mmap_pfn, fastrpc_mmap, false);
+    dlsym(handle.get(), fastrpc_munmap_pfn_t, fastrpc_munmap_pfn, fastrpc_munmap, false);
+    dlsym(handle.get(), dspqueue_create_pfn_t, dspqueue_create_pfn, dspqueue_create, false);
+    dlsym(handle.get(), dspqueue_close_pfn_t, dspqueue_close_pfn, dspqueue_close, false);
+    dlsym(handle.get(), dspqueue_export_pfn_t, dspqueue_export_pfn, dspqueue_export, false);
+    dlsym(handle.get(), dspqueue_write_pfn_t, dspqueue_write_pfn, dspqueue_write, false);
+    dlsym(handle.get(), dspqueue_read_pfn_t, dspqueue_read_pfn, dspqueue_read, false);
+    dlsym(handle.get(), remote_handle64_open_pfn_t, remote_handle64_open_pfn, remote_handle64_open, false);
+    dlsym(handle.get(), remote_handle64_invoke_pfn_t, remote_handle64_invoke_pfn, remote_handle64_invoke, false);
+    dlsym(handle.get(), remote_handle_control_pfn_t, remote_handle_control_pfn, remote_handle_control, false);
+    dlsym(handle.get(), remote_handle64_control_pfn_t, remote_handle64_control_pfn, remote_handle64_control, false);
+    dlsym(handle.get(), remote_session_control_pfn_t, remote_session_control_pfn, remote_session_control, false);
+    dlsym(handle.get(), remote_handle64_close_pfn_t, remote_handle64_close_pfn, remote_handle64_close, false);
+
+    lib_cdsp_rpc_handle = std::move(handle);
+    initialized         = true;
+
+    return AEE_SUCCESS;
+}
+
+domain * get_domain(int domain_id) {
+    int i    = 0;
+    int size = sizeof(supported_domains) / sizeof(domain);
+
+    for (i = 0; i < size; i++) {
+        if (supported_domains[i].id == domain_id) {
+            return &supported_domains[i];
+        }
+    }
+
+    return NULL;
+}
+
+int get_hex_arch_ver(int domain, int * arch) {
+    if (!remote_handle_control_pfn) {
+        GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
+        return AEE_EUNSUPPORTEDAPI;
+    }
+
+    struct remote_dsp_capability arch_ver;
+    arch_ver.domain       = (uint32_t) domain;
+    arch_ver.attribute_ID = ARCH_VER;
+    arch_ver.capability   = (uint32_t) 0;
+
+    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
+    if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
+        GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
+        return AEE_EUNSUPPORTEDAPI;
+    }
+
+    if (err != AEE_SUCCESS) {
+        GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
+        return err;
+    }
+
+    switch (arch_ver.capability & 0xff) {
+        case 0x68:
+            *arch = 68;
+            return 0;
+        case 0x69:
+            *arch = 69;
+            return 0;
+        case 0x73:
+            *arch = 73;
+            return 0;
+        case 0x75:
+            *arch = 75;
+            return 0;
+        case 0x79:
+            *arch = 79;
+            return 0;
+        case 0x81:
+            *arch = 81;
+            return 0;
+    }
+    return -1;
+}
diff --git a/ggml/src/ggml-hexagon/htp-drv.h b/ggml/src/ggml-hexagon/htp-drv.h
new file mode 100644
index 0000000000..6eba7ba17d
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp-drv.h
@@ -0,0 +1,121 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+#    pragma clang diagnostic ignored "-Wignored-attributes"
+#endif
+
+#include <AEEStdErr.h>
+#include <rpcmem.h>
+#include <remote.h>
+#include <dspqueue.h>
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+#    ifdef GGML_BACKEND_BUILD
+#        define HTPDRV_API __declspec(dllexport) extern
+#    else
+#        define HTPDRV_API __declspec(dllimport) extern
+#    endif
+#else
+#    define HTPDRV_API __attribute__ ((visibility ("default"))) extern
+#endif
+
+/* Offset to differentiate HLOS and Hexagon error codes.
+   Stores the value of AEE_EOFFSET for Hexagon. */
+#ifndef DSP_OFFSET
+#    define DSP_OFFSET 0x80000400
+#endif
+
+/* Errno for connection reset by peer. */
+#ifndef ECONNRESET
+#    ifdef __hexagon__
+#        define ECONNRESET 104
+#    endif
+#endif
+
+/* Abstraction of different OS specific sleep APIs.
+   SLEEP accepts input in seconds. */
+#ifndef SLEEP
+#    ifdef __hexagon__
+#        define SLEEP(x)                      \
+            { /* Do nothing for simulator. */ \
+            }
+#    else
+#        ifdef _WIN32
+#            define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
+#        else
+#            define SLEEP(x) sleep(x)        /* sleep accepts input in seconds. */
+#        endif
+#    endif
+#endif
+
+/* Include windows specific header files. */
+#ifdef _WIN32
+#    include <windows.h>
+#    include <sysinfoapi.h>
+#    define _CRT_SECURE_NO_WARNINGS         1
+#    define _WINSOCK_DEPRECATED_NO_WARNINGS 1
+#endif
+
+/* Includes and defines for all HLOS except windows */
+#if !defined(__hexagon__) && !defined(_WIN32)
+#    include "unistd.h"
+
+#    include <sys/time.h>
+#endif
+
+/* Includes and defines for Hexagon and all HLOS except Windows. */
+#if !defined(_WIN32)
+/* Weak reference to remote symbol for compilation. */
+#    pragma weak remote_session_control
+#    pragma weak remote_handle_control
+#    pragma weak remote_handle64_control
+#    pragma weak fastrpc_mmap
+#    pragma weak fastrpc_munmap
+#    pragma weak rpcmem_alloc2
+#endif
+
+#if !defined(_WIN32)
+#    pragma weak remote_system_request
+#endif
+
+#ifdef _WIN32
+#     define DSPQUEUE_TIMEOUT DSPQUEUE_TIMEOUT_NONE
+#else
+#     define DSPQUEUE_TIMEOUT 1000000
+#endif
+
+/**
+ * htpdrv_init API: driver interface entry point
+ *
+ * @return      Return AEE error codes as defined in Hexagon SDK.
+ */
+HTPDRV_API int htpdrv_init(void);
+
+/**
+ * get_domain API: get domain struct from domain value.
+ *
+ * @param[in]  domain value of a domain
+ * @return     Returns domain struct of the domain if it is supported or else
+ *             returns NULL.
+ *
+ */
+HTPDRV_API domain * get_domain(int domain_id);
+
+/**
+ * get_hex_arch_ver API: query the Hexagon processor architecture version information
+ *
+ * @param[in]   domain_id value of a domain
+ * @param[out]  Arch version (73, 75, ...)
+ * @return      0 if query is successful.
+ *              non-zero if error, return value points to the error.
+ *
+ */
+HTPDRV_API int get_hex_arch_ver(int domain, int * arch);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-hexagon/htp-utils.c b/ggml/src/ggml-hexagon/htp-utils.c
deleted file mode 100644
index 3f335bf71c..0000000000
--- a/ggml/src/ggml-hexagon/htp-utils.c
+++ /dev/null
@@ -1,454 +0,0 @@
-
-#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
-#pragma clang diagnostic ignored "-Wmissing-prototypes"
-#pragma clang diagnostic ignored "-Wsign-compare"
-
-#define GGML_COMMON_IMPL_C
-#include "ggml-backend-impl.h"
-#include "ggml-common.h"
-#include "ggml-hexagon.h"
-#include "ggml-impl.h"
-
-#include "htp-utils.h"
-
-#include <domain.h>
-#include <remote.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-domain * get_domain(int domain_id) {
-    int i    = 0;
-    int size = sizeof(supported_domains) / sizeof(domain);
-
-    for (i = 0; i < size; i++) {
-        if (supported_domains[i].id == domain_id) {
-            return &supported_domains[i];
-        }
-    }
-
-    return NULL;
-}
-
-bool is_valid_domain_id(int domain_id, int compute_only) {
-    int i    = 0;
-    int size = sizeof(supported_domains) / sizeof(domain);
-
-    if (compute_only) {
-        return is_CDSP(domain_id);
-    }
-
-    for (i = 0; i < size; i++) {
-        if (supported_domains[i].id == domain_id) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
-    int nErr    = AEE_SUCCESS;
-    int ss_info = 0;
-    if (domain_type != NULL) {
-        if (strcmp(domain_type, "LPASS") == 0) {
-            ss_info = FASTRPC_LPASS;
-        } else if (strcmp(domain_type, "HPASS") == 0) {
-            ss_info = FASTRPC_HPASS;
-        } else {
-            ss_info = FASTRPC_NSP;
-        }
-    }
-    system_req_payload req  = { 0 };
-    req.id                  = FASTRPC_GET_DOMAINS;
-    req.sys.domains         = NULL;
-    fastrpc_domain * domain = NULL;
-    if (ss_info != 0) {
-        req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
-    } else {
-        req.sys.flags = 0;
-    }
-#ifdef _WIN32
-    nErr = AEE_EUNSUPPORTED;
-    goto bail;
-#endif
-    if (remote_system_request) {
-        nErr = remote_system_request(&req);
-        if (nErr != AEE_SUCCESS) {
-            GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
-            goto bail;
-        }
-        // Allocate memory for domain-info array
-        req.sys.max_domains = req.sys.num_domains;
-        if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
-            nErr = AEE_ENOMEMORY;
-            GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
-            goto bail;
-        }
-
-        nErr = remote_system_request(&req);
-        if (nErr != AEE_SUCCESS) {
-            GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
-            goto bail;
-        }
-
-        for (int i = 0; i < req.sys.num_domains; i++) {
-            // Verify that only requested type domains were returned
-            domain = &req.sys.domains[i];
-            if (domain->type != ss_info && domain_type != NULL) {
-                nErr = -1;
-                GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
-                goto bail;
-            }
-        }
-        *domains_info = req.sys.domains;
-        *num_domains  = req.sys.num_domains;
-    } else {
-        nErr = AEE_EUNSUPPORTED;
-        goto bail;
-    }
-bail:
-    if (nErr && !req.sys.domains) {
-        free(req.sys.domains);
-    }
-    return nErr;
-}
-
-int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
-    int                              err  = 0;
-    remote_rpc_effective_domain_id_t sess = { 0 };
-
-    sess.domain_name     = domain_name;
-    sess.domain_name_len = strlen(domain_name);
-    sess.session_id      = session_id;
-
-    err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
-    if (err) {
-        GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
-               session_id);
-        return err;
-    }
-
-    *effec_domain_id = sess.effective_domain_id;
-    return err;
-}
-
-int get_dsp_support(int * domain) {
-    int nErr = AEE_SUCCESS;
-    *domain  = CDSP_DOMAIN_ID;  // DSP domain default value is CDSP_DOMAIN_ID
-
-    if (remote_handle_control) {
-        struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
-        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
-        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-            goto bail;
-        }
-
-        if (dsp_capability_domain.capability == 0) {
-            dsp_capability_domain.domain       = ADSP_DOMAIN_ID;  // Check for ADSP support.
-            dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
-            dsp_capability_domain.capability   = 0;
-            nErr                               = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
-                                                                       sizeof(struct remote_dsp_capability));
-            if (dsp_capability_domain.capability) {
-                *domain = ADSP_DOMAIN_ID;  // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
-            }
-        }
-
-        if (nErr != AEE_SUCCESS) {
-            GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return nErr;
-}
-
-int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
-    int nErr    = AEE_SUCCESS;
-    *capability = 0;
-
-    if (attr == VTCM_PAGE || attr == VTCM_COUNT) {
-    } else {
-        nErr = AEE_EBADPARM;
-        GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
-        goto bail;
-    }
-    if (remote_handle_control) {
-        if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
-            /*
-            * Query the DSP for VTCM information
-            * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
-            */
-            struct remote_dsp_capability dsp_capability_vtcm_dsp;
-            dsp_capability_vtcm_dsp.domain       = (uint32_t) domain;
-            dsp_capability_vtcm_dsp.attribute_ID = attr;
-            dsp_capability_vtcm_dsp.capability   = (uint32_t) 0;
-            nErr                                 = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
-                                                                         sizeof(struct remote_dsp_capability));
-            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
-                nErr = AEE_SUCCESS;
-                goto bail;
-            } else if (nErr == AEE_SUCCESS) {
-                *capability = dsp_capability_vtcm_dsp.capability;
-            } else {
-                GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
-                goto bail;
-            }
-        } else {
-            nErr = AEE_EUNSUPPORTED;
-            GGML_LOG_ERROR("Unsupported domain %d\n", domain);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return nErr;
-}
-
-bool is_unsignedpd_supported(int domain_id) {
-    int nErr = AEE_SUCCESS;
-    if (remote_handle_control) {
-        struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
-        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
-        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
-            return false;
-        }
-        if (nErr) {
-            GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
-            return false;
-        }
-        if (dsp_capability_domain.capability == 1) {
-            return true;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
-        return false;
-    }
-    return false;
-}
-
-bool get_unsignedpd_support(void) {
-    return is_unsignedpd_supported(CDSP_DOMAIN_ID);
-}
-
-bool is_async_fastrpc_supported(int domain) {
-    int nErr = AEE_SUCCESS;
-    if (remote_handle_control) {
-        if (domain == CDSP_DOMAIN_ID) {
-            /*
-            * Query the DSP for ASYNC_FASTRPC_SUPPORT information
-            * Async fastrpc is supported only on CDSP
-            */
-            struct remote_dsp_capability dsp_capability_async_support;
-            dsp_capability_async_support.domain       = (uint32_t) domain;
-            dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
-            dsp_capability_async_support.capability   = (uint32_t) 0;
-            nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
-                                         sizeof(struct remote_dsp_capability));
-            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
-                nErr = AEE_SUCCESS;
-                goto bail;
-            } else if (dsp_capability_async_support.capability == 1) {
-                return true;
-            }
-            if (nErr != AEE_SUCCESS) {
-                GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
-                goto bail;
-            }
-        } else {
-            nErr = AEE_EUNSUPPORTED;
-            GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return false;
-}
-
-bool is_status_notification_supported(int domain) {
-    int nErr = AEE_SUCCESS;
-
-    if (remote_handle_control) {
-        /*
-        * Query the DSP for STATUS_NOTIFICATION_SUPPORT information
-        * DSP User PD status notification Support
-        */
-        struct remote_dsp_capability dsp_capability_status_notification_support;
-        dsp_capability_status_notification_support.domain       = (uint32_t) domain;
-        dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
-        dsp_capability_status_notification_support.capability   = (uint32_t) 0;
-        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
-                                     sizeof(struct remote_dsp_capability));
-        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-            GGML_LOG_ERROR("Running the usecase without checking the capability\n");
-            nErr = AEE_SUCCESS;
-            goto bail;
-        } else if (dsp_capability_status_notification_support.capability == 1) {
-            return true;
-        }
-        if (nErr != AEE_SUCCESS) {
-            GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return false;
-}
-
-int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
-    int nErr    = AEE_SUCCESS;
-    *capability = 0;
-
-    if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
-        nErr = AEE_EBADPARM;
-        GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
-        goto bail;
-    }
-    if (remote_handle_control) {
-        if (domain == CDSP_DOMAIN_ID) {
-            /*
-            * Query the DSP for HMX SUPPORT information
-            * HMX is supported on CDSP only
-            */
-            struct remote_dsp_capability dsp_capability_hmx_dsp;
-            dsp_capability_hmx_dsp.domain       = (uint32_t) domain;
-            dsp_capability_hmx_dsp.attribute_ID = attr;
-            dsp_capability_hmx_dsp.capability   = (uint32_t) 0;
-            nErr                                = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
-                                                                        sizeof(struct remote_dsp_capability));
-            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
-                nErr = AEE_SUCCESS;
-                goto bail;
-            } else if (nErr == AEE_SUCCESS) {
-                *capability = dsp_capability_hmx_dsp.capability;
-            } else {
-                GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
-                goto bail;
-            }
-        } else {
-            nErr = AEE_EUNSUPPORTED;
-            GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return nErr;
-}
-
-int get_hex_arch_ver(int domain, int * arch) {
-    if (!remote_handle_control) {
-        GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
-        return AEE_EUNSUPPORTEDAPI;
-    }
-
-    struct remote_dsp_capability arch_ver;
-    arch_ver.domain       = (uint32_t) domain;
-    arch_ver.attribute_ID = ARCH_VER;
-    arch_ver.capability   = (uint32_t) 0;
-
-    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
-    if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
-        GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
-        return AEE_EUNSUPPORTEDAPI;
-    }
-
-    if (err != AEE_SUCCESS) {
-        GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
-        return err;
-    }
-
-    switch (arch_ver.capability & 0xff) {
-        case 0x68:
-            *arch = 68;
-            return 0;
-        case 0x69:
-            *arch = 69;
-            return 0;
-        case 0x73:
-            *arch = 73;
-            return 0;
-        case 0x75:
-            *arch = 75;
-            return 0;
-        case 0x79:
-            *arch = 79;
-            return 0;
-        case 0x81:
-            *arch = 81;
-            return 0;
-    }
-    return -1;
-}
-
-int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
-    int nErr    = AEE_SUCCESS;
-    *capability = 0;
-
-    if (remote_handle_control) {
-        if (domain == CDSP_DOMAIN_ID) {
-            /*
-            * Query the DSP for HVX SUPPORT information
-            * HVX is supported on CDSP only
-            */
-            struct remote_dsp_capability dsp_capability_hvx_dsp;
-            dsp_capability_hvx_dsp.domain       = (uint32_t) domain;
-            dsp_capability_hvx_dsp.attribute_ID = attr;
-            dsp_capability_hvx_dsp.capability   = (uint32_t) 0;
-            nErr                                = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
-                                                                        sizeof(struct remote_dsp_capability));
-            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
-                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
-                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
-                nErr = AEE_SUCCESS;
-                goto bail;
-            } else if (nErr == AEE_SUCCESS) {
-                *capability = dsp_capability_hvx_dsp.capability;
-            } else {
-                GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
-                goto bail;
-            }
-        } else {
-            nErr = AEE_EUNSUPPORTED;
-            GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
-            goto bail;
-        }
-    } else {
-        nErr = AEE_EUNSUPPORTEDAPI;
-        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
-    }
-
-bail:
-    return nErr;
-}
diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h
deleted file mode 100644
index 7bbae3a0b7..0000000000
--- a/ggml/src/ggml-hexagon/htp-utils.h
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifndef HTP_UTILS_H
-#define HTP_UTILS_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <AEEStdErr.h>
-#include <inttypes.h>
-#include <remote.h>
-#include <rpcmem.h>
-#include <stdbool.h>
-
-/* Offset to differentiate HLOS and Hexagon error codes.
-   Stores the value of AEE_EOFFSET for Hexagon. */
-#ifndef DSP_OFFSET
-#    define DSP_OFFSET 0x80000400
-#endif
-
-/* Errno for connection reset by peer. */
-#ifndef ECONNRESET
-#    ifdef __hexagon__
-#        define ECONNRESET 104
-#    endif
-#endif
-
-/* Abstraction of different OS specific sleep APIs.
-   SLEEP accepts input in seconds. */
-#ifndef SLEEP
-#    ifdef __hexagon__
-#        define SLEEP(x)                      \
-            { /* Do nothing for simulator. */ \
-            }
-#    else
-#        ifdef _WINDOWS
-#            define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
-#        else
-#            define SLEEP(x) sleep(x)        /* sleep accepts input in seconds. */
-#        endif
-#    endif
-#endif
-
-/* Include windows specific header files. */
-#ifdef _WINDOWS
-#    include <sysinfoapi.h>
-#    include <windows.h>
-#    define _CRT_SECURE_NO_WARNINGS         1
-#    define _WINSOCK_DEPRECATED_NO_WARNINGS 1
-/* Including this file for custom implementation of getopt function. */
-#    include "getopt_custom.h"
-#endif
-
-/* Includes and defines for all HLOS except windows */
-#if !defined(__hexagon__) && !defined(_WINDOWS)
-#    include "unistd.h"
-
-#    include <sys/time.h>
-#endif
-
-/* Includes and defines for Hexagon and all HLOS except Windows. */
-#if !defined(_WINDOWS)
-/* Weak reference to remote symbol for compilation. */
-#    pragma weak remote_session_control
-#    pragma weak remote_handle_control
-#    pragma weak remote_handle64_control
-#    pragma weak fastrpc_mmap
-#    pragma weak fastrpc_munmap
-#    pragma weak rpcmem_alloc2
-#endif
-
-#if !defined(_WINDOWS)
-#    pragma weak remote_system_request
-#endif
-/**
- * Wrapper for FastRPC Capability API: query DSP support.
- *
- * @param[out]  domain pointer to supported domain.
- * @return      0          if query is successful.
- *              non-zero   if error, return value points to the error.
- */
-int get_dsp_support(int * domain);
-
-/**
- * Wrapper for FastRPC Capability API: query VTCM information.
- *
- * @param[in]   domain value of domain in the queried.
- * @param[out]  capability capability value of the attribute queried.
- * @param[in]   attr value of the attribute to the queried.
- * @return      0          if query is successful.
- *              non-zero   if error, return value points to the error.
- */
-int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);
-
-/**
- * Wrapper for FastRPC Capability API: query unsigned pd support on CDSP domain.
- *
- * @return      true          if unsigned pd is supported.
- *              false         if unsigned pd is not supported, capability query failed.
- */
-
-bool get_unsignedpd_support(void);
-
-/**
- * Wrapper for FastRPC Capability API: query unsigned pd support.
- *
- * @param[in]   domain value of domain in the queried.
- * @return      true          if unsigned pd is supported.
- *              false         if unsigned pd is not supported, capability query failed.
- */
-
-bool is_unsignedpd_supported(int domain_id);
-
-/**
- * is_valid_domain_id API: query a domain id is valid.
- *
- * @param[in]   domain value of domain in the queried.
- * @param[in]   compute_only value of domain is only compared with CDSP domains supported by the target when enabled.
- * @return      true          if value of domain is valid.
- *              false         if value of domain is not valid.
- */
-
-bool is_valid_domain_id(int domain_id, int compute_only);
-
-/**
- * get_domain API: get domain struct from domain value.
- *
- * @param[in]  domain value of a domain
- * @return     Returns domain struct of the domain if it is supported or else
- *             returns NULL.
- *
- */
-
-domain * get_domain(int domain_id);
-
-/**
- * get_domains_info API: get information for all the domains available on the device
- *
- * @param[in]  domain_type pointer to domain type
- * @param[in]  num_domains pointer to number of domains
- * @param[in]  domains_info pointer to save discovered domains information.
- * @return     0 if query is successful.
- *              non-zero if error, return value points to the error.
- *
- * It is user's responsibility to free the memory used to store the domains info whose address is present in domains_info before closing the application.
- *
- */
-
-int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);
-
-/**
- * get_effective_domain_id API: get effective domain id for given session id
- *
- * @param[in]  domain_name pointer to domain name
- * @param[in]  session_id
- * @param[in]  effec_domain_id pointer to save obtained effective domain id.
- * @return     0 if query is successful.
- *              non-zero if error, return value points to the error.
- *
- */
-
-int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);
-
-/**
- * is_async_fastrpc_supported API: query a domain id has async fastrpc supported or not
- *
- * @param[in]  domain_id value of a domain
- * @return     Returns true or false stating support of Async FastRPC
- *
- */
-
-bool is_async_fastrpc_supported(int domain_id);
-
-/**
- * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
- *
- * @param[in]  domain_id value of a domain
- * @return     Returns true or false stating status notification support information
- *
- */
-bool is_status_notification_supported(int domain_id);
-
-/**
- * get_hmx_support_info API: query the DSP for HMX SUPPORT information
- *
- * @param[in]   domain_id value of a domain
- * @param[out]  capability capability value of the attribute queried.
- * @param[in]   attr value of the attribute to the queried.
- * @return      0 if query is successful.
- *              non-zero if error, return value points to the error.
- *
- */
-int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);
-
-/**
- * get_hex_arch_ver API: query the Hexagon processor architecture version information
- *
- * @param[in]   domain_id value of a domain
- * @param[out]  Arch version (73, 75, ...)
- * @return      0 if query is successful.
- *              non-zero if error, return value points to the error.
- *
- */
-int get_hex_arch_ver(int domain, int * arch);
-
-/**
- * get_hvx_support_info API: query the DSP for HVX SUPPORT information
- *
- * @param[in]   domain_id value of a domain
- * @param[out]  capability capability value of the attribute queried.
- * @param[in]   attr value of the attribute to the queried.
- * @return      0 if query is successful.
- *              non-zero if error, return value points to the error.
- *
- */
-int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  //DSP_CAPABILITIES_UTILS_H
diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
index c7cb2a4e0b..c184637443 100644
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@@ -17,6 +17,12 @@
 #include "htp-msg.h"
 #include "htp-ops.h"
 
+static inline HVX_Vector hvx_load_f32_to_f16(const HVX_Vector * restrict src, const HVX_Vector zero) {
+    HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(src[0], zero);  // 32 elements
+    HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(src[1], zero);  // 32 elements
+    return Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
+}
+
 // Dot product of FP32 and FP16 vectors, accumulating to float
 static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict y, const void * restrict x, unsigned int n, float s) {
     const HVX_Vector * restrict vy = (const HVX_Vector * restrict) y; // fp32
@@ -33,23 +39,19 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
     #pragma unroll(4)
     for (i = 0; i < nvec; i++) {
         // Load y (fp32) and convert into fp16
-        HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero);  // 32 elements
-        HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero);  // 32 elements
-        HVX_Vector y_hf  = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
+        HVX_Vector y_hf  = hvx_load_f32_to_f16(&vy[i*2], zero);
 
         // Load x (fp16)
         HVX_Vector x_hf  = vx[i];
 
         HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
 
-        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+        rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
     }
 
     if (nloe) {
         // Load y (fp32) and convert into fp16
-        HVX_Vector y0_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+0], zero);  // 32 elements
-        HVX_Vector y1_qf = Q6_Vqf32_vsub_VsfVsf(vy[i*2+1], zero);  // 32 elements
-        HVX_Vector y_hf  = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(y1_qf, y0_qf)));
+        HVX_Vector y_hf  = hvx_load_f32_to_f16(&vy[i*2], zero);
 
         // Load x (fp16)
         HVX_Vector x_hf  = vx[i];
@@ -62,13 +64,72 @@ static inline void hvx_dot_f32_f16_aa(float * restrict r, const void * restrict
 
         HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
 
-        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)));
+        rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
     }
 
-    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
+    hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
+}
 
-    hvx_vec_store_u(r, 4, rsum);
+// Dot product of FP32 and FP16 vectors, accumulating to float
+static inline void hvx_dot_f32_f16_aa_rx2(float * restrict r,
+                                          const void * restrict y,
+                                          const void * restrict x0,
+                                          const void * restrict x1,
+                                          unsigned int n,
+                                          float        s) {
+    const HVX_Vector * restrict vy  = (const HVX_Vector * restrict) y;   // fp32
+    const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0;  // fp16
+    const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1;  // fp16
+
+    uint32_t nvec = n / VLEN_FP16;                                       // num full fp16 hvx vectors
+    uint32_t nloe = n % VLEN_FP16;                                       // leftover elements
+
+    const HVX_Vector zero  = Q6_V_vsplat_R(0);
+    HVX_Vector       rsum0 = Q6_V_vsplat_R(0);
+    HVX_Vector       rsum1 = Q6_V_vsplat_R(0);
+
+    uint32_t i = 0;
+
+    #pragma unroll(2)
+    for (i = 0; i < nvec; i++) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector y_hf  = hvx_load_f32_to_f16(&vy[i*2], zero);
+        // Load x (fp16)
+        HVX_Vector x0_hf = vx0[i];
+        HVX_Vector x1_hf = vx1[i];
+
+        HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
+        HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
+
+        rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
+        rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
+    }
+
+    if (nloe) {
+        // Load y (fp32) and convert into fp16
+        HVX_Vector y_hf  = hvx_load_f32_to_f16(&vy[i*2], zero);
+
+        // Load x (fp16)
+        HVX_Vector x0_hf = vx0[i];
+        HVX_Vector x1_hf = vx1[i];
+
+        // Zero-out unused elements
+        // Note that we need to clear both x and y because they may contain NANs
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
+        x0_hf                = Q6_V_vand_QV(bmask, x0_hf);
+        x1_hf                = Q6_V_vand_QV(bmask, x1_hf);
+        y_hf                 = Q6_V_vand_QV(bmask, y_hf);
+
+        HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
+        HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
+
+        rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
+        rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
+    }
+
+    HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
+    hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
 }
 
 // Dot product of two F16 vectors, accumulating to float
@@ -91,7 +152,7 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
 
         HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
 
-        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
+        rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
     }
 
     if (nloe) {
@@ -103,12 +164,62 @@ static inline void hvx_dot_f16_f16_aa(float * restrict r, const void * restrict
 
         HVX_VectorPair xy_qf = Q6_Wqf32_vmpy_VhfVhf(x_hf, y_hf);
 
-        rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
+        rsum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf), Q6_V_hi_W(xy_qf)), rsum));
     }
 
-    rsum = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(rsum), hvx_vec_splat_f32(s));
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
-    hvx_vec_store_u(r, 4, rsum);
+    rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32(rsum));
+    hvx_vec_store_u(r, 4, Q6_Vsf_equals_Vqf32(rsum));
+}
+
+static inline void hvx_dot_f16_f16_aa_rx2(float * restrict r,
+                                          const void * restrict y,
+                                          const void * restrict x0,
+                                          const void * restrict x1,
+                                          unsigned int n,
+                                          float        s) {
+    const HVX_Vector * restrict vx0 = (const HVX_Vector * restrict) x0;  // fp16
+    const HVX_Vector * restrict vx1 = (const HVX_Vector * restrict) x1;  // fp16
+    const HVX_Vector * restrict vy  = (const HVX_Vector * restrict) y;   // fp16
+
+    uint32_t nvec = n / VLEN_FP16;                                       // num full fp16 hvx vectors
+    uint32_t nloe = n % VLEN_FP16;                                       // leftover elements
+
+    const HVX_Vector zero  = Q6_V_vsplat_R(0);
+    HVX_Vector       rsum0 = Q6_V_vsplat_R(0);
+    HVX_Vector       rsum1 = Q6_V_vsplat_R(0);
+
+    uint32_t i = 0;
+
+    #pragma unroll(4)
+    for (i = 0; i < nvec; i++) {
+        HVX_Vector y_hf  = vy[i];
+        HVX_Vector x0_hf = vx0[i];
+        HVX_Vector x1_hf = vx1[i];
+
+        HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
+        HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
+
+        rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
+        rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
+    }
+
+    if (nloe) {
+        HVX_Vector y_hf = vy[i];
+
+        // Load x (fp16) and zero-out unused elements
+        HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 2);
+        HVX_Vector     x0_hf = Q6_V_vand_QV(bmask, vx0[i]);
+        HVX_Vector     x1_hf = Q6_V_vand_QV(bmask, vx1[i]);
+
+        HVX_VectorPair xy0_qf = Q6_Wqf32_vmpy_VhfVhf(x0_hf, y_hf);
+        HVX_VectorPair xy1_qf = Q6_Wqf32_vmpy_VhfVhf(x1_hf, y_hf);
+
+        rsum0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy0_qf), Q6_V_hi_W(xy0_qf)), rsum0));
+        rsum1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)), rsum1));
+    }
+
+    HVX_Vector rsum = Q6_Vqf32_vmpy_VsfVsf(hvx_vec_splat_f32(s), hvx_vec_reduce_sum_f32x2(rsum0, rsum1));
+    hvx_vec_store_u(r, 8, Q6_Vsf_equals_Vqf32(rsum));
 }
 
 // MAD: y (F32) += x (F16) * s (float)
@@ -317,20 +428,22 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
             // Inner loop processing the block from VTCM
             uint32_t ic = 0;
 
+            const bool is_q_fp32 = (q->type == HTP_TYPE_F32);
+
             // Process in blocks of 32 (VLEN_FP32)
-            static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 == 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
+            static_assert(FLASH_ATTN_BLOCK_SIZE / VLEN_FP32 <= 4, "FLASH_ATTN_BLOCK_SIZE changed, fix HVX_Vector_x4 usage");
             HVX_Vector_x4 scores_x4;
             HVX_Vector v_max = hvx_vec_splat_f32(-INFINITY);
             for (uint32_t iv = 0; ic + VLEN_FP32 <= current_block_size; ic += VLEN_FP32, ++iv) {
                 // 1. Compute scores
-                float __attribute__((aligned(VLEN))) scores_arr[FLASH_ATTN_BLOCK_SIZE];
-                for (int j = 0; j < VLEN_FP32; ++j) {
+                float __attribute__((aligned(VLEN))) scores_arr[VLEN_FP32];
+                for (int j = 0; j < VLEN_FP32; j += 2) {
                     const uint32_t cur_ic = ic + j;
                     const uint8_t * k_ptr = k_base + cur_ic * size_k_row_padded;
-                    if (q->type == HTP_TYPE_F32) {
-                        hvx_dot_f32_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
+                    if (is_q_fp32) {
+                        hvx_dot_f32_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
                     } else {
-                        hvx_dot_f16_f16_aa(&scores_arr[j], q_ptr_vtcm, k_ptr, DK, scale);
+                        hvx_dot_f16_f16_aa_rx2(&scores_arr[j], q_ptr_vtcm, k_ptr, k_ptr + size_k_row_padded, DK, scale);
                     }
                 }
 
@@ -403,7 +516,7 @@ static void flash_attn_ext_f16_thread(struct htp_ops_context * octx, int ith, in
                 float s_val;
                 const uint8_t * k_ptr = k_base + ic * size_k_row_padded;
 
-                if (q->type == HTP_TYPE_F32) {
+                if (is_q_fp32) {
                     hvx_dot_f32_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
                 } else {
                     hvx_dot_f16_f16_aa(&s_val, q_ptr_vtcm, k_ptr, DK, scale);
diff --git a/ggml/src/ggml-hexagon/htp/hvx-dump.h b/ggml/src/ggml-hexagon/htp/hvx-dump.h
index e882227893..85201fc345 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-dump.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-dump.h
@@ -28,19 +28,16 @@ static void hvx_vec_dump_f16(char * pref, HVX_Vector v) {
 }
 
 static void hvx_vec_dump_f32_n(char * pref, HVX_Vector v, uint32_t n) {
-    union {
-        HVX_Vector v;
-        float      d[32];
-    } u = { .v = v };
+    HVX_VectorAlias u = { .v = v };
 
     const uint32_t n0 = n / 16;
     const uint32_t n1 = n % 16;
     int            i  = 0;
     for (; i < n0; i++) {
-        hex_dump_f32_line(pref, u.d + (16 * i), 16);
+        hex_dump_f32_line(pref, u.fp32 + (16 * i), 16);
     }
     if (n1) {
-        hex_dump_f32_line(pref, u.d + (16 * i), n1);
+        hex_dump_f32_line(pref, u.fp32 + (16 * i), n1);
     }
 }
 
diff --git a/ggml/src/ggml-hexagon/htp/hvx-reduce.h b/ggml/src/ggml-hexagon/htp/hvx-reduce.h
index 8845fe73ea..1ca7c05d98 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-reduce.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-reduce.h
@@ -44,6 +44,45 @@ static inline HVX_Vector hvx_vec_reduce_sum_qf32(HVX_Vector in) {
     return hvx_vec_reduce_sum_n_qf32(in, 32);
 }
 
+#if __HVX_ARCH__ > 75
+
+static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
+    HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
+    HVX_Vector  sum_sf  = Q6_Vsf_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
+
+    sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 2));
+    sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 4));
+    sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 8));
+    sum_sf = Q6_Vsf_vadd_VsfVsf(sum_sf, Q6_V_vror_VR(sum_sf, VLEN / 16));
+    return sum_sf;
+}
+
+static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
+    unsigned int total = n * 4;  // total vec nbytes
+    unsigned int width = 4;      // fp32 nbytes
+
+    HVX_Vector sum = in, sum_t;
+    while (width < total) {
+        sum_t = Q6_V_vror_VR(sum, width);       // rotate right
+        sum   = Q6_Vsf_vadd_VsfVsf(sum, sum_t); // elementwise sum
+        width = width << 1;
+    }
+    return sum;
+}
+
+#else
+
+static inline HVX_Vector hvx_vec_reduce_sum_f32x2(HVX_Vector in0, HVX_Vector in1) {
+    HVX_VectorPair sump = Q6_W_vshuff_VVR(in1, in0, 4);
+    HVX_Vector  sum_qf  = Q6_Vqf32_vadd_VsfVsf(Q6_V_lo_W(sump), Q6_V_hi_W(sump));
+
+    sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 2));
+    sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 4));
+    sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 8));
+    sum_qf = Q6_Vqf32_vadd_Vqf32Vsf(sum_qf, Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum_qf), VLEN / 16));
+    return Q6_Vsf_equals_Vqf32(sum_qf);
+}
+
 static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n) {
     unsigned int total = n * 4;  // total vec nbytes
     unsigned int width = 4;      // fp32 nbytes
@@ -57,6 +96,8 @@ static inline HVX_Vector hvx_vec_reduce_sum_n_f32(HVX_Vector in, unsigned int n)
     return sum;
 }
 
+#endif
+
 static inline HVX_Vector hvx_vec_reduce_sum_f32(HVX_Vector in) {
     return hvx_vec_reduce_sum_n_f32(in, 32);
 }
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 1603ff2b3b..d251eeed33 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -11,6 +11,7 @@
 
 #include "hex-dma.h"
 #include "hvx-utils.h"
+#include "hvx-dump.h"
 
 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
@@ -320,7 +321,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
     const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
     const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
 
-    // Row sum (qf32)
+    // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
 
     // Multiply and accumulate into int32.
@@ -344,7 +345,7 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
     // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@@ -362,14 +363,14 @@ static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void *
         // Zero out unused scales
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
-    // Reduce and convert into fp32
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
+    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
     hvx_vec_store_u(&s[0], 4, r0_sum);
 }
@@ -402,7 +403,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
     const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);                                     // quants first
     const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);                           // then scales
 
-    // Row sum (qf32)
+    // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
     HVX_Vector r1_sum = Q6_V_vsplat_R(0);
 
@@ -432,8 +433,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
     // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@@ -456,20 +457,18 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
-    // Convert into fp32 and reduce
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
-    HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
-
-    hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(&s[0], 8, rsum);
 }
 
 static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -493,7 +492,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
     const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
     const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
 
-    // Row sum (qf32)
+    // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
 
     // Multiply and accumulate into int32.
@@ -517,7 +516,7 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
     // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@@ -535,14 +534,14 @@ static void vec_dot_q8x4x2_q8x4x2(const int n, float * restrict s, const void *
         // Zero out unused scales
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
-    // Reduce and convert into fp32
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
+    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
     hvx_vec_store_u(&s[0], 4, r0_sum);
 }
@@ -605,8 +604,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
     // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks
@@ -629,20 +628,18 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
-    // Convert into fp32 and reduce
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
-    HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
-
-    hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(&s[0], 8, rsum);
 }
 
 static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
@@ -669,7 +666,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
     const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);               // quants first
     const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);     // then scales
 
-    // Row sum (qf32)
+    // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
 
     // Multiply and accumulate into int32.
@@ -708,7 +705,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
     // Process leftovers
@@ -741,14 +738,14 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n,
         // Zero-out unused scales
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
     }
 
-    // Reduce and convert into fp32
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
+    r0_sum = hvx_vec_reduce_sum_f32(r0_sum);
 
     hvx_vec_store_u(&s[0], 4, r0_sum);
 }
@@ -781,13 +778,13 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
     const uint8_t * restrict y_q = ((const uint8_t *) vy + 0);                                     // quants first
     const uint8_t * restrict y_d = ((const uint8_t *) vy + y_qrow_size);                           // then scales
 
-    // Row sum (qf32)
+    // Row sum (sf)
     HVX_Vector r0_sum = Q6_V_vsplat_R(0);
     HVX_Vector r1_sum = Q6_V_vsplat_R(0);
 
     // Multiply and accumulate into int32.
     // Compute combined scale (fp32).
-    // Apply scale to acc and accumulate into the row sum (qf32).
+    // Apply scale to acc and accumulate into the row sum (f32).
 
     const uint32_t nb   = n / qk;  // num full blocks
     int32_t        nloe = n % qk;  // num leftover elemements (must be signed)
@@ -829,8 +826,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
     // Process leftovers
@@ -867,24 +864,22 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
         HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d));
 
-        // Zero-out unused scales
+        // Zero-out unused values
         HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8);
         r0_dd                = Q6_V_vand_QV(bmask, r0_dd);
         r1_dd                = Q6_V_vand_QV(bmask, r1_dd);
+        r0_ia                = Q6_V_vand_QV(bmask, r0_ia);
+        r1_ia                = Q6_V_vand_QV(bmask, r1_ia);
 
         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd);
         HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd);
 
-        r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa);
-        r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa);
+        r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum));
+        r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum));
     }
 
-    // Convert into fp32 and reduce
-    r0_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(r1_sum));
-    HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
-
-    hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum);
+    hvx_vec_store_u(&s[0], 8, rsum);
 }
 
 static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -913,7 +908,7 @@ static void vec_dot_f16_f16_aa(const int n, float * restrict s, const void * res
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
     }
 
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
     hvx_vec_store_u(&s[0], 4, rsum);
 }
 
@@ -957,11 +952,8 @@ static void vec_dot_f16_f16_aa_rx2(const int n,
         rsum1 = Q6_Vqf32_vadd_Vqf32Vqf32(rsum1, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy1_qf), Q6_V_hi_W(xy1_qf)));
     }
 
-    rsum0 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum0));
-    rsum1 = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum1));
-    HVX_VectorPair p0 = Q6_W_vshuff_VVR(rsum1, rsum0, 4);
-
-    hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
+    HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(Q6_Vsf_equals_Vqf32(rsum0), Q6_Vsf_equals_Vqf32(rsum1));
+    hvx_vec_store_u(&s[0], 8, rsum);
 }
 
 static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -990,7 +982,7 @@ static void vec_dot_f16_f16_uu(const int n, float * restrict s, const void * res
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
     }
 
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
     hvx_vec_store_u(&s[0], 4, rsum);
 }
 
@@ -1042,7 +1034,8 @@ static void vec_dot_f16_f32_uu(const int n, float * restrict s, const void * res
         rsum = Q6_Vqf32_vadd_Vqf32Vqf32(rsum, Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_lo_W(xy_qf),  Q6_V_hi_W(xy_qf)));
     }
 
-    rsum = Q6_Vsf_equals_Vqf32(hvx_vec_reduce_sum_qf32(rsum));
+    // Convert into fp32 and reduce
+    rsum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(rsum));
     hvx_vec_store_u(&s[0], 4, rsum);
 }
 
diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c
index 1b6b2eba4a..e91a16d947 100644
--- a/ggml/src/ggml-hexagon/htp/softmax-ops.c
+++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c
@@ -154,8 +154,8 @@ static void hvx_fast_softmax_f32(const uint8_t * restrict src,
         v_pad[i] = v3;
     }
 
-    v       = hvx_vec_reduce_sum_qf32(sum_vec);
-    sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));
+    v       = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_vec));
+    sum_vec = hvx_vec_repl4(v);
 
     HVX_VectorPred pos_sum   = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
     HVX_Vector     v4        = hvx_vec_inverse_f32(sum_vec);
diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c
index be8be8c4e6..1a27cb6e63 100644
--- a/ggml/src/ggml-hexagon/htp/unary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/unary-ops.c
@@ -57,8 +57,8 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
         sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
     }
 
-    HVX_Vector reduced_sum = hvx_vec_reduce_sum_qf32(sum_v);
-    sum_v                  = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));
+    HVX_Vector reduced_sum = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
+    sum_v                  = hvx_vec_repl4(reduced_sum);
 
     HVX_Vector t_v            = hvx_vec_splat_f32((float) num_elems);
     HVX_Vector denom_v        = hvx_vec_inverse_f32(t_v);
diff --git a/ggml/src/ggml-hexagon/libdl.h b/ggml/src/ggml-hexagon/libdl.h
new file mode 100644
index 0000000000..8ca5016f03
--- /dev/null
+++ b/ggml/src/ggml-hexagon/libdl.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#ifdef _WIN32
+#   define WIN32_LEAN_AND_MEAN
+#   ifndef NOMINMAX
+#       define NOMINMAX
+#   endif
+#   include <windows.h>
+#   include <winevt.h>
+#else
+#    include <dlfcn.h>
+#    include <unistd.h>
+#endif
+#include <filesystem>
+
+namespace fs = std::filesystem;
+
+#ifdef _WIN32
+
+using dl_handle = std::remove_pointer_t<HMODULE>;
+
+struct dl_handle_deleter {
+    void operator()(HMODULE handle) {
+        FreeLibrary(handle);
+    }
+};
+
+static inline dl_handle * dl_load_library(const fs::path & path) {
+    // suppress error dialogs for missing DLLs
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    HMODULE handle = LoadLibraryW(path.wstring().c_str());
+
+    SetErrorMode(old_mode);
+
+    return handle;
+}
+
+static inline void * dl_get_sym(dl_handle * handle, const char * name) {
+    DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+    SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+    void * p = (void *) GetProcAddress(handle, name);
+
+    SetErrorMode(old_mode);
+
+    return p;
+}
+
+static inline const char * dl_error() {
+    return "";
+}
+
+#else
+
+using dl_handle = void;
+
+struct dl_handle_deleter {
+    void operator()(void * handle) {
+        dlclose(handle);
+    }
+};
+
+static inline dl_handle * dl_load_library(const fs::path & path) {
+    dl_handle * handle = dlopen(path.string().c_str(), RTLD_NOW | RTLD_LOCAL);
+    return handle;
+}
+
+static inline void * dl_get_sym(dl_handle * handle, const char * name) {
+    return dlsym(handle, name);
+}
+
+static inline const char * dl_error() {
+    const char *rslt = dlerror();
+    return rslt != nullptr ? rslt : "";
+}
+
+#endif
diff --git a/ggml/src/ggml-hexagon/libggml-htp.inf b/ggml/src/ggml-hexagon/libggml-htp.inf
new file mode 100644
index 0000000000..656d2d9ab2
--- /dev/null
+++ b/ggml/src/ggml-hexagon/libggml-htp.inf
@@ -0,0 +1,38 @@
+[Version]
+Signature   = "$WINDOWS NT$"
+Class       = ComputeAccelerator
+ClassGuid   = {F01A9D53-3FF6-48D2-9F97-C8A7004BE10C}
+Provider    = %GGML%
+DriverVer   = 01/01/2026,1.0.0.0
+CatalogFile = libggml-htp.cat
+PnpLockDown = 1
+
+[DestinationDirs]
+Drivers_Dir = 6
+
+[SourceDisksNames]
+1 = %DiskId%
+
+[SourceDisksFiles]
+libggml-htp-v68.so = 1
+libggml-htp-v69.so = 1
+libggml-htp-v73.so = 1
+libggml-htp-v75.so = 1
+libggml-htp-v81.so = 1
+
+[ControlFlags]
+ExcludeFromSelect = *
+
+[DefaultInstall.NTarm64]
+CopyFiles=Drivers_Dir
+
+[Drivers_Dir]
+libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+libggml-htp-v81.so,,,0x10 ;COPYFLG_NO_OVERWRITE
+
+[Strings]
+GGML   = 'GGML'
+DiskId = 'GGML HTP library'
diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt
index 0259474b6e..fa5fadd112 100644
--- a/ggml/src/ggml-opencl/CMakeLists.txt
+++ b/ggml/src/ggml-opencl/CMakeLists.txt
@@ -101,6 +101,8 @@ set(GGML_OPENCL_KERNELS
     mul_mm_f32_f32_l4_lm
     mul_mm_f16_f32_l4_lm
     mul_mm_q8_0_f32_l4_lm
+    mul_mm_q8_0_f32_8x4
+    gemv_noshuffle_general_q8_0_f32
     mul
     norm
     relu
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 678e40965a..4850c11d14 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -226,7 +226,8 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
         return ADRENO_GPU_GEN::A7X;
     }
 
-    if (strstr(device_name, "830")) {
+    if (strstr(device_name, "830") ||
+        strstr(device_name, "840")) {
         return ADRENO_GPU_GEN::A8X;
     }
 
@@ -529,7 +530,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
     cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
     cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
-    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
+    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
     cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
     cl_kernel kernel_convert_block_q4_0_noshuffle;
     cl_kernel kernel_restore_block_q4_0_noshuffle;
@@ -696,6 +697,8 @@ struct ggml_backend_opencl_context {
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
     cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
+    cl_kernel kernel_mul_mm_q8_0_f32_8x4;
+    cl_kernel CL_mul_mat_vec_q8_0_f32;
 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
     void free() {
@@ -894,6 +897,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q8_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q8_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
+        CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
         CL_CHECK((backend_ctx->kernel_convert_block_q6_K  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
         CL_CHECK((backend_ctx->kernel_restore_block_q6_K  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
         GGML_LOG_CONT(".");
@@ -2290,6 +2294,46 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
+    // mul_mm_q8_0_f32_8x4
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src_q8_8x4_gemm {
+            #include "mul_mm_q8_0_f32_8x4.cl.h"
+       };
+#else
+        const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
+#endif
+        backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
+        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
+    // gemv_noshuffle_general_q8_0_f32
+    {
+        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable "
+                                       " -DSIMDGROUP_WIDTH=" +
+                                       std::to_string(backend_ctx->adreno_wave_size);
+        if (backend_ctx->has_vector_subgroup_broadcast) {
+            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
+        }
+
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src_CL_gemv_general {
+            #include "gemv_noshuffle_general_q8_0_f32.cl.h"
+        };
+#else
+        const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
+#endif
+
+        cl_program prog = build_program_from_source(
+            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
+
+        CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err));
+        CL_CHECK(clReleaseProgram(prog));
+        GGML_LOG_CONT(".");
+    }
+
     std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
             " -cl-mad-enable "
             " -cl-fast-relaxed-math";
@@ -3745,6 +3789,15 @@ inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ct
     return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
 }
 
+inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
+
+    bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);
+
+    size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
+
+    return ((elem_num < 128 * 1024 * 1024) && adreno_kernel);  // max element num: 2**27
+}
+
 static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
 
@@ -4159,6 +4212,130 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
         tensor->extra = extra;
 
+        // Transpose the weights and scales
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+
+            int M = tensor->ne[1];   // ne01
+            int K = tensor->ne[0];   // ne00
+
+            GGML_ASSERT(K % 32 == 0);
+            GGML_ASSERT(M % 4 == 0);
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
+
+            // Transpose weights
+            size_t q_size_bytes = K * M / 4 * sizeof(float);
+            cl_buffer_region region;
+            region.origin = 0;
+            region.size = q_size_bytes;
+            cl_mem qT_d = clCreateSubBuffer(
+                backend_ctx->prealloc_quant_trans.buffer,
+                0,
+                CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+                &err);
+            CL_CHECK(err);
+
+            cl_mem q_d_image1D;
+            cl_mem qT_d_image1D;
+
+            cl_image_format img_fmt_1d;
+            cl_image_desc img_desc_1d;
+
+            img_fmt_1d = { CL_RGBA, CL_FLOAT };
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.image_width = M * K / 4 / 4;
+            img_desc_1d.buffer = extra->q;
+            q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+            CL_CHECK(err);
+
+            img_fmt_1d = { CL_RGBA, CL_FLOAT };
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.image_width = M * K / 4 / 4;
+            img_desc_1d.buffer = qT_d;
+            qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+            CL_CHECK(err);
+
+            int height_q = M / 4;
+            int width_q = K / 4 / 4;
+            kernel = backend_ctx->kernel_transpose_32;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_q));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_q));
+
+            size_t local_size_q[3] = {4, 16, 1};
+            size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+
+            // Transpose scales
+            size_t d_size_bytes = M * (K / 32) * 2;
+            region.origin = 0;
+            region.size = d_size_bytes;
+            cl_mem dT_d = clCreateSubBuffer(
+                backend_ctx->prealloc_scales_trans.buffer,
+                0,
+                CL_BUFFER_CREATE_TYPE_REGION,
+                &region,
+                &err);
+            CL_CHECK(err);
+
+            cl_mem d_d_image1D;
+            cl_mem dT_d_image1D;
+
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
+            img_desc_1d.image_width = M * K / 32;
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.buffer = extra->d;
+            d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+            CL_CHECK(err);
+
+            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
+            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+            img_desc_1d.image_width = M * K / 32 / 4;
+            img_desc_1d.buffer = dT_d;
+            dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
+            CL_CHECK(err);
+
+            int height_s = M / 4;
+            int width_s = K / 32;
+
+            kernel = backend_ctx->kernel_transpose_16_4x1;
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
+
+            size_t local_size_s[3] = {4, 16, 1};
+            size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+
+            // copy transposed buffer contents to original buffers
+            CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+
+            CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+
+            CL_CHECK(clReleaseMemObject(qT_d));
+            CL_CHECK(clReleaseMemObject(dT_d));
+
+            CL_CHECK(clReleaseMemObject(q_d_image1D));
+            CL_CHECK(clReleaseMemObject(d_d_image1D));
+            CL_CHECK(clReleaseMemObject(qT_d_image1D));
+            CL_CHECK(clReleaseMemObject(dT_d_image1D));
+        } // end transpose
+#endif // GGML_OPENCL_USE_ADRENO_KERNELS
+
         return;
     }
     if (tensor->type == GGML_TYPE_Q6_K) {
@@ -4448,6 +4625,36 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
             ggml_nbytes(tensor), NULL, &err);
         CL_CHECK(err);
 
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+        if (enable_adreno_trans_weight(backend_ctx, tensor)) {
+            cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
+
+            int ne00 = tensor->ne[0];
+            int ne01 = tensor->ne[1];
+            GGML_ASSERT(tensor->ne[2] == 1);  // ???
+            GGML_ASSERT(tensor->ne[3] == 1);  // ???
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
+
+            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
+            size_t local_work_size[3] = {64, 1, 1};
+
+            cl_event evt;
+            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
+                global_work_size, local_work_size, 0, NULL, &evt));
+            CL_CHECK(clWaitForEvents(1, &evt));
+
+            CL_CHECK(clEnqueueReadBuffer(
+                queue, data_device, CL_TRUE, offset,
+                size, data, 0, NULL, NULL));
+            CL_CHECK(clReleaseMemObject(data_device));
+            return;
+        }
+#endif
         cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
@@ -7947,6 +8154,252 @@ static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_ten
     CL_CHECK(clReleaseMemObject(D_sub_buffer));
 }
 
+static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const enum ggml_type src0t = src0->type;
+    const enum ggml_type src1t = src1->type;
+
+    GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
+    GGML_ASSERT(src1t == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
+
+    GGML_ASSERT(src1->view_offs == 0);
+    GGML_ASSERT(dst->view_offs == 0);
+
+    const int  ne00 = src0->ne[0];
+    const int  ne01 = src0->ne[1];
+    const int  ne02 = src0->ne[2];
+
+    const int  ne10 = src1->ne[0];
+    const int  ne12 = src1->ne[2];
+
+    const int  ne0 = dst->ne[0];
+    const int  ne1 = dst->ne[1];
+
+    GGML_ASSERT(ne00 == ne10);
+    GGML_ASSERT((ne00 % 32) == 0);
+    GGML_ASSERT(ne0 == ne01);
+
+    cl_context context = backend_ctx->context;
+    cl_kernel kernel;
+
+    // init CL objects
+    cl_int              status;
+    cl_image_format     img_fmt_1d;
+    cl_image_desc       img_desc_1d;
+    cl_buffer_region    region;
+    cl_mem              A_image1d;
+    cl_mem              B_image1d;
+    cl_mem              B_sub_buffer;
+    cl_mem              S_image1d;
+
+    cl_mem              D_image1d;
+    cl_mem              D_sub_buffer;
+
+    int M = ne01;
+    int N = ne1;
+    int K = ne00;
+
+    // create an image for A
+    img_fmt_1d = { CL_R, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = M * K / 4;    // Divide by 4 for char -> float
+    img_desc_1d.buffer = extra0_q8_0->q;
+    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // create an image for Scale
+    img_fmt_1d = { CL_R, CL_HALF_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = M * K / 32;    // Block size is 32
+    img_desc_1d.buffer = extra0_q8_0->d;
+    S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // create a sub_buffer for B
+    region.origin = (extra1->offset); // + src1->view_offs);
+    region.size = K * N * sizeof(float);
+    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+
+    // create an image for B from sub_buffer: RGBA (OCL)
+    img_fmt_1d = {CL_RGBA, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = K * N / 4;
+    img_desc_1d.buffer = B_sub_buffer;
+    B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    // Create subbuffer and image1d_buffer for dst
+    region.origin = (extrad->offset); // + dst->view_offs;
+    region.size = M * N * sizeof(float);
+    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
+    CL_CHECK(status);
+
+    img_fmt_1d = {CL_R, CL_FLOAT};
+    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
+    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+    img_desc_1d.image_width = M * N;
+    img_desc_1d.buffer = D_sub_buffer;
+    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
+    CL_CHECK(status);
+
+    size_t local_work_size[3] = {1, 1, 1};
+    size_t global_work_size[3] = {1, 1, 1};
+
+    if (N == 1) {
+        kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
+
+        int r2 = 1;
+        int r3 = 1;
+        cl_uint k_arg = 0;
+
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &A_image1d));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extra0_q8_0->d));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &B_image1d));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extra1->offset));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extrad->offset));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne00));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne01));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne02));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne10));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne12));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne0));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne1));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r2));
+        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r3));
+
+        size_t wavesize = backend_ctx->adreno_wave_size;
+        local_work_size[0] = wavesize;
+        local_work_size[1] = 4; // reduce factor
+        local_work_size[2] = 1;
+
+        global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
+        global_work_size[1] = 4; // reduce factor
+        global_work_size[2] = 1;
+    } else {
+        cl_ulong offsetd = extrad->offset + dst->view_offs;
+        cl_mem              B_image1d_trans = nullptr;
+        // for B transpose
+        cl_mem B_d = nullptr;
+        int padding;
+
+        //how many extra elements beyond multiple of 8
+        int extra_elements = N % 8;
+
+        //how much padding to add
+        padding = 0;
+        if (extra_elements > 0){
+            padding = 8 - extra_elements;
+        }
+
+        // Specify the starting offset (in bytes)
+        region.origin = 0;
+        // Specify the size of the sub-buffer (divide by 2 for FP16)
+        region.size = K * (N + padding) * sizeof(float)/2;
+        backend_ctx->prealloc_act_trans.allocate(context, region.size);
+        B_d = clCreateSubBuffer(
+            backend_ctx->prealloc_act_trans.buffer,
+            0,
+            CL_BUFFER_CREATE_TYPE_REGION,
+            &region,
+            &status);
+        CL_CHECK(status);
+
+        cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
+        cl_image_desc image_desc_B_d_output = {
+            CL_MEM_OBJECT_IMAGE1D_BUFFER,
+            static_cast<size_t>(K * (N + padding)/4),
+            0, 0, 0, 0, 0, 0, 0, { B_d }
+        };
+        B_image1d_trans = clCreateImage(
+            context,
+            0,
+            &image_format_B_d_output,
+            &image_desc_B_d_output,
+            NULL,
+            &status);
+        CL_CHECK(status);
+
+        int height_B = N/4;
+        if (height_B == 0) {
+            height_B = 1;
+        }
+        int width_B = K/4;
+        int padded_height_B = (N + padding)/4;
+
+        kernel = backend_ctx->kernel_transpose_32_16;
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
+
+        size_t local_size_t[2] = { 1, 16 };
+        size_t global_size_t[2] = {
+            static_cast<size_t>(width_B),
+            static_cast<size_t>(padded_height_B)
+        };
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
+
+        kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
+
+        int N_with_padding = N + padding;
+
+        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
+        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
+        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &B_image1d_trans));
+        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &K));
+        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &M));
+        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &N_with_padding));
+        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &N));
+        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &offsetd));
+
+        global_work_size[0] = (size_t)(N + 7) / 8;
+        global_work_size[1] = (size_t)(M + 3) / 4;
+        global_work_size[2] = 1;
+
+        local_work_size[0] = 2;
+        local_work_size[1] = 128;
+        local_work_size[2] = 1;
+    }
+
+    // enqueue kernel with profiling
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+
+    // deallocate sub buffers and images
+    CL_CHECK(clReleaseMemObject(A_image1d));
+    CL_CHECK(clReleaseMemObject(B_sub_buffer));
+    CL_CHECK(clReleaseMemObject(B_image1d));
+    CL_CHECK(clReleaseMemObject(S_image1d));
+    CL_CHECK(clReleaseMemObject(D_sub_buffer));
+    CL_CHECK(clReleaseMemObject(D_image1d));
+#else
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+#endif
+}
+
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
@@ -8064,6 +8517,13 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     int padding;
     // <--------------------------------------------> //
 
+    // q8_0 x fp32
+    if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
+        enable_adreno_trans_weight(backend_ctx, src0)) {
+            ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
+            return;
+    }
+
     // q4_0 x fp32
     if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
         // TODO: remove duplicate definitions of image description + format -- move to top
diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl
index adf576a839..9fb434713d 100644
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -274,6 +274,37 @@ kernel void kernel_restore_block_q8_0(
     }
 }
 
+kernel void kernel_restore_block_q8_0_trans(
+    global uchar * src_q,
+    global half  * src_d,
+    global block_q8_0 * dst,
+    uint ne00,
+    uint ne01
+){
+    uint num_blk_per_row = ne00 / QK8_0;
+
+    global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0) * num_blk_per_row;
+    global uchar      * q = (global uchar *) src_q + get_global_id(0) * 4; // 4 8-bit packed
+    global half       * d = (global half *) src_d + get_global_id(0);
+
+    for (uint blk = 0; blk < num_blk_per_row; blk++) {
+        b->d = *d;
+
+        for (uint i = 0; i < QK8_0; i+=4) {
+            b->qs[i]   = q[0];
+            b->qs[i+1] = q[1];
+            b->qs[i+2] = q[2];
+            b->qs[i+3] = q[3];
+
+            q += 4 * ne01; // M stride
+        }
+
+        d += ne01;
+
+        b++;
+    }
+}
+
 //------------------------------------------------------------------------------
 // kernel_convert_block_q6_K
 // Convert the block_q6_K format to 3 separate arrays (AOS -> SOA).
diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
new file mode 100644
index 0000000000..f944ef3a99
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl
@@ -0,0 +1,195 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#endif
+
+#define QK8_0 32
+#define N_SIMDGROUP 4
+
+#define dequantizeBlockAccum_ns_sgbroadcast_1(total_sums, bits8, scale, y) \
+    float shared_y; \
+    char elem; \
+                                             \
+    shared_y = sub_group_broadcast(y.s0, 0); \
+    elem = (char)(bits8.s0 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 0); \
+    elem = (char)((bits8.s0 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 0); \
+    elem = (char)((bits8.s0 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 0); \
+    elem = (char)((bits8.s0 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s4, 0); \
+    elem = (char)(bits8.s1 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 0); \
+    elem = (char)((bits8.s1 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 0); \
+    elem = (char)((bits8.s1 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 0); \
+    elem = (char)((bits8.s1 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s0, 1); \
+    elem = (char)(bits8.s2 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 1); \
+    elem = (char)((bits8.s2 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 1); \
+    elem = (char)((bits8.s2 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 1); \
+    elem = (char)((bits8.s2 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s4, 1); \
+    elem = (char)(bits8.s3 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 1); \
+    elem = (char)((bits8.s3 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 1); \
+    elem = (char)((bits8.s3 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 1); \
+    elem = (char)((bits8.s3 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s0, 2); \
+    elem = (char)(bits8.s4 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 2); \
+    elem = (char)((bits8.s4 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 2); \
+    elem = (char)((bits8.s4 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 2); \
+    elem = (char)((bits8.s4 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s4, 2); \
+    elem = (char)(bits8.s5 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 2); \
+    elem = (char)((bits8.s5 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 2); \
+    elem = (char)((bits8.s5 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 2); \
+    elem = (char)((bits8.s5 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s0, 3); \
+    elem = (char)(bits8.s6 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s1, 3); \
+    elem = (char)((bits8.s6 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s2, 3); \
+    elem = (char)((bits8.s6 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s3, 3); \
+    elem = (char)((bits8.s6 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+                                             \
+    shared_y = sub_group_broadcast(y.s4, 3); \
+    elem = (char)(bits8.s7 & 0x000000FF); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s5, 3); \
+    elem = (char)((bits8.s7 & 0x0000FF00) >> 8); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s6, 3); \
+    elem = (char)((bits8.s7 & 0x00FF0000) >> 16); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+    shared_y = sub_group_broadcast(y.s7, 3); \
+    elem = (char)((bits8.s7 & 0xFF000000) >> 24); \
+    total_sums += convert_int(elem) * scale * shared_y; \
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_64
+#endif
+__kernel void kernel_gemv_noshuffle(
+        __read_only  image1d_buffer_t src0_q,  // quantized A
+        global half  * src0_d,  // A scales
+        __read_only  image1d_buffer_t src1,    // B
+        ulong offset1,            // offset to B (0)
+        global float * dst,     // C
+        ulong offsetd,            // offset to C
+        int ne00,               // K
+        int ne01,               // M
+        int ne02,               // 1
+        int ne10,               // K
+        int ne12,               // 1
+        int ne0,                // M
+        int ne1,                // N
+        int r2,                 // 1
+        int r3)
+{
+    uint groupId = get_local_id(1);
+    uint gid     = get_global_id(0);
+    ushort slid    = get_sub_group_local_id();
+
+    uint K = ne00;
+    uint M = ne01;
+
+    uint LINE_STRIDE_A = M;
+    uint BLOCK_STRIDE_A = 8 * M;   // 32 / 4 = 8
+
+    __private uint8     regA;
+    __private half      regS;
+    __private float8    regB;
+
+    __private float totalSum = (float)(0.0f);
+
+    // loop along K in block granularity, skip 4 blocks every iter
+    #pragma unroll 1 /* tell compiler not to unroll */
+    for (uint k = groupId; k < (K / QK8_0); k += N_SIMDGROUP) {
+        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of one rows
+        // first 4 fibers in each wave load 8 B values to its private scope
+        if (slid < 4) {
+            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
+            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
+        }
+
+        // load weights for one block in consecutive rows
+        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
+        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
+        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
+        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
+        regA.s4 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
+        regA.s5 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
+        regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
+        regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
+
+        dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB);
+    }
+
+    // reduction in local memory, assumes #wave=4
+    __local float reduceLM[SIMDGROUP_WIDTH * 3];
+    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
+    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
+    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
+    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
+
+    // 1 outputs per fiber in wave 0
+    if (groupId == 0) {
+        dst = (global float*)((global char*)dst + offsetd);
+        dst[gid] = totalSum;
+    }
+}
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl
new file mode 100644
index 0000000000..51ce2121ce
--- /dev/null
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl
@@ -0,0 +1,129 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+
+#ifdef cl_qcom_reqd_sub_group_size
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#ifdef ADRENO_GPU
+REQD_SUBGROUP_SIZE_128
+#endif
+
+kernel void kernel_mul_mm_q8_0_f32_8x4(
+        global const uint * src0_q,
+        global const half  * src0_d,
+        __read_only image1d_buffer_t src1,
+        global float * dst,
+        int k,
+        int m,
+        int n,
+        int n_no_padding,
+        ulong offsetd
+) {
+
+    int m_4 = m >> 2;
+    int n_4 = n >> 2;
+
+    int gy   = get_global_id(0);
+    int gx   = get_global_id(1);
+    int gx_2 = gx << 2;
+    dst  = (global float *)((global char*)dst  + offsetd);
+
+
+    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0;
+    half8 B;
+    half4 deq;
+
+    __global const uint* wptr = src0_q + gx_2;
+    __global const half* sptr = src0_d + gx_2;
+
+      for (int i = 0; i < k; i += 4) {
+        uint4 pack4 = vload4(0, wptr + (i / 4) * m);
+        half4 scale = vload4(0, sptr + (i / 32) * m);
+
+        char4 p0 = as_char4(pack4.s0);
+        char4 p1 = as_char4(pack4.s1);
+        char4 p2 = as_char4(pack4.s2);
+        char4 p3 = as_char4(pack4.s3);
+
+        // ------------------- j = 0 (k = i+0) -------------------
+        B.s0123 = read_imageh(src1, gy * 2 + (i + 0) * n_4);
+        B.s4567 = read_imageh(src1, gy * 2 + (i + 0) * n_4 + 1);
+
+        half4 wj0 = convert_half4((char4)(p0.s0, p1.s0, p2.s0, p3.s0)) * scale;
+
+        c0 += B * wj0.s0;
+        c1 += B * wj0.s1;
+        c2 += B * wj0.s2;
+        c3 += B * wj0.s3;
+
+        // ------------------- j = 1 (k = i+1) -------------------
+        B.s0123 = read_imageh(src1, gy * 2 + (i + 1) * n_4);
+        B.s4567 = read_imageh(src1, gy * 2 + (i + 1) * n_4 + 1);
+
+        half4 wj1 = convert_half4((char4)(p0.s1, p1.s1, p2.s1, p3.s1)) * scale;
+
+        c0 += B * wj1.s0;
+        c1 += B * wj1.s1;
+        c2 += B * wj1.s2;
+        c3 += B * wj1.s3;
+
+        // ------------------- j = 2 (k = i+2) -------------------
+        B.s0123 = read_imageh(src1, gy * 2 + (i + 2) * n_4);
+        B.s4567 = read_imageh(src1, gy * 2 + (i + 2) * n_4 + 1);
+
+        half4 wj2 = convert_half4((char4)(p0.s2, p1.s2, p2.s2, p3.s2)) * scale;
+
+        c0 += B * wj2.s0;
+        c1 += B * wj2.s1;
+        c2 += B * wj2.s2;
+        c3 += B * wj2.s3;
+
+        // ------------------- j = 3 (k = i+3) -------------------
+        B.s0123 = read_imageh(src1, gy * 2 + (i + 3) * n_4);
+        B.s4567 = read_imageh(src1, gy * 2 + (i + 3) * n_4 + 1);
+
+        half4 wj3 = convert_half4((char4)(p0.s3, p1.s3, p2.s3, p3.s3)) * scale;
+
+        c0 += B * wj3.s0;
+        c1 += B * wj3.s1;
+        c2 += B * wj3.s2;
+        c3 += B * wj3.s3;
+    }
+
+    int idx = (gy << 3) * m + (gx << 2);
+
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
+        idx += m;
+    }
+    if(idx+3 < m*n_no_padding){
+        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
+    }
+}
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 8d83b2446b..651b875b63 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -123,6 +123,15 @@ static __dpct_inline__ T op_log(T x) {
     return sycl::log(x);
 }
 
+template<typename T>
+static __dpct_inline__ T op_softplus(T x) {
+    const float xf = (float) x;
+    const float ax = sycl::fabs(xf);
+    const float m  = sycl::fmax(xf, 0.0f);
+    const float y  = m + sycl::log1p(sycl::exp(-ax));
+    return (T) y;
+}
+
 template<typename T>
 static __dpct_inline__ T op_neg(T x) {
     return -x;
@@ -695,6 +704,12 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
         });
 }
 
+static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
+        return op_softplus(x);
+    });
+}
+
 static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
         return op_neg(x);
@@ -1101,6 +1116,11 @@ void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_log(ctx, dst);
 }
 
+void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_softplus(ctx, dst);
+}
+
 void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_neg(ctx, dst);
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index 0913a2e529..7c71974687 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -61,6 +61,8 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3a4c092af5..74b4ed91cc 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2263,6 +2263,65 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten
     diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
 }
 
+static void tri_f32_sycl(
+    const float * src,
+    float * dst,
+    const int64_t ne0,
+    const int64_t ne1,
+    const int64_t ne2,
+    const int64_t ne3,
+    const ggml_tri_type ttype,
+    dpct::queue_ptr main_stream
+) {
+    const size_t total = (size_t) ne0 * (size_t) ne1 * (size_t) ne2 * (size_t) ne3;
+
+    main_stream->parallel_for(sycl::range<1>(total), [=](sycl::id<1> tid) {
+        const int64_t idx = (int64_t) tid[0];
+
+        const int64_t i0 = idx % ne0;
+        const int64_t t1 = idx / ne0;
+        const int64_t i1 = t1 % ne1;
+
+        bool keep = false;
+        switch (ttype) {
+            case GGML_TRI_TYPE_LOWER:      keep = (i0 <  i1); break;
+            case GGML_TRI_TYPE_LOWER_DIAG: keep = (i0 <= i1); break;
+            case GGML_TRI_TYPE_UPPER:      keep = (i0 >  i1); break;
+            case GGML_TRI_TYPE_UPPER_DIAG: keep = (i0 >= i1); break;
+            default: keep = false; break;
+        }
+
+        dst[idx] = keep ? src[idx] : 0.0f;
+    });
+}
+
+static void ggml_sycl_op_tri(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    GGML_ASSERT(src0);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const float * src0_dd = static_cast<const float *>(src0->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
+
+    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);
+
+    const int64_t ne0 = src0->ne[0];
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];
+    const int64_t ne3 = src0->ne[3];
+
+    tri_f32_sycl(src0_dd, dst_dd, ne0, ne1, ne2, ne3, ttype, main_stream);
+}
+
+
 inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -3786,6 +3845,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                 case GGML_UNARY_OP_EXP:
                     ggml_sycl_exp(ctx, dst);
                     break;
+                case GGML_UNARY_OP_SOFTPLUS:
+                    ggml_sycl_softplus(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_SGN:
                     ggml_sycl_sgn(ctx, dst);
                     break;
@@ -3912,6 +3974,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_TRANSPOSE:
             GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
             break;
+        case GGML_OP_TRI:
+            ggml_sycl_op_tri(ctx, dst);
+            break;
         case GGML_OP_DIAG_MASK_INF:
             ggml_sycl_diag_mask_inf(ctx, dst);
             break;
@@ -4404,6 +4469,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_GELU_ERF:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_SOFTPLUS:
                 case GGML_UNARY_OP_ELU:
                     return true;
                 case GGML_UNARY_OP_FLOOR:
@@ -4616,6 +4682,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             return true;
         case GGML_OP_CONT:
             return op->src[0]->type != GGML_TYPE_BF16;
+        case GGML_OP_TRI:
+            {
+                const ggml_tensor * src0 = op->src[0];
+                return src0 &&
+                       op->type == GGML_TYPE_F32 &&
+                       ggml_is_contiguous(src0);
+            }
         case GGML_OP_DIAG_MASK_INF:
             return true;
         case GGML_OP_SOFT_MAX:
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 3852867c29..a99375c088 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -11956,7 +11956,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
         }
     }
     if (mmq) {
-        ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
+        vk_pipeline pipeline_quantize_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline_quantize_q8_1, num_it);
     }
 
     ggml_pipeline_allocate_descriptor_sets(ctx);
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
index de7c132a62..b682216146 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl
@@ -114,7 +114,7 @@ struct Params {
 #define PARAMS_BINDING 4
 #endif
 
-@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<f32>;
+@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>;
 @group(0) @binding(PARAMS_BINDING) var<uniform> params: Params;
 
 // Just a very small float value.
@@ -160,14 +160,21 @@ fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 {
     return v;
 }
 
+fn load_f32x4(buf: ptr<storage, array<vec4<f32>>, read_write>, scalar_index: u32) -> vec4<f32> {
+    return (*buf)[scalar_index >> 2u];
+}
+
+fn load_kvx4(buf: ptr<storage, array<vec4<KV_TYPE>>, read_write>, scalar_index: u32) -> vec4<KV_TYPE> {
+    return (*buf)[scalar_index >> 2u];
+}
 
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
-        @builtin(local_invocation_id) local_id: vec3<u32>,
-        @builtin(subgroup_id) subgroup_id: u32,
-        @builtin(subgroup_size) subgroup_size: u32,
-        @builtin(num_subgroups) num_subgroups: u32,
-        @builtin(subgroup_invocation_id) sg_inv_id: u32) {
+    @builtin(local_invocation_id) local_id: vec3<u32>,
+    @builtin(subgroup_id) subgroup_id: u32,
+    @builtin(subgroup_size) subgroup_size: u32,
+    @builtin(num_subgroups) num_subgroups: u32,
+    @builtin(subgroup_invocation_id) sg_inv_id: u32) {
 
     // initialize row max for online softmax
     for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) {
@@ -231,9 +238,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
 
     for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) {
       // clear inter_shmem to ensure zero-initialized accumulators
-      for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
-          inter_shmem[elem_idx] = 0.0;
-      }
+        for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) {
+            inter_shmem[elem_idx] = 0.0;
+        }
 
       // load k tile into shared memory
 #if defined(KV_Q4_0)
@@ -309,48 +316,77 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
 
       // accumulate q block * k block into registers across the entire KV tile
       // TODO: this loop seems to be the current largest bottleneck
-      for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
-          let inter_offset = kv_block * SG_MAT_N;
-          var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<
-              subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
+      // this bracket exists to scope the lifetime of variables, reducing register pressure
+      {
 #ifdef KV_DIRECT
-          let k_block_row = kv_tile + kv_block * SG_MAT_N;
-          let k_global_offset = k_head_offset + k_block_row * params.stride_k1;
+          let k_block_row = kv_tile + subgroup_id * SG_MAT_N;
+          var k_global_offset = k_head_offset + k_block_row * params.stride_k1;
 #else
-          let k_block_offset = kv_block * SG_MAT_N * HEAD_DIM_QK;
+          var k_block_offset = subgroup_id * SG_MAT_N * HEAD_DIM_QK;
 #endif
-          for (var head_dim_block = 0u; head_dim_block < HEAD_DIM_QK; head_dim_block += SG_MAT_K) {
-              // load q submatrix from shared memory
-              var q_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
-                  &q_shmem,
-                  head_dim_block,
-                  false,
-                  HEAD_DIM_QK
-              );
+          for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) {
+              let inter_offset = kv_block * SG_MAT_N;
+              var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE);
+
+              var q_cur = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, 0u, false, HEAD_DIM_QK);
 
-              // load k submatrix from device or shared memory
 #ifdef KV_DIRECT
-              var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
-                  &K,
-                  k_global_offset + head_dim_block,
-                  true,
-                  params.stride_k1
-              );
+              var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + 0u, true, params.stride_k1);
 #else
-              var k_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(
-                  &kv_shmem,
-                  k_block_offset + head_dim_block,
-                  true,
-                  HEAD_DIM_QK
-              );
+              var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + 0u, true, HEAD_DIM_QK);
 #endif
-              acc = subgroupMatrixMultiplyAccumulate(q_sg_mat, k_sg_mat, acc);
+
+              var t: u32 = 1u;
+              for (; t + 1u < HEAD_DIM_QK / SG_MAT_K; t += 2u) {
+                  let h0 = t * SG_MAT_K;
+                  var q0 = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h0, false, HEAD_DIM_QK);
+#ifdef KV_DIRECT
+                  var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h0, true, params.stride_k1);
+#else
+                  var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h0, true, HEAD_DIM_QK);
+#endif
+                  acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
+                  q_cur = q0;
+                  k_cur = k0;
+
+                  let h1 = (t + 1u) * SG_MAT_K;
+                  var q1g = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h1, false, HEAD_DIM_QK);
+#ifdef KV_DIRECT
+                  var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h1, true, params.stride_k1);
+#else
+                  var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h1, true, HEAD_DIM_QK);
+#endif
+                  acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
+                  q_cur = q1g;
+                  k_cur = k1g;
+              }
+
+              // handle odd tail
+              if (t < HEAD_DIM_QK / SG_MAT_K) {
+                  let h = t * SG_MAT_K;
+                  var qn = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h, false, HEAD_DIM_QK);
+#ifdef KV_DIRECT
+                  var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h, true, params.stride_k1);
+#else
+                  var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h, true, HEAD_DIM_QK);
+#endif
+                  acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
+                  q_cur = qn;
+                  k_cur = kn;
+              }
+
+              acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc);
+
+#ifdef KV_DIRECT
+              k_global_offset += num_subgroups * SG_MAT_N * params.stride_k1;
+#else
+              k_block_offset += num_subgroups * SG_MAT_N * HEAD_DIM_QK;
+#endif
+              subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
           }
-
-          // store acc to shared memory for softmax (S matrix from paper)
-          subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE);
       }
 
+
 #ifdef MASK
       // load mask tile into shared memory for this KV block
       // TODO: optimize and skip if mask is -INF for the entire tile
@@ -495,7 +531,6 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                   false,
                   HEAD_DIM_V
               );
-
               for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) {
                   let p_offset = kv_block * SG_MAT_N;
                   var p_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(
@@ -527,11 +562,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                   // O += P * V
                   o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat);
               }
-
               // store O back to shared memory
               subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V);
       }
-
       workgroupBarrier();
     }
 
@@ -566,26 +599,38 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 o_shmem[idx] = f16(val);
             }
     }
-
     workgroupBarrier();
 #endif
-
-    // write output back to global memory
     for (var q_tile_row = subgroup_id;
-         q_tile_row < Q_TILE;
-         q_tile_row += num_subgroups) {
-            let global_q_row = q_row_start + q_tile_row;
-            if (global_q_row >= params.seq_len_q) {
-                break;
-            }
+        q_tile_row < Q_TILE;
+        q_tile_row += num_subgroups) {
 
-            let exp_sum = exp_sum_shmem[q_tile_row];
-            let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0);
+        let global_q_row = q_row_start + q_tile_row;
+        if (global_q_row >= params.seq_len_q) { break; }
 
-            for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) {
-                let o_val = o_shmem[q_tile_row * HEAD_DIM_V + elem_idx];
-                let scaled = f32(o_val) * scale;
-                dst[dst_global_offset + q_tile_row * dst2_stride + elem_idx] = scaled;
-            }
+        let exp_sum = exp_sum_shmem[q_tile_row];
+        let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0);
+
+        let row_base: u32 = dst_global_offset + q_tile_row * dst2_stride;
+
+        for (var elem_base = sg_inv_id * 4u;
+            elem_base < HEAD_DIM_V;
+            elem_base += subgroup_size * 4u) {
+
+            let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u);
+            let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u);
+            let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u);
+            let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u);
+
+            let v = vec4<f32>(
+                f32(o_shmem[i0]) * scale,
+                f32(o_shmem[i1]) * scale,
+                f32(o_shmem[i2]) * scale,
+                f32(o_shmem[i3]) * scale
+            );
+
+            let dst_vec_index: u32 = (row_base + elem_base) >> 2u;
+            dst[dst_vec_index] = v;
+        }
     }
 }
diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1
new file mode 100644
index 0000000000..21fd063ebe
--- /dev/null
+++ b/scripts/snapdragon/windows/run-bench.ps1
@@ -0,0 +1,40 @@
+
+#!/usr/bin/env pwsh
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+$cli_opts=$args
+
+$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+if ($null -ne $env:M) {
+    $model=$env:M
+}
+
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\llama-bench.exe" `
+    --mmap 0 -m $basedir\..\..\gguf\$model `
+    --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
+    --batch-size 128 -ngl 99 --device $device $cli_opts
diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1
new file mode 100644
index 0000000000..b13161aa63
--- /dev/null
+++ b/scripts/snapdragon/windows/run-cli.ps1
@@ -0,0 +1,53 @@
+
+#!/usr/bin/env pwsh
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+$cli_opts=$args
+
+$model="Llama-3.2-3B-Instruct-Q4_0.gguf"
+if ($null -ne $env:M) {
+    $model=$env:M
+}
+
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:E) {
+    $env:GGML_HEXAGON_EXPERIMENTAL=$env:E
+}
+
+if ($null -ne $env:SCHED) {
+    $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
+}
+
+if ($null -ne $env:PROF) {
+    $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\llama-completion.exe" `
+    --no-mmap -no-cnv -m $basedir\..\..\gguf\$model `
+    --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 `
+    --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on `
+    -ngl 99 --device $device $cli_opts
diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1
new file mode 100644
index 0000000000..70094af9bc
--- /dev/null
+++ b/scripts/snapdragon/windows/run-tool.ps1
@@ -0,0 +1,56 @@
+
+#!/usr/bin/env pwsh
+
+# Basedir on device
+$basedir=".\pkg-snapdragon"
+
+if ($args.Count -eq 0) {
+    Write-Host "No arguments provided.Expected the tool and argument to run."
+    exit -1
+}
+
+$tool=$args[0]
+$cli_opts=@()
+
+if ($args.Count -gt 1) {
+    $cli_opts=$args[1..($args.Count - 1)]
+    $remainingArgs = $args[1..($args.Count - 1)]
+}
+
+$device="HTP0"
+if ($null -ne $env:D) {
+    $device=$env:D
+}
+
+if ($null -ne $env:V) {
+    $env:GGML_HEXAGON_VERBOSE=$env:V
+}
+
+if ($null -ne $env:E) {
+    $env:GGML_HEXAGON_EXPERIMENTAL=$env:E
+}
+
+if ($null -ne $env:SCHED) {
+    $env:GGML_SCHED_DEBUG=$env:SCHED; $cli_opts="$cli_opts -v"
+}
+
+if ($null -ne $env:PROF) {
+    $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
+}
+
+if ($null -ne $env:OPMASK) {
+    $env:GGML_HEXAGON_OPMASK=$env:OPMASK
+}
+
+if ($null -ne $env:NHVX) {
+    $env:GGML_HEXAGON_NHVX=$env:NHVX
+}
+
+if ($null -ne $env:NDEV) {
+    $env:GGML_HEXAGON_NDEV=$env:NDEV
+}
+
+$env:ADSP_LIBRARY_PATH="$basedir\lib"
+
+& "$basedir\bin\$tool" `
+    $cli_opts
diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1
new file mode 100644
index 0000000000..0f3244cc9d
--- /dev/null
+++ b/scripts/snapdragon/windows/setup-build.ps1
@@ -0,0 +1,105 @@
+# Requires Run as Administrator is NOT strictly necessary for User-scope env vars,
+# but recommended for creating directories in C:\ root if permissions are restricted.
+
+$ErrorActionPreference = "Stop"
+
+# --- Configuration ---
+$BaseDir = "C:\Qualcomm"
+
+# SDK 1: Hexagon
+$HexagonUrl     = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
+$HexagonParent  = Join-Path $BaseDir "Hexagon_SDK"
+$HexagonSdkVersion   = "6.4.0.2"
+$HexagonToolsVersion = "19.0.04"
+$HexagonSdkTarget    = Join-Path $HexagonParent $HexagonSdkVersion
+$HexagonToolsTarget  = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion"
+
+# SDK 2: OpenCL
+$OpenCLUrl      = "https://github.com/snapdragon-toolchain/opencl-sdk/releases/download/v2.3.2/adreno-opencl-sdk-v2.3.2-arm64-wos.tar.xz"
+$OpenCLParent   = Join-Path $BaseDir "OpenCL_SDK"
+$OpenCLVersion  = "2.3.2"
+$OpenCLTarget   = Join-Path $OpenCLParent $OpenCLVersion
+
+# --- Helper Function ---
+function Install-QualcommSDK {
+    param (
+        [string]$Url,
+        [string]$ParentDir,
+        [string]$TargetDir,
+        [string]$Name
+    )
+
+    # 1. Create Parent Directory
+    if (-not (Test-Path -Path $ParentDir)) {
+        Write-Host "Creating directory: $ParentDir" -ForegroundColor Cyan
+        New-Item -Path $ParentDir -ItemType Directory -Force | Out-Null
+    }
+
+    # 2. Check for Specific Version Directory
+    if (Test-Path -Path $TargetDir) {
+        Write-Host "$Name ($TargetDir) already exists. Skipping download." -ForegroundColor Green
+    }
+    else {
+        Write-Host "$Name not found. preparing to download..." -ForegroundColor Yellow
+
+        # Create the target directory to extract into
+        New-Item -Path $TargetDir -ItemType Directory -Force | Out-Null
+
+        # Define temporary archive path
+        $TempFile = Join-Path $ParentDir "temp_sdk.tar.xz"
+
+        try {
+            # Download
+            Write-Host "Downloading from: $Url"
+            Invoke-WebRequest -Uri $Url -OutFile $TempFile
+
+            # Untar
+            # Note: We assume Windows includes tar.exe (Win 10 build 17063+)
+            Write-Host "Extracting archive to $TargetDir..."
+
+            # We use -C to extract contents INTO the target directory created above
+            tar -xJvf $TempFile -C $TargetDir\..
+
+            Write-Host "Extraction complete." -ForegroundColor Green
+        }
+        catch {
+            Write-Error "Failed to download or extract $Name. Error: $_"
+            # Cleanup target dir if failed so script tries again next time
+            Remove-Item -Path $TargetDir -Recurse -Force -ErrorAction SilentlyContinue
+        }
+        finally {
+            # Cleanup Archive
+            if (Test-Path $TempFile) { Remove-Item $TempFile -Force }
+        }
+    }
+}
+
+# --- Execution ---
+
+# 1. Ensure Base C:\Qualcomm exists
+if (-not (Test-Path $BaseDir)) {
+    New-Item -Path $BaseDir -ItemType Directory -Force | Out-Null
+}
+
+# 2. Run Install Logic
+Install-QualcommSDK -Url $HexagonUrl -ParentDir $HexagonParent -TargetDir $HexagonSdkTarget -Name "Hexagon SDK"
+Install-QualcommSDK -Url $OpenCLUrl -ParentDir $OpenCLParent -TargetDir $OpenCLTarget -Name "OpenCL SDK"
+
+# --- Environment Variables ---
+
+Write-Host "`nSetting Environment Variables..." -ForegroundColor Cyan
+
+# Set OPENCL_SDK_ROOT
+[System.Environment]::SetEnvironmentVariable('OPENCL_SDK_ROOT', $OpenCLTarget, [System.EnvironmentVariableTarget]::User)
+$env:OPENCL_SDK_ROOT = $OpenCLTarget # Set for current session as well
+Write-Host "OPENCL_SDK_ROOT set to:  $OpenCLTarget"
+
+# Set HEXAGON_SDK_ROOT
+[System.Environment]::SetEnvironmentVariable('HEXAGON_SDK_ROOT', $HexagonSdkTarget, [System.EnvironmentVariableTarget]::User)
+$env:HEXAGON_SDK_ROOT = $HexagonSdkTarget # Set for current session as well
+Write-Host "HEXAGON_SDK_ROOT set to: $HexagonSdkTarget"
+
+# Set HEXAGON_SDK_ROOT
+[System.Environment]::SetEnvironmentVariable('HEXAGON_TOOLS_ROOT', $HexagonToolsTarget, [System.EnvironmentVariableTarget]::User)
+$env:HEXAGON_TOOLS_ROOT = $HexagonToolsTarget # Set for current session as well
+Write-Host "HEXAGON_TOOLS_ROOT set to: $HexagonToolsTarget"
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index c838276158..81e79a9470 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-ebc3a0f4a56be1c9424a89fbec09962ac34fde85
+a8db410a252c8c8f2d120c6f2e7133ebe032f35d
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index f3c9b49f30..c35cd6761b 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1772,8 +1772,6 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
     io.write(&v_trans, sizeof(v_trans));
     io.write(&n_layer, sizeof(n_layer));
 
-    std::vector<uint8_t> tmp_buf;
-
     // Iterate and write all the keys first, each row is a cell
     // Get whole range at a time
     for (const auto & layer : layers) {
@@ -1791,7 +1789,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
         const uint64_t k_size_row = ggml_row_size(k->type, n_embd_k_gqa);
         io.write(&k_size_row, sizeof(k_size_row));
 
-        // Read each range of cells of k_size length each into tmp_buf and write out
+        // Read each range of cells of k_size length and write out
         for (const auto & range : cr.data) {
             const size_t range_size = range.second - range.first;
             const size_t buf_size = range_size * k_size_row;
@@ -1818,7 +1816,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
             const uint64_t v_size_row = ggml_row_size(v->type, n_embd_v_gqa);
             io.write(&v_size_row, sizeof(v_size_row));
 
-            // Read each range of cells of v_size length each into tmp_buf and write out
+            // Read each range of cells of v_size length and write out
             for (const auto & range : cr.data) {
                 const size_t range_size = range.second - range.first;
                 const size_t buf_size = range_size * v_size_row;
@@ -1852,7 +1850,7 @@ void llama_kv_cache::state_write_data(llama_io_write_i & io, const cell_ranges_t
 
             // For each row, we get the element values of each cell
             for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                // Read each range of cells of v_size_el length and write out
                 for (const auto & range : cr.data) {
                     const size_t range_size = range.second - range.first;
                     const size_t src_offset = (range.first + j * kv_size) * v_size_el;
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index 812bf25304..f0038036dc 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -785,23 +785,21 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
     io.write(&s_trans, sizeof(s_trans));
     io.write(&n_layer,   sizeof(n_layer));
 
-    std::vector<uint8_t> tmp_buf;
-
-    // Iterate and write all the keys first, each row is a cell
+    // Iterate and write all the R tensors first, each row is a cell
     // Get whole range at a time
     for (uint32_t il = 0; il < n_layer; ++il) {
         // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
         if (r_l[il] == nullptr) continue;
 
-        // Write key type
+        // Write R tensor type
         const int32_t r_type_i = (int32_t)r_l[il]->type;
         io.write(&r_type_i, sizeof(r_type_i));
 
-        // Write row size of key
+        // Write row size of R tensor
         const uint64_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r());
         io.write(&r_size_row, sizeof(r_size_row));
 
-        // Read each range of cells of k_size length each into tmp_buf and write out
+        // Write each range of cells of r_size_row length
         for (const auto & range : cell_ranges) {
             const size_t range_size = range.second - range.first;
             const size_t buf_size = range_size * r_size_row;
@@ -814,15 +812,15 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
             // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
             if (s_l[il] == nullptr) continue;
 
-            // Write value type
+            // Write S tensor type
             const int32_t s_type_i = (int32_t)s_l[il]->type;
             io.write(&s_type_i, sizeof(s_type_i));
 
-            // Write row size of value
+            // Write row size of S tensor
             const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s());
             io.write(&s_size_row, sizeof(s_size_row));
 
-            // Read each range of cells of s_size length each into tmp_buf and write out
+            // Write each range of S tensor rows
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
                 const size_t buf_size = range_size * s_size_row;
@@ -830,7 +828,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
             }
         }
     } else {
-        // When v is transposed, we also need the element size and get the element ranges from each row
+        // When S tensor is transposed, we also need the element size and get the element ranges from each row
         const uint32_t mem_size = size;
         for (uint32_t il = 0; il < n_layer; ++il) {
             // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
@@ -838,7 +836,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 
             const uint32_t n_embd_s = hparams.n_embd_s();
 
-            // Write value type
+            // Write S tensor type
             const int32_t s_type_i = (int32_t)s_l[il]->type;
             io.write(&s_type_i, sizeof(s_type_i));
 
@@ -851,7 +849,7 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 
             // For each row, we get the element values of each cell
             for (uint32_t j = 0; j < n_embd_s; ++j) {
-                // Read each range of cells of v_size_el length each into tmp_buf and write out
+                // Write each range of cells of s_size_el length
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
                     const size_t src_offset = (range.first + j * mem_size) * s_size_el;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 58ba6e8b40..8ce1a13c68 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -8233,11 +8233,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                         if (!mask && max_bias > 0.0f) continue;
                         for (float logit_softcap : {0.0f, 10.0f}) {
                             if (hsk != 128 && logit_softcap != 0.0f) continue;
-                            for (int nh : { 4, }) {
+                            for (int nh : { 1, 4 }) {
+                                if (nh == 1 && hsk != 576) continue; // GLM 4.7 Flash
                                 for (int nr3 : { 1, 3, }) {
                                     if (hsk > 64 && nr3 > 1) continue; // skip broadcast for large head sizes
-                                    for (int nr2 : { 1, 4, 12 }) {
+                                    for (int nr2 : { 1, 4, 12, 20 }) {
                                         if (nr2 == 12 && hsk != 128) continue;
+                                        if (nr2 == 20 && (nh != 1 || hsk != 576)) continue;
                                         //for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
                                         for (int kv : { 113, 512, 1024, }) {
                                             if (nr2 != 1 && kv != 512) continue;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 9b076e0c56..9fa5afc390 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1005,6 +1005,8 @@ struct clip_model_loader {
                         hparams.minicpmv_query_num = 64;
                     } else if (hparams.minicpmv_version == 6) {
                         hparams.minicpmv_query_num = 64;
+                    } else if (hparams.minicpmv_version == 100045) {
+                        hparams.minicpmv_query_num = 64;
                     } else {
                         hparams.minicpmv_query_num = 96;
                     }
@@ -3209,6 +3211,9 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                     } else if (params.minicpmv_version == 6) {
                         // MiniCPM-V 4.5
                         n_patches = 64;
+                    } else if (params.minicpmv_version == 100045) {
+                        // MiniCPM-o 4.5
+                        n_patches = 64;
                     } else {
                         GGML_ABORT("Unknown minicpmv version");
                     }
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index bb2cc4e4ea..944037e703 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -501,7 +501,7 @@ default_image_mean = [0.5, 0.5, 0.5]
 default_image_std = [0.5, 0.5, 0.5]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
-ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6; MiniCPM-o-4.5 use 100045', default=2)
 
 # with proper
 args = ap.parse_args()
@@ -610,6 +610,9 @@ else:
     elif minicpmv_version == 6:
         emb_dim = 4096
         block_count = 27
+    elif minicpmv_version == 100045:
+        emb_dim = 4096
+        block_count = 27
 
     default_vision_config = {
             "hidden_size": 1152,
@@ -637,6 +640,10 @@ elif minicpmv_version == 6:
     default_vision_config["model_type"] = "siglip_vision_model"
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 100045:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
 
 processor = None
 # if model.attn_pool is not None:
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 32a24bfcea..d037e834f3 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -236,7 +236,7 @@ struct mtmd_context {
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
 
-        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
             // minicpmv 2.6 format:
             // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
             slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index 881f4b3dd9..0709e0bda0 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 [[noreturn]]
 static void usage(const char * executable) {
     printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
     printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
     printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -131,6 +131,8 @@ static void usage(const char * executable) {
     printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
     printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
     printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
+    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
+    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
     printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
@@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
     return true;
 }
 
+static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) {
+    std::ifstream file(filename);
+    if (!file) {
+        printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno));
+        return false;
+    }
+
+    std::string arg;
+    while (file >> arg) {
+        if (!parse_tensor_type(arg.c_str(), tensor_type)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
 static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
     if (!data) {
         printf("\n%s: no layer pruning ids provided\n\n", __func__);
@@ -480,6 +499,10 @@ int main(int argc, char ** argv) {
             if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                 usage(argv[0]);
             }
+        } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
         } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
             if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                 usage(argv[0]);
@@ -686,3 +709,4 @@ int main(int argc, char ** argv) {
 
     return 0;
 }
+
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 1ca4e3cc0e..7f9c3c566b 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -155,7 +155,7 @@ struct server_slot {
     double t_prompt_processing; // ms
     double t_token_generation;  // ms
 
-    std::function<void(int /* slot_id */)> callback_on_release;
+    std::function<void(int /* id_slot */)> callback_on_release;
 
     // Speculative decoding stats
     int32_t n_draft_total = 0;      // Total draft tokens generated
@@ -705,6 +705,11 @@ private:
                 params_base.n_cache_reuse = 0;
                 SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
             }
+
+            if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
+                params_base.speculative.type =  COMMON_SPECULATIVE_TYPE_NONE;
+                SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
+            }
         }
 
         if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@@ -754,16 +759,16 @@ private:
                         SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
                         return false;
                     }
-                    SRV_WRN("%s", "speculative decoding context initialized\n");
+                    SLT_INF(slot, "%s", "speculative decoding context initialized\n");
                 } else {
-                    SRV_WRN("%s", "speculative decoding context not initialized\n");
+                    SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
                 }
             }
 
             SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);
 
-            slot.callback_on_release = [this](int slot_id) {
-                queue_tasks.pop_deferred_task(slot_id);
+            slot.callback_on_release = [this](int id_slot) {
+                queue_tasks.pop_deferred_task(id_slot);
             };
 
             slot.reset();
@@ -891,6 +896,9 @@ private:
     }
 
     server_slot * get_slot_by_id(int id_slot) {
+        // note: allow id_slot to be out of bounds (wrap around)
+        id_slot = id_slot % slots.size();
+
         for (server_slot & slot : slots) {
             if (slot.id == id_slot) {
                 return &slot;
@@ -1760,7 +1768,7 @@ private:
                         break;
                     }
 
-                    int id_slot = task.slot_action.slot_id;
+                    const int id_slot = task.slot_action.id_slot;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1798,7 +1806,7 @@ private:
             case SERVER_TASK_TYPE_SLOT_RESTORE:
                 {
                     if (!check_no_mtmd(task.id)) break;
-                    int id_slot = task.slot_action.slot_id;
+                    const int id_slot = task.slot_action.id_slot;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1847,7 +1855,7 @@ private:
                     if (!check_no_mtmd(task.id)) {
                         break;
                     }
-                    int id_slot = task.slot_action.slot_id;
+                    const int id_slot = task.slot_action.id_slot;
                     server_slot * slot = get_slot_by_id(id_slot);
                     if (slot == nullptr) {
                         send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3312,7 +3320,7 @@ void server_routes::init_routes() {
         }
 
         // TODO: get rid of this dynamic_cast
-        auto res_task = dynamic_cast<server_task_result_metrics*>(result.get());
+        auto * res_task = dynamic_cast<server_task_result_metrics*>(result.get());
         GGML_ASSERT(res_task != nullptr);
 
         // optionally return "fail_on_no_slot" error
@@ -3335,8 +3343,8 @@ void server_routes::init_routes() {
         }
 
         std::string id_slot_str = req.get_param("id_slot");
-        int id_slot;
 
+        int id_slot;
         try {
             id_slot = std::stoi(id_slot_str);
         } catch (const std::exception &) {
@@ -3348,14 +3356,16 @@ void server_routes::init_routes() {
 
         if (action == "save") {
             return handle_slots_save(req, id_slot);
-        } else if (action == "restore") {
-            return handle_slots_restore(req, id_slot);
-        } else if (action == "erase") {
-            return handle_slots_erase(req, id_slot);
-        } else {
-            res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
-            return res;
         }
+        if (action == "restore") {
+            return handle_slots_restore(req, id_slot);
+        }
+        if (action == "erase") {
+            return handle_slots_erase(req, id_slot);
+        }
+
+        res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        return res;
     };
 
     this->get_props = [this](const server_http_req &) {
@@ -3898,7 +3908,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_save(const ser
     {
         server_task task(SERVER_TASK_TYPE_SLOT_SAVE);
         task.id = rd.get_new_id();
-        task.slot_action.slot_id  = id_slot;
+        task.slot_action.id_slot  = id_slot;
         task.slot_action.filename = filename;
         task.slot_action.filepath = filepath;
         rd.post_task(std::move(task));
@@ -3934,7 +3944,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_restore(const
     {
         server_task task(SERVER_TASK_TYPE_SLOT_RESTORE);
         task.id = rd.get_new_id();
-        task.slot_action.slot_id  = id_slot;
+        task.slot_action.id_slot  = id_slot;
         task.slot_action.filename = filename;
         task.slot_action.filepath = filepath;
         rd.post_task(std::move(task));
@@ -3963,7 +3973,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_slots_erase(const se
     {
         server_task task(SERVER_TASK_TYPE_SLOT_ERASE);
         task.id = rd.get_new_id();
-        task.slot_action.slot_id = id_slot;
+        task.slot_action.id_slot = id_slot;
         rd.post_task(std::move(task));
     }
 
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 244470596b..a69e8f1a3d 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -153,7 +153,7 @@ struct server_task {
 
     // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
     struct slot_action {
-        int slot_id;
+        int id_slot;
         std::string filename;
         std::string filepath;
     };