feat: dequant use lut (#55)

* Add power management utilities to NPU device context and update DCVS settings

* Update DCVS settings in power_utils to use v3 API and enhance power management

* wip

* Enhance dequantization functions by adding load_dequant_table support and updating signatures for improved performance

* use lut for dequantization (see the scalar sketch after the commit metadata below)

* wip

* fix test failure

* wip

* Refactor load_qual_block_generic to improve block handling and optimize vector operations

* Enhance load_dual_block_generic and load_qual_block_generic to accept a mask parameter for improved block handling

* Refactor flash_attn_impl to optimize mask l2 prefetch

* wip

* wip

* wip

* wip

* add log

* link against shared libraries instead of static ones

* fix swiglu

* wip

* refactor expf_fix to handle overflow for different data types

* enhance is_glu_op_supported to validate shapes for multiple sources

* wip

* refactor logging macros to use hexagon namespace and improve formatting

* fix printf format error

* wip

* refactor: update static_assert messages for block size validation and add HVX_VectorPred_x3 type alias

* rename

* feat: enhance fa with mask

* wip

* wip

* refactor: replace instances of Q6_V_vzero() with kZeroV for consistency

* wip

* wip

* wip

* fix: improve address alignment check in HVX_Vector handling

* refactor: streamline vector dot product implementations for improved readability

* refactor: q4k add hvx intrinsic impl

* refactor: enhance dequantize_row_q4_K for clarity and performance

* refactor: optimize scale mask usage in dequantization functions for improved performance

* refactor: optimize dequantize_row_q4_K for intrinsic usage and performance improvements

* refactor: move GLU operation implementation into separated file

* sync after swiglu

* wip

* wip

* wip

* feat: increase prc main thread stack size

* fix: replace hardcoded stack size with NPU_THREAD_STACK_SIZE constant

* wip

* feat: add optimized vector operations for exponential and division with overflow handling

* wip

* feat: refactor exponential function to handle overflow and underflow with improved logic

* wip

* wip

* feat: add vector loading and scaling functions for improved performance in block processing

* wip

* feat: optimize block loading by refactoring scale index handling for improved performance

* use Q6_Vb_vlut32_VbVbR_nomatch instead

* feat: enhance scale loading by adding static assertion and restructuring block handling

* wip

* feat: refactor vec_dot_product_mixed_impl for improved clarity and performance

* wip

* feat: simplify vector loading functions and improve alignment handling

* wip

* feat: enhance scale loading mask with quantization block size validation

* wip

* feat: implement make_scale_load_mask function and refactor vector handling in vec_ops

* feat: enhance load_dual_block_generic to include scale indices for improved vector loading

* revert q8 dequant

* wip

* feat: optimize dequantization functions by removing unnecessary masking and updating lookup methods

* wip

* wip
nullname 2025-08-29 21:42:57 +08:00 committed by GitHub
parent 0979133ea8
commit 5ef9b98869
23 changed files with 1025 additions and 533 deletions
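
The core of this change is to replace arithmetic nibble expansion in the dequantization kernels with a table lookup: a small fp16 table is built once per type (load_dequant_table_q4_0 / load_dequant_table_q4_k), passed into the row dequantizers, and applied with Q6_Wh_vlut16_VbVhR_nomatch. Below is a minimal scalar sketch of the same idea for Q4_0, assuming the standard block layout (an fp16 scale d plus 16 bytes of packed low/high nibbles); the helper name is illustrative.

#include <cstddef>
#include <cstdint>

// Scalar sketch of LUT-based Q4_0 dequantization (illustrative). The table is the
// same one load_dequant_table_q4_0 builds: entry q holds (q - 8) as fp16, with 8
// being the Q4_0 zero point. The HVX path in the diff performs this lookup with
// Q6_Wh_vlut16_VbVhR_nomatch instead of the inner loop.
static void dequant_q4_0_scalar_sketch(const npu_device_block_q4_0 * blocks,
                                       __fp16 * dst,
                                       size_t block_count) {
    __fp16 table[16];
    for (int q = 0; q < 16; ++q) {
        table[q] = (__fp16) (q - 8);
    }

    for (size_t b = 0; b < block_count; ++b) {
        const __fp16 d = reinterpret_cast<const __fp16 &>(blocks[b].d);
        for (int j = 0; j < QUANT_BLOCK_SIZE / 2; ++j) {
            const uint8_t packed = blocks[b].qs[j];
            dst[j]                        = table[packed & 0x0F] * d; // low nibble
            dst[j + QUANT_BLOCK_SIZE / 2] = table[packed >> 4] * d;   // high nibble
        }
        dst += QUANT_BLOCK_SIZE;
    }
}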

View File

@ -220,7 +220,7 @@ else()
target_compile_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address -fno-omit-frame-pointer
)
target_link_libraries(hexagon_npu_skel_OBJS PUBLIC
target_link_options(hexagon_npu_skel_OBJS PUBLIC
-fsanitize=address
)
endif()
@ -248,9 +248,9 @@ else()
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.so.1
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc.so
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)

View File

@ -17,21 +17,30 @@
namespace {
struct npu_device_context {
std::unique_ptr<hexagon::power_utils> power_utils; // Power management utilities
std::unique_ptr<hexagon::default_thread_pool> thread_pool;
std::unique_ptr<float[]> f16_to_f32_table; // TODO: store vtcm?
bool init() {
if (!init_ltu()) {
DEVICE_LOG_ERROR("Failed to initialize LTU");
DEVICE_LOG_ERROR("Failed to initialize LTU\n");
return false;
}
if (!init_thread_pool()) {
DEVICE_LOG_ERROR("Failed to initialize thread pool");
DEVICE_LOG_ERROR("Failed to initialize thread pool\n");
return false;
}
DEVICE_LOG_DEBUG("NPU device context initialized");
power_utils = std::make_unique<hexagon::power_utils>();
if (power_utils && power_utils->is_valid()) {
power_utils->set_dvcs_performance_mode(true);
DEVICE_LOG_DEBUG("Power utilities initialized with DVCS performance mode enabled\n");
} else {
DEVICE_LOG_ERROR("Failed to initialize power utilities\n");
}
DEVICE_LOG_DEBUG("NPU device context initialized\n");
return true;
}
@ -41,29 +50,29 @@ struct npu_device_context {
f16_to_f32_table = std::make_unique<float[]>(kLtuCount);
if (!f16_to_f32_table) {
DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table");
DEVICE_LOG_ERROR("Failed to allocate memory for f16_to_f32 table\n");
return false;
}
hexagon::init_f16_f32_table(f16_to_f32_table.get(), kLtuCount);
DEVICE_LOG_DEBUG("f16_to_f32 table initialized");
DEVICE_LOG_DEBUG("f16_to_f32 table initialized\n");
return true;
}
bool init_thread_pool() {
if (thread_pool) {
DEVICE_LOG_DEBUG("Thread pool already initialized");
DEVICE_LOG_DEBUG("Thread pool already initialized\n");
return true;
}
auto pool = std::make_unique<hexagon::default_thread_pool>();
if (!pool) {
DEVICE_LOG_ERROR("Failed to create thread pool");
DEVICE_LOG_ERROR("Failed to create thread pool\n");
return false;
}
thread_pool = std::move(pool);
DEVICE_LOG_DEBUG("Thread pool initialized");
DEVICE_LOG_DEBUG("Thread pool initialized\n");
return true;
}
};
@ -102,25 +111,25 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
// TODO: should we have a device context here?
auto * context = new npu_device_context();
if (!context->init()) {
DEVICE_LOG_ERROR("Failed to initialize npu_device_context");
DEVICE_LOG_ERROR("Failed to initialize npu_device_context\n");
delete context;
return AEE_EFAILED;
}
*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
DEVICE_LOG_INFO("NPU device context created: %p\n", (void *) *h);
return AEE_SUCCESS;
}
int npu_device_close(remote_handle64 h) {
auto * context = device_context_from_handle(h);
if (!context) {
DEVICE_LOG_ERROR("Invalid npu_device_context handle");
DEVICE_LOG_ERROR("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
DEVICE_LOG_INFO("NPU device context destroyed: %p\n", (void *) h);
return AEE_SUCCESS;
}
@ -139,7 +148,7 @@ AEEResult npu_device_device_support_op(remote_handle64 _h,
NPU_UNUSED(_h);
if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments\n");
return AEE_EINVARGS;
}
@ -185,7 +194,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h,
int tensor_handlesLen) {
NPU_UNUSED(_h);
if (!tensor_handles || tensor_handlesLen < 0) {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments");
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid arguments\n");
return AEE_EINVARGS;
}
@ -194,7 +203,7 @@ AEEResult npu_device_tensors_free(remote_handle64 _h,
if (tensor) {
delete tensor;
} else {
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d", i);
DEVICE_LOG_ERROR("npu_device_tensors_free: Invalid tensor handle at index %d\n", i);
}
}
@ -250,13 +259,13 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {
DEVICE_LOG_DEBUG("Invalid npu_device_context handle");
DEVICE_LOG_DEBUG("Invalid npu_device_context handle\n");
return AEE_EINVHANDLE;
}
auto * graph = graph_from_handle(graph_handle);
if (!graph) {
DEVICE_LOG_ERROR("Invalid graph handle");
DEVICE_LOG_ERROR("Invalid graph handle\n");
return AEE_EINVHANDLE;
}

View File

@ -91,6 +91,7 @@ void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread
const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
// For the last tensor, the thread pool will handle synchronization
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]",
(void *) this,
params.get_thread_index(),

View File

@ -13,7 +13,7 @@ inline float f16_to_f32(const npu_device_fp16_t src) {
}
// From: ggml/src/ggml-cpu/ops.cpp
template <bool _IsKvF16>
template <bool _IsKvF16, bool _HasMask>
void flash_attn_impl(hexagon::tensor * out,
const hexagon::tensor * q,
const hexagon::tensor * k,
@ -24,6 +24,7 @@ void flash_attn_impl(hexagon::tensor * out,
static_assert(3 <= hexagon::kMaxParamsCount, "flash_attn op params count exceeds max params count");
constexpr const npu_device_tensor_data_type kKvDataType = _IsKvF16 ? NPU_DATA_TYPE_F16 : NPU_DATA_TYPE_F32;
constexpr const bool kHasMask = _HasMask;
if (k->get_type() != kKvDataType || v->get_type() != k->get_type()) {
DEVICE_LOG_ERROR("flash_attn_impl: k and v must have same type, got k: %s, v: %s\n",
@ -32,6 +33,11 @@ void flash_attn_impl(hexagon::tensor * out,
return;
}
if (kHasMask != (mask != nullptr)) {
DEVICE_LOG_ERROR("flash_attn_impl: mask is required when kHasMask is true\n");
return;
}
float scale = out->get_op_param<float>(0);
const float max_bias = out->get_op_param<float>(1);
const float logit_softcap = out->get_op_param<float>(2);
@ -96,7 +102,7 @@ void flash_attn_impl(hexagon::tensor * out,
const uint8_t * q_ptr = q->get_read_buffer();
const uint8_t * k_ptr = k->get_read_buffer();
const uint8_t * v_ptr = v->get_read_buffer();
const uint8_t * mask_ptr = mask ? mask->get_read_buffer() : nullptr;
const uint8_t * mask_ptr = kHasMask ? mask->get_read_buffer() : nullptr;
const uint8_t * sinks_ptr = sinks ? sinks->get_read_buffer() : nullptr;
float * VKQ32 = reinterpret_cast<float *>(cache_ptr); // FP32 VKQ accumulator
auto * VKQ16 = reinterpret_cast<npu_device_fp16_t *>(VKQ32 + aligned_dv); // (temporary) FP16 VKQ accumulator
@ -125,11 +131,17 @@ void flash_attn_impl(hexagon::tensor * out,
}
const npu_device_fp16_t * mp =
mask_ptr ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
kHasMask ? reinterpret_cast<const npu_device_fp16_t *>(mask_ptr + iq1 * mask->get_nb(1) +
(iq2 % mask->get_ne(2)) * mask->get_nb(2) +
(iq3 % mask->get_ne(3)) * mask->get_nb(3)) :
nullptr;
q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
if (kHasMask) {
hexagon::l2fetch_row(reinterpret_cast<const uint8_t *>(mp), mask->get_nb(1));
}
// k indices
const int ik3 = iq3 / rk3;
const int ik2 = iq2 / rk2;
@ -138,8 +150,6 @@ void flash_attn_impl(hexagon::tensor * out,
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
q_to_vec_dot(reinterpret_cast<const float *>(q_data), Q_q, DK);
// online softmax / attention
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
@ -147,7 +157,7 @@ void flash_attn_impl(hexagon::tensor * out,
const auto * v_plane_ptr = v_ptr + iv2 * v->get_nb(2) + iv3 * v->get_nb(3);
for (int64_t ic = 0; ic < k->get_ne(1); ++ic) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(flash_attn, 0, loop);
float mv = mp ? (slope * f16_to_f32(mp[ic])) : 0.0f;
float mv = kHasMask ? (slope * f16_to_f32(mp[ic])) : 0.0f;
if (mv == -INFINITY) {
continue;
}
@ -282,9 +292,17 @@ bool flash_attn_f32(tensor * out, compute_params * params) {
const auto * mask = out->get_src(3);
const auto * sinks = out->get_src(4);
if (k->get_type() == NPU_DATA_TYPE_F16) {
flash_attn_impl<true>(out, q, k, v, mask, sinks, params);
if (mask) {
flash_attn_impl<true, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<true, false>(out, q, k, v, mask, sinks, params);
}
} else {
flash_attn_impl<false>(out, q, k, v, mask, sinks, params);
if (mask) {
flash_attn_impl<false, true>(out, q, k, v, mask, sinks, params);
} else {
flash_attn_impl<false, false>(out, q, k, v, mask, sinks, params);
}
}
return true;
}
@ -338,8 +356,8 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
if (dst->ne[0] != v->ne[0] || dst->ne[2] != q->ne[1]) {
DEVICE_LOG_DEBUG(
"[%s]dst shape does not match q and v: dst ne: %ld, %ld, %ld, %ld, q ne: %ld, %ld, %ld, %ld, "
"v ne: %ld, %ld, %ld, %ld\n",
"[%s]dst shape does not match q and v: dst ne: %lld, %lld, %lld, %lld, q ne: %lld, %lld, %lld, %lld, "
"v ne: %lld, %lld, %lld, %lld\n",
op_get_name(op),
dst->ne[0],
dst->ne[1],
@ -359,24 +377,25 @@ bool is_flash_attn_supported(const npu_device_tensor_op_spec * op_spec,
if (is_transposed_or_permuted(dst->nb)) {
DEVICE_LOG_DEBUG("[%s]dst cannot be transposed or permuted, nb: %zu, %zu, %zu, %zu\n",
op_get_name(op),
dst->nb[0],
dst->nb[1],
dst->nb[2],
dst->nb[3]);
(size_t) dst->nb[0],
(size_t) dst->nb[1],
(size_t) dst->nb[2],
(size_t) dst->nb[3]);
return false;
}
if (q->ne[0] != k->ne[0]) {
DEVICE_LOG_DEBUG("[%s]q and k shapes do not match: q ne: %ld, %ld, %ld, %ld, k ne: %ld, %ld, %ld, %ld\n",
op_get_name(op),
q->ne[0],
q->ne[1],
q->ne[2],
q->ne[3],
k->ne[0],
k->ne[1],
k->ne[2],
k->ne[3]);
DEVICE_LOG_DEBUG(
"[%s]q and k shapes do not match: q ne: %lld, %lld, %lld, %lld, k ne: %lld, %lld, %lld, %lld\n",
op_get_name(op),
q->ne[0],
q->ne[1],
q->ne[2],
q->ne[3],
k->ne[0],
k->ne[1],
k->ne[2],
k->ne[3]);
return false;
}

View File

@ -0,0 +1,228 @@
#include "op_glu.hpp"
#include "type_traits.hpp"
#include "util.hpp"
namespace {
template <typename T> struct get_data_type {};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t, _TyParam)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
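// This specialization deduces both the element type and the coefficient type from
// the row function's signature: glu_vec_op_f32_f32 takes a hexagon::HVX_VectorPair_x4
// coefficient (returned by qhmath_load_div_sf_ltu), while glu_vec_op_f16_f16 takes a
// plain float (returned by dummy_load_coeff below). glu_impl's static_assert checks
// that pairing at compile time.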
inline float dummy_load_coeff() {
// This is a dummy function to satisfy the template requirements.
// In practice, this should be replaced with a proper coefficient loading function.
return 0;
}
inline float expf_f16_guard_inf(float x) {
// Avoid overflow for large values; 11.0898664 ≈ ln(65504), the largest finite f16 value
constexpr float kMaxExp = 11.0898664f;
if (x >= kMaxExp) {
// Avoid overflow for large values
return std::numeric_limits<float>::infinity();
}
return std::expf(x);
}
inline void glu_vec_op_f16_f16(const __fp16 * src0, const __fp16 * src1, __fp16 * dst, size_t count, float coeff) {
// TODO: use simd version, for some input hexagon intrinsics will generate nan instead of inf.
for (uint32_t i = 0; i < count; ++i) {
float x = src0[i];
float g = src1[i];
dst[i] = (x / (1.0f + expf_f16_guard_inf(-x))) * g;
}
}
inline void glu_vec_op_f32_f32(const float * src0,
const float * src1,
float * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<float, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f32_f32>(
src0, src1, dst, count, coeff);
}
template <auto _GluRowFunc, auto _CoeffLoadFunc>
bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_GluRowFunc)>::type;
using param_type = typename get_data_type<decltype(_GluRowFunc)>::param_type;
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
static_assert(std::is_same_v<param_type, decltype(_CoeffLoadFunc())>,
"GluRowFunc must have the same param type as CoeffLoadFunc");
if (!out) {
return false;
}
const bool has_src1 = out->get_src(1) != nullptr;
auto * src0 = out->get_src(0);
auto * src1 = has_src1 ? out->get_src(1) : src0;
if (!src0 || !src1) {
return true; // skip if no src
}
const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2;
if (out->get_ne(0) != total_cols) {
DEVICE_LOG_ERROR(
"[hexagon-npu][GLU]out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols);
return false;
}
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("[hexagon-npu][GLU]glu_impl: dst_ptr is not writable, tensor: %p, type: %s\n",
(void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const int32_t swapped = out->get_op_param<int32_t>(1);
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type));
if (swapped) {
std::swap(src0_ptr, src1_ptr);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto coeff = _CoeffLoadFunc();
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
_GluRowFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row),
reinterpret_cast<data_type *>(dst_row),
static_cast<size_t>(total_cols),
coeff);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
template <npu_device_tensor_data_type _DataType>
bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) {
using namespace hexagon::vec::math;
if (out->get_op_param<int32_t>(0) != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", (int) out->get_op_param<int32_t>(0));
return false;
}
if (out->get_type() != _DataType) {
DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n",
hexagon::get_type_name(out->get_type()),
hexagon::get_type_name(_DataType));
return false;
}
if constexpr (_DataType == NPU_DATA_TYPE_F32) {
return glu_impl<glu_vec_op_f32_f32, qhmath_load_div_sf_ltu>(out, params);
} else if constexpr (_DataType == NPU_DATA_TYPE_F16) {
return glu_impl<glu_vec_op_f16_f16, dummy_load_coeff>(out, params);
}
DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type()));
return true;
}
} // namespace
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F32>(out, params);
}
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params) {
return glu_compute<npu_device_tensor_data_type::NPU_DATA_TYPE_F16>(out, params);
}
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GLU) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), (int) op_spec->params[0]);
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n",
hexagon::op_get_name(op),
hexagon::get_type_name(src0.type),
hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG(
"[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type));
return false;
}
if (src_len > 1) {
if (!hexagon::is_same_shape(src0, *dst) || !hexagon::is_same_shape(srcs[1], *dst)) {
DEVICE_LOG_DEBUG("[%s]src0, src1 and dst have different shape\n", hexagon::op_get_name(op));
return false; // src0 and src1 have the same shape as dst
}
} else {
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "GLU requires max dims 4");
if (src0.ne[0] / 2 != dst->ne[0] || src0.ne[1] != dst->ne[1] || src0.ne[2] != dst->ne[2] ||
src0.ne[3] != dst->ne[3]) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape: src0.ne[0]: %ld, dst.ne[0]: %ld\n",
hexagon::op_get_name(op),
(long) src0.ne[0],
(long) dst->ne[0]);
return false;
}
}
return true;
}
} // namespace hexagon

View File

@ -0,0 +1,15 @@
#pragma once
#include "op_types.hpp"
namespace hexagon {
bool glu_f32(hexagon::tensor * out, hexagon::compute_params * params);
bool glu_f16(hexagon::tensor * out, hexagon::compute_params * params);
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len);
} // namespace hexagon

View File

@ -3,11 +3,13 @@
#include "op_impl.hpp"
#include "op_flash_attn.hpp"
#include "op_glu.hpp"
#include "op_mul_mat.hpp"
#include "op_rope.hpp"
#include "type_traits.hpp"
#include "vec_ops.hpp"
#include <cmath>
#include <type_traits>
namespace {
@ -59,15 +61,10 @@ template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const
using type = _TyData;
};
template <typename _TyData>
struct get_data_type<void (*)(const _TyData *, const _TyData *, _TyData *, size_t, hexagon::HVX_VectorPair_x4)> {
using type = _TyData;
};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, _TyData *, size_t, _TyParam)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyData>::type>::type;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyParam>::type>::type;
};
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
@ -325,171 +322,6 @@ bool is_unary_op_supported(const npu_device_tensor_op_spec * op_spec,
return true;
}
inline void glu_vec_op_f32_f32(const float * src0,
const float * src1,
float * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<float, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f32_f32>(
src0, src1, dst, count, coeff);
}
inline void glu_vec_op_f16_f16(const npu_device_fp16_t * src0,
const npu_device_fp16_t * src1,
npu_device_fp16_t * dst,
size_t count,
hexagon::HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec;
vec_trans_with_param_impl<npu_device_fp16_t, hexagon::HVX_VectorPair_x4, hexagon::vec_swiglu_f16_f16>(
src0, src1, dst, count, coeff);
}
template <auto _GluRowFunc, hexagon::HVX_VectorPair_x4 (*_CoeffLoadFunc)()>
bool glu_impl(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_GluRowFunc)>::type;
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
if (!out) {
return false;
}
const bool has_src1 = out->get_src(1) != nullptr;
auto * src0 = out->get_src(0);
auto * src1 = has_src1 ? out->get_src(1) : src0;
if (!src0 || !src1) {
return true; // skip if no src
}
const auto total_cols = has_src1 ? src0->get_ne(0) : src0->get_ne(0) / 2;
if (out->get_ne(0) != total_cols) {
DEVICE_LOG_ERROR("out.ne[0] (%ld) != total_cols (%d)\n", (long) out->get_ne(0), (int) total_cols);
return false;
}
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = params->get_work_slice(total_rows);
if (start_end.first >= start_end.second) {
return true;
}
uint8_t * dst_ptr = out->get_write_buffer();
if (!dst_ptr) {
DEVICE_LOG_ERROR("element_wise_op: dst_ptr is not writable, tensor: %p, type: %s\n",
(void *) out,
hexagon::get_type_name(out->get_type()));
return false;
}
const int32_t swapped = out->get_op_param<int32_t>(1);
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = has_src1 ? src1->get_read_buffer() : (src0_ptr + total_cols * sizeof(data_type));
if (swapped) {
std::swap(src0_ptr, src1_ptr);
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->get_thread_index());
auto coeff = _CoeffLoadFunc();
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
const auto i13 = i03 % src1->get_ne(3);
const auto i12 = i02 % src1->get_ne(2);
const auto i11 = i01 % src1->get_ne(1);
auto * src1_plane = src1_ptr + i13 * src1->get_nb(3) + i12 * src1->get_nb(2);
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * src1_row = src1_plane + i11 * src1->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
_GluRowFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row),
reinterpret_cast<data_type *>(dst_row),
static_cast<size_t>(total_cols),
coeff);
}
out->release_write_buffer(); // mark the output tensor as modified
return true;
}
template <npu_device_tensor_data_type _DataType>
bool glu_compute(hexagon::tensor * out, hexagon::compute_params * params) {
using namespace hexagon::vec::math;
if (out->get_op_param<int32_t>(0) != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_ERROR("Invalid GLU op type: %d\n", out->get_op_param<int32_t>(0));
return false;
}
if (out->get_type() != _DataType) {
DEVICE_LOG_ERROR("GLU op type mismatch: %s vs %s\n",
hexagon::get_type_name(out->get_type()),
hexagon::get_type_name(_DataType));
return false;
}
if constexpr (_DataType == NPU_DATA_TYPE_F32) {
return glu_impl<glu_vec_op_f32_f32, qhmath_load_div_sf_ltu>(out, params);
} else if constexpr (_DataType == NPU_DATA_TYPE_F16) {
return glu_impl<glu_vec_op_f16_f16, qhmath_load_div_hf_ltu>(out, params);
}
DEVICE_LOG_ERROR("Unsupported GLU data type: %s\n", hexagon::get_type_name(out->get_type()));
return true;
}
bool is_glu_op_supported(const npu_device_tensor_op_spec * op_spec,
const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs,
size_t src_len) {
const auto op = op_spec->op;
if (op != NPU_OP_GLU) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (op_spec->params[0] != NPU_GLU_OP_SWIGLU) {
DEVICE_LOG_DEBUG("[%s]unsupported GLU op type: %d\n", hexagon::op_get_name(op), op_spec->params[0]);
return false;
}
if (!dst || !srcs || src_len < 1) {
DEVICE_LOG_DEBUG("[%s]invalid dst or srcs\n", hexagon::op_get_name(op));
return false;
}
const auto & src0 = srcs[0];
if (dst->type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n",
hexagon::op_get_name(op),
hexagon::get_type_name(src0.type),
hexagon::get_type_name(dst->type));
return false;
}
if (dst->type != NPU_DATA_TYPE_F32 && dst->type != NPU_DATA_TYPE_F16) {
DEVICE_LOG_DEBUG(
"[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst->type));
return false;
}
if (!hexagon::is_same_shape(src0, *dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return false; // TODO: fix: for some input hexagon intrinsics will generate nan instead of inf.
}
struct op_capabilities {
npu_device_tensor_op op;
hexagon::op_is_supported_func_type is_supported;
@ -499,60 +331,60 @@ struct op_capabilities {
constexpr const op_capabilities kOpCapabilities[] = {
{
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
NPU_OP_MUL_MAT, hexagon::is_mul_mat_supported,
{
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, true, // requires_thread_barrier
}, true, // requires_thread_barrier
},
{
NPU_OP_ADD, is_element_wise_op_supported,
NPU_OP_ADD, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_SUB, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_MUL, is_element_wise_op_supported,
NPU_OP_MUL, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_RMS_NORM, is_unary_op_supported,
NPU_OP_RMS_NORM, is_unary_op_supported,
{
unary_op<rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_FLASH_ATTN,hexagon::is_flash_attn_supported,
NPU_OP_FLASH_ATTN, hexagon::is_flash_attn_supported,
{
hexagon::flash_attn_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, true, // requires_thread_barrier
},
{
NPU_OP_ROPE, hexagon::is_rope_supported,
NPU_OP_ROPE, hexagon::is_rope_supported,
{
hexagon::rope_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
},
}, false,
},
{
NPU_OP_GLU, is_glu_op_supported,
NPU_OP_GLU, hexagon::is_glu_op_supported,
{
glu_compute<NPU_DATA_TYPE_F32>, // NPU_DATA_TYPE_F32
glu_compute<NPU_DATA_TYPE_F16>, // NPU_DATA_TYPE_F16
}, false, // requires_thread_barrier
hexagon::glu_f32, // NPU_DATA_TYPE_F32
hexagon::glu_f16, // NPU_DATA_TYPE_F16
}, true, // TODO: should we avoid using thread barrier?
},
};

View File

@ -36,8 +36,9 @@ void mul_mat_impl(hexagon::tensor * src0,
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
@ -62,8 +63,8 @@ void mul_mat_impl(hexagon::tensor * src0,
if (start_end_plane.second <= start_end_plane.first || start_end_row.second <= start_end_row.first ||
start_end_element.second <= start_end_element.first) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: no work to do, start_end_plane: (%ld, %ld), start_end_row: (%ld, %ld), "
"start_end_element: (%ld, %ld)\n",
"mul_mat_impl: no work to do, start_end_plane: (%lld, %lld), start_end_row: (%lld, %lld), "
"start_end_element: (%lld, %lld)\n",
start_end_plane.first,
start_end_plane.second,
start_end_row.first,
@ -116,6 +117,7 @@ void mul_mat_impl(hexagon::tensor * src0,
return;
}
auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0;
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = src1->get_read_buffer();
@ -146,7 +148,8 @@ void mul_mat_impl(hexagon::tensor * src0,
auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size;
dequantize_row_func(src0_row,
reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0));
src0->get_ne(0),
dequant_table);
}
last_cached_plane_ptr = src0_plane;
@ -218,8 +221,9 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
using data_type0 = typename get_data_type<decltype(_DotFunc)>::data_type0;
using data_type1 = typename get_data_type<decltype(_DotFunc)>::data_type1;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
const auto src0_actual_row_size = hexagon::get_dequantized_row_size(src0);
auto * dequantize_row_func = hexagon::get_type_traits(src0->get_type()).to_float;
auto * load_dequant_table_func = hexagon::get_type_traits(src0->get_type()).load_dequant_table;
if (_ShouldCacheSrc0 && dequantize_row_func == nullptr) {
DEVICE_LOG_ERROR("Unsupported quantized src0 type: %d, dequantize_row_func is null\n", src0->get_type());
return;
@ -229,7 +233,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
if (dst->get_ne(0) >= params->get_thread_count()) {
start_end_element = params->get_work_slice(dst->get_ne(0));
} else {
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %ldx%ldx%ldx%ld\n",
DEVICE_LOG_ERROR("Unsupported src1 tensor shape for gemv: %s, ne: %lldx%lldx%lldx%lld\n",
hexagon::get_type_name(src1->get_type()),
src1->get_ne(0),
src1->get_ne(1),
@ -241,7 +245,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
if (start_end_element.second <= start_end_element.first) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: no work to do, start_end_plane: [0, 1), start_end_row: [0, 1), "
"start_end_element: [%ld, %ld)\n",
"start_end_element: [%lld, %lld)\n",
start_end_element.first,
start_end_element.second);
return;
@ -297,6 +301,7 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
return;
}
auto dequant_table = load_dequant_table_func ? load_dequant_table_func() : HVX_Vector();
constexpr bool should_fetch_src0_row = !_ShouldCacheSrc0;
const uint8_t * src0_ptr = src0->get_read_buffer();
const uint8_t * src1_ptr = src1->get_read_buffer();
@ -325,8 +330,10 @@ void mul_mat_gemv_impl(hexagon::tensor * src0,
}
auto * cached_row_ptr = src0_plane_cache_ptr + ir * src0_actual_row_size;
dequantize_row_func(
src0_row, reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr), src0->get_ne(0));
dequantize_row_func(src0_row,
reinterpret_cast<hexagon::dequant_output_type *>(cached_row_ptr),
src0->get_ne(0),
dequant_table);
}
src0_plane = src0_plane_cache_ptr;

View File

@ -165,7 +165,7 @@ bool rope_impl(hexagon::tensor * out, hexagon::compute_params * params) {
}
if (n_dims % 2 || (_IsVision && n_dims != out->get_ne(0) / 2)) {
DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %d\n", n_dims, out->get_ne(0) / 2);
DEVICE_LOG_ERROR("[ROPE]invalid n_dims for vision ROPE: %d, expected: %lld\n", n_dims, out->get_ne(0) / 2);
return false; // invalid n_dims for vision ROPE
}

View File

@ -20,30 +20,30 @@ class tensor {
void * mmap_address = nullptr;
auto ret = HAP_mmap_get(_info.buffer_fd, &mmap_address, &phy_address);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to mmap tensor buffer: %d\n", (int) ret);
return;
}
_data = static_cast<uint8_t *>(mmap_address);
DEVICE_LOG_INFO("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n",
(void *) this,
(long) _info.ne[0],
(long) _info.ne[1],
(long) _info.ne[2],
(long) _info.ne[3],
_info.buffer_fd,
_info.offset,
(void *) mmap_address,
phy_address);
DEVICE_LOG_DEBUG("tensor(%p[%ldx%ldx%ldx%ld]), fd: %d, offset: %zu, mmap_addr: %p, phy_addr: 0x%lx\n",
(void *) this,
(long) _info.ne[0],
(long) _info.ne[1],
(long) _info.ne[2],
(long) _info.ne[3],
(int) _info.buffer_fd,
(size_t) _info.offset,
(void *) mmap_address,
(long) phy_address);
}
~tensor() noexcept {
auto ret = HAP_mmap_put(_info.buffer_fd);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to unmap tensor buffer: %d\n", (int) ret);
}
DEVICE_LOG_INFO("~tensor(%p) fd: %d", (void *) this, _info.buffer_fd);
DEVICE_LOG_DEBUG("~tensor(%p) fd: %d\n", (void *) this, _info.buffer_fd);
}
void flush() const {
@ -131,7 +131,7 @@ class tensor {
uint8_t * get_write_buffer() const {
if (_info.is_constant) {
DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p", (void *) this);
DEVICE_LOG_ERROR("Attempt to write to a constant tensor: %p\n", (void *) this);
return nullptr; // Do not allow writing to constant tensors
}

View File

@ -14,7 +14,7 @@
namespace hexagon {
constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 64; // 64KB
constexpr const size_t kDefaultStackSize = NPU_THREAD_STACK_SIZE; // 64KB
template <size_t _stack_size> class qurt_thread {
public:
@ -24,7 +24,7 @@ template <size_t _stack_size> class qurt_thread {
qurt_thread_func_type thread_func,
void * arg,
unsigned short priority) {
DEVICE_LOG_DEBUG("qurt_thread.create: %s", thread_name.c_str());
DEVICE_LOG_DEBUG("qurt_thread.create: %s\n", thread_name.c_str());
qurt_thread_attr_init(&_attributes);
qurt_thread_attr_set_name(&_attributes, (char *) thread_name.c_str());
qurt_thread_attr_set_stack_addr(&_attributes, _stack);
@ -37,26 +37,26 @@ template <size_t _stack_size> class qurt_thread {
auto ret = qurt_thread_create(
&_tid, &_attributes, reinterpret_cast<void (*)(void *)>(&qurt_thread::thread_func_impl), (void *) this);
if (ret != QURT_EOK) {
DEVICE_LOG_ERROR("Failed to create thread: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to create thread: %d\n", (int) ret);
_func = nullptr;
_arg = nullptr;
return;
}
DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d", thread_name.c_str(), (int) _tid);
DEVICE_LOG_DEBUG("qurt_thread.created: %s, id: %d\n", thread_name.c_str(), (int) _tid);
}
~qurt_thread() {
DEVICE_LOG_DEBUG("qurt_thread.destroy: %d", (int) _tid);
DEVICE_LOG_DEBUG("qurt_thread.destroy: %d\n", (int) _tid);
int thread_exit_code = QURT_EOK;
auto ret = qurt_thread_join(_tid, &thread_exit_code);
if (ret != QURT_EOK && ret != QURT_ENOTHREAD) {
DEVICE_LOG_ERROR("Failed to join thread: %d", (int) ret);
DEVICE_LOG_ERROR("Failed to join thread: %d\n", (int) ret);
return;
}
if (thread_exit_code != QURT_EOK) {
DEVICE_LOG_ERROR("Thread exit code: %d", (int) thread_exit_code);
DEVICE_LOG_ERROR("Thread exit code: %d\n", (int) thread_exit_code);
}
}
@ -135,7 +135,7 @@ template <size_t _ThreadCount> class thread_pool {
auto thread = std::make_unique<thread_type>(
thread_name_base + std::to_string(i), &thread_pool::thread_func_impl, &_thread_params[i + 1], priority);
if (!thread->is_valid()) {
DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
DEVICE_LOG_ERROR("Failed to create thread: %zu\n", i);
// destroy all barriers and threads at destructor
return;
}
@ -143,11 +143,11 @@ template <size_t _ThreadCount> class thread_pool {
_threads[i] = std::move(thread);
}
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount);
DEVICE_LOG_DEBUG("thread_pool.created: %zu\n", kMaxSubThreadCount);
}
~thread_pool() {
DEVICE_LOG_DEBUG("thread_pool.destroy");
DEVICE_LOG_DEBUG("thread_pool.destroy\n");
_thread_exit = true;
qurt_barrier_wait(&_pending); // release all task threads
@ -161,7 +161,7 @@ template <size_t _ThreadCount> class thread_pool {
bool sync_execute(task_type task, void * arg) {
if (!task) {
DEVICE_LOG_ERROR("Invalid task");
DEVICE_LOG_ERROR("Invalid task\n");
return false;
}
@ -174,7 +174,7 @@ template <size_t _ThreadCount> class thread_pool {
qurt_barrier_wait(&_pending);
task(this, &_thread_params[0], arg);
DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
DEVICE_LOG_DEBUG("main_thread.task_completed: 0\n");
qurt_barrier_wait(&_completed);
@ -198,19 +198,19 @@ template <size_t _ThreadCount> class thread_pool {
auto * param = reinterpret_cast<thread_params *>(arg);
DEVICE_LOG_DEBUG("thread_func_impl.start: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.start: %zu\n", param->tidx);
auto & pool = *(param->pool);
for (;;) {
qurt_barrier_wait(&pool._pending);
if (pool._thread_exit) {
DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.exit: %zu\n", param->tidx);
break;
}
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
auto task_begin_cycles = pool._task_begin_cycles.load();
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus",
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, prepare: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
@ -221,18 +221,18 @@ template <size_t _ThreadCount> class thread_pool {
task(param->pool, param, pool._arg);
}
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu\n", param->tidx);
qurt_barrier_wait(&pool._completed);
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus",
DEVICE_LOG_WARN("[profiler]worker_thread, tidx: %zu, task_end: %lluus\n",
param->tidx,
static_cast<unsigned long long>(
HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - task_begin_cycles)));
#endif
}
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", param->tidx);
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu\n", param->tidx);
}
std::atomic_bool _thread_exit = false;

View File

@ -3,8 +3,6 @@
#include "op_types.hpp" // TODO: remove this include
#include "vec_ops.hpp"
#include <hexagon_types.h>
#include <array>
static_assert(sizeof(npu_device_block_q4_k) ==
@ -31,42 +29,122 @@ inline npu_device_fp16_t to_fp16(const float src) {
template <typename _TStruct, size_t _Count, auto _MemberPtr> inline HVX_Vector load_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
const HVX_Vector * qs0 = reinterpret_cast<const HVX_Vector *>(&(src->*_MemberPtr));
HVX_Vector prev = *qs0;
HVX_Vector curr = hexagon::is_addr_aligned(qs0) ? Q6_V_vzero() : *(qs0 + 1);
return Q6_V_valign_VVR(curr, prev, (size_t) qs0);
return *reinterpret_cast<const HVX_UVector *>(&(src->*_MemberPtr));
}
template <typename _TStruct, size_t _Count> inline HVX_Vector load_struct_into_vector(const _TStruct * src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TStruct) * _Count, "_TStruct too large for vector load");
return *reinterpret_cast<const HVX_UVector *>(src);
}
template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong q4_0 block size/padding");
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock), "wrong block size/padding");
return load_into_vector<_TBlock, 1, &_TBlock::qs>(&src);
}
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock * srcs) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong q4_0 block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
template <typename _TBlock> inline HVX_Vector make_scale_load_mask() {
static_assert(sizeof(_TBlock) < 32, "wrong block size/padding");
static_assert(sizeof(_TBlock::qs) == 16 || sizeof(_TBlock::qs) == 32, "wrong quantization block size");
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock));
return Q6_V_lo_W(Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs));
constexpr const size_t kScaleBlockSize = QUANT_BLOCK_SIZE * sizeof(hexagon::dequant_output_type);
// TODO: handle the case that scale not at the start of struct
hexagon::HVX_VectorAlias ret;
for (size_t i = 0; i < QUANT_BLOCK_SIZE; ++i) {
size_t base = i * 2;
ret.u8[base] = 0;
ret.u8[base + 1] = 1;
ret.u8[base + kScaleBlockSize] = sizeof(_TBlock);
ret.u8[base + kScaleBlockSize + 1] = sizeof(_TBlock) + 1;
}
return ret.v;
}
template <typename _TBlock> inline HVX_Vector load_qual_block_generic(const _TBlock * srcs) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong q4_0 block size/padding");
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock * srcs, HVX_VectorPred mask) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale);
return Q6_V_vmux_QVV(mask, blocks, block1);
}
template <typename _TBlock>
inline hexagon::HVX_Vector_x2 load_dual_block_generic(const _TBlock * srcs,
HVX_VectorPred mask,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 2, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
hexagon::HVX_Vector_x2 result;
HVX_Vector blocks = load_into_vector<_TBlock, 2, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale);
HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks);
result.val[0] = Q6_V_vmux_QVV(mask, blocks, block1);
result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0);
return result;
}
template <typename _TBlock> inline hexagon::HVX_VectorPred_x3 make_quad_block_mask() {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
HVX_Vector blocks = load_into_vector<_TBlock, 4, &_TBlock::qs>(srcs);
HVX_Vector block1 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock));
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(block1, blocks, kSizeOfQs);
hexagon::HVX_VectorPred_x3 mask;
mask.val[0] = Q6_Q_vsetq_R(kSizeOfQs);
mask.val[1] = Q6_Q_vsetq_R(kSizeOfQs * 3);
mask.val[2] = Q6_Q_vsetq_R(kSizeOfQs * 2);
return mask;
}
HVX_Vector block2 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 2);
HVX_Vector block3 = Q6_V_valign_VVR(Q6_V_vzero(), blocks, sizeof(_TBlock) * 3);
HVX_VectorPair qp1 = Q6_W_vshuff_VVR(block3, block2, kSizeOfQs);
template <typename _TBlock>
inline hexagon::HVX_Vector_x3 load_qual_block_generic(const _TBlock * srcs,
const hexagon::HVX_VectorPred_x3 mask,
const HVX_Vector scale_indices) {
static_assert(hexagon::kBytesPerVector >= sizeof(_TBlock) * 4, "wrong block size/padding");
constexpr const uint32_t kSizeOfQs = sizeof(_TBlock::qs);
constexpr const uint32_t kSizeOfScale = sizeof(_TBlock) - kSizeOfQs;
return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_lo_W(qp1), Q6_V_lo_W(qp0), kSizeOfQs * 2));
hexagon::HVX_Vector_x3 result;
const HVX_Vector blocks = load_struct_into_vector<_TBlock, 4>(srcs);
{
HVX_Vector block0 = Q6_V_vror_VR(blocks, kSizeOfScale);
HVX_Vector block1 = Q6_V_vror_VR(blocks, kSizeOfScale * 2);
HVX_Vector block2 = Q6_V_vror_VR(blocks, kSizeOfScale * 3);
HVX_Vector block3 = Q6_V_vror_VR(blocks, kSizeOfScale * 4);
HVX_Vector block01 = Q6_V_vmux_QVV(mask.val[0], block0, block1);
HVX_Vector block23 = Q6_V_vmux_QVV(mask.val[1], block2, block3);
result.val[0] = Q6_V_vmux_QVV(mask.val[2], block01, block23);
}
{
HVX_Vector scale23 = Q6_V_vror_VR(blocks, sizeof(_TBlock) * 2);
HVX_Vector scale01 = Q6_Vb_vshuff_Vb(blocks);
scale23 = Q6_Vb_vshuff_Vb(scale23);
result.val[1] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale01, 0);
result.val[2] = Q6_Vb_vlut32_VbVbR_nomatch(scale_indices, scale23, 0);
}
return result;
}
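// Q4_K packs eight 6-bit (scale, min) pairs into the 12-byte `scales` array of a
// super-block; get_scale_min_k4 unpacks pair j. For j < 4 the scale lives in the low
// 6 bits of bytes 0..3 and the min in the low 6 bits of bytes 4..7; the remaining
// pairs reuse the high bits (handled by the branch not shown in this hunk).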
inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
// TODO: use intrinsics
if (j < 4) {
*d = q[j] & 63;
*m = q[j + 4] & 63;
@ -324,23 +402,24 @@ void quantize_row_q4_K(const float * src, void * dst, size_t count) {
}
}
void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
auto * dst_ptr = ((hexagon::dequant_output_type *) dst); // TODO: opt for aligned access
const HVX_VectorPred mask = Q6_Q_vsetq_R(sizeof(npu_device_block_q8_0::qs));
const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
int i = 0;
for (; i + 1 < nb; i += 2) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector scales01 = Q6_V_vmux_QVV(scale_mask, Q6_Vh_vsplat_R(src0.d), Q6_Vh_vsplat_R(src1.d));
HVX_Vector qs = load_dual_block_generic(src_ptr + i);
HVX_Vector qs = load_dual_block_generic(src_ptr + i, mask);
HVX_Vector q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(Q6_Wh_vunpack_Vb(qs)));
HVX_Vector result = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
@ -363,44 +442,39 @@ void dequantize_row_q8_0(const void * src, hexagon::dequant_output_type * dst, s
}
template <bool _IsDstAligned>
void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(qk % 2 == 0, "qk must be even");
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
static const auto load_masks = make_quad_block_mask<npu_device_block_q4_0>();
static const HVX_Vector scale_indices __attribute__((aligned(hexagon::kBytesPerVector))) =
make_scale_load_mask<npu_device_block_q4_0>();
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
const HVX_Vector mask = Q6_Vb_vsplat_R(0x0F);
const HVX_Vector minus = Q6_Vb_vsplat_R(8);
hexagon::dequant_output_type * dst_ptr = dst; // TODO: opt for aligned access
int i = 0;
for (; i + 3 < nb; i += 4) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
const auto & src2 = src_ptr[i + 2];
const auto & src3 = src_ptr[i + 3];
auto qs = load_qual_block_generic(src_ptr + i, load_masks, scale_indices);
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector scales23 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src3.d), Q6_Vh_vsplat_R(src2.d), hexagon::kBytesPerVector / 2);
HVX_Vector q_lo = qs.val[0];
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4);
HVX_Vector qs = load_qual_block_generic(src_ptr + i);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4));
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2 + 4));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_hi = Q6_Vhf_equals_Vh(Q6_V_hi_W(qp0));
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_V_lo_W(qp0);
q_hi = Q6_V_hi_W(qp0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, qs.val[1]);
q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, qs.val[2]);
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_hi = Q6_Vqf16_vmpy_VhfVhf(q_hi, scales23);
q_hi = Q6_Vhf_equals_Vqf16(q_hi);
if constexpr (_IsDstAligned) {
@ -415,21 +489,16 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
}
for (; i + 1 < nb; i += 2) {
const auto & src0 = src_ptr[i];
const auto & src1 = src_ptr[i + 1];
HVX_Vector scales01 =
Q6_V_valign_VVR(Q6_Vh_vsplat_R(src1.d), Q6_Vh_vsplat_R(src0.d), hexagon::kBytesPerVector / 2);
HVX_Vector qs = load_dual_block_generic(src_ptr + i);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
auto qs = load_dual_block_generic(src_ptr + i, load_masks.val[0], scale_indices);
HVX_Vector q_lo = qs.val[0];
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs.val[0], 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs * (1 + 2));
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales01);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), qs.val[1]);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
if constexpr (_IsDstAligned) {
*reinterpret_cast<HVX_Vector *>(dst_ptr) = q_lo;
@ -445,14 +514,15 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
HVX_Vector scales = Q6_Vh_vsplat_R(curr_blk.d);
HVX_Vector qs = load_block_generic(curr_blk);
HVX_Vector q_lo = Q6_V_vand_VV(qs, mask);
HVX_Vector q_lo = qs;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qs, 4);
HVX_VectorPair qp0 = Q6_W_vshuff_VVR(q_hi, q_lo, kSizeOfQs);
q_lo = Q6_Vb_vsub_VbVb(Q6_V_lo_W(qp0), minus);
qp0 = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(qp0));
q_lo = Q6_Vqf16_vmpy_VhfVhf(q_lo, scales);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
q_lo = Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp0));
qp0 = Q6_Wh_vlut16_VbVhR_nomatch(q_lo, table, 0);
q_lo = Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(qp0), scales);
q_lo = Q6_Vhf_equals_Vqf16(q_lo);
if constexpr (_IsDstAligned) {
hexagon::q6op_vstu_variable_aligned<hexagon::kBytesPerVector / 2>(dst_ptr, q_lo);
@ -462,24 +532,82 @@ void dequantize_row_q4_0_impl(const void * src, hexagon::dequant_output_type * d
}
}
void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count) {
HVX_Vector load_dequant_table_q4_0() {
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
constexpr const int kQ4ZeroPoint = 8; // zero point for q4_0 quantization
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
static const HVX_Vector result = []() -> HVX_Vector {
union {
HVX_Vector v;
__fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)];
} table __attribute__((aligned(hexagon::kBytesPerVector)));
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i - kQ4ZeroPoint; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_0(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
const bool dst_aligned = hexagon::is_addr_aligned(dst);
if (dst_aligned) {
dequantize_row_q4_0_impl<true>(src, dst, count);
dequantize_row_q4_0_impl<true>(src, dst, count, table);
} else {
dequantize_row_q4_0_impl<false>(src, dst, count);
dequantize_row_q4_0_impl<false>(src, dst, count, table);
}
}
void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count) {
HVX_Vector load_dequant_table_q4_k() {
constexpr const int kTableSize = 1 << 4; // 4 bits per value, 16 values
static_assert(kTableSize <= hexagon::kBytesPerVector / sizeof(__fp16), "table too large");
const static HVX_Vector result = []() -> HVX_Vector {
union {
HVX_Vector v;
__fp16 f16[sizeof(HVX_Vector) / sizeof(__fp16)];
} table __attribute__((aligned(hexagon::kBytesPerVector)));
table.v = Q6_V_vzero();
for (int i = 0; i < kTableSize; ++i) {
table.f16[i * 2] = i; // TODO: vectorize this?
}
return table.v;
}();
return result;
}
void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector table) {
constexpr const int kQuantSubBlockSize = 32;
const int nb = count / QUANT_K_BLOCK_SIZE;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_k *>(src);
auto * dst_ptr = reinterpret_cast<__fp16 *>(dst);
auto * dst_ptr = reinterpret_cast<npu_device_fp16_t *>(dst);
const HVX_VectorPred scale_mask = Q6_Q_vsetq_R(hexagon::kBytesPerVector / 2);
union {
HVX_VectorPair p[2];
HVX_Vector v[4];
} dual_pair __attribute__((aligned(hexagon::kBytesPerVector * 4)));
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const uint8_t * q = src_ptr[i].qs;
HVX_Vector qv = *reinterpret_cast<const HVX_UVector *>(q);
HVX_Vector q_lo = qv;
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(qv, 4);
HVX_VectorPair qp = Q6_W_vshuff_VVR(q_hi, q_lo, kQuantSubBlockSize * 3);
dual_pair.p[0] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_lo_W(qp)), table, 0);
dual_pair.p[1] = Q6_Wh_vlut16_VbVhR_nomatch(Q6_Vb_vshuff_Vb(Q6_V_hi_W(qp)), table, 0);
const __fp16 d = reinterpret_cast<const __fp16 &>(src_ptr[i].d);
const __fp16 min = reinterpret_cast<const __fp16 &>(src_ptr[i].dmin);
@ -487,30 +615,61 @@ void dequantize_row_q4_K(const void * src, hexagon::dequant_output_type * dst, s
uint8_t sc = 0;
uint8_t m = 0;
const auto * scales = src_ptr[i].scales;
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 64) {
for (int j = 0; j < QUANT_K_BLOCK_SIZE; j += 128) {
get_scale_min_k4(is + 0, scales, &sc, &m);
const __fp16 d0 = d * sc;
const __fp16 m0 = min * m;
HVX_Vector dv0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d0));
HVX_Vector dm0 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m0));
get_scale_min_k4(is + 1, scales, &sc, &m);
const __fp16 d1 = d * sc;
const __fp16 m1 = min * m;
get_scale_min_k4(is + 1, scales, &sc, &m);
HVX_Vector dv1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d1));
HVX_Vector dm1 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m1));
get_scale_min_k4(is + 2, scales, &sc, &m);
const __fp16 d2 = d * sc;
const __fp16 m2 = min * m;
for (int l = 0; l < 32; ++l) {
dst_ptr[0] = d1 * (q[l] & 0xF) - m1;
dst_ptr[32] = d2 * ((q[l] >> 4) & 0xF) - m2;
dst_ptr++;
}
dst_ptr += 32;
q += 32;
is += 2;
HVX_Vector dv2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d2));
HVX_Vector dm2 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m2));
get_scale_min_k4(is + 3, scales, &sc, &m);
const __fp16 d3 = d * sc;
const __fp16 m3 = min * m;
HVX_Vector dv3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(d3));
HVX_Vector dm3 = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(m3));
HVX_Vector dv01 = Q6_V_vmux_QVV(scale_mask, dv0, dv1);
HVX_Vector dm01 = Q6_V_vmux_QVV(scale_mask, dm0, dm1);
HVX_Vector dv23 = Q6_V_vmux_QVV(scale_mask, dv2, dv3);
HVX_Vector dm23 = Q6_V_vmux_QVV(scale_mask, dm2, dm3);
q_lo = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64], dv01);
q_lo = Q6_Vqf16_vsub_Vqf16Vhf(q_lo, dm01);
q_hi = Q6_Vqf16_vmpy_VhfVhf(dual_pair.v[j / 64 + 1], dv23);
q_hi = Q6_Vqf16_vsub_Vqf16Vhf(q_hi, dm23);
reinterpret_cast<HVX_UVector *>(dst_ptr)[0] = Q6_Vhf_equals_Vqf16(q_lo);
reinterpret_cast<HVX_UVector *>(dst_ptr)[1] = Q6_Vhf_equals_Vqf16(q_hi);
dst_ptr += 128;
is += 4;
}
}
}
void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void copy_row_f16(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
hexagon::vec_cpy_f16(reinterpret_cast<const npu_device_fp16_t *>(src), dst, count);
}
void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count) {
void copy_row_f32(const void * src, hexagon::dequant_output_type * dst, size_t count, HVX_Vector) {
hexagon::vec_cpy_f32(reinterpret_cast<const float *>(src), reinterpret_cast<float *>(dst), count);
}
@ -539,12 +698,16 @@ constexpr const hexagon::device_type_traits kDeviceTypeTraits[] = {
"Q4_0", QUANT_BLOCK_SIZE,
sizeof(npu_device_block_q4_0),
true, dequantize_row_q4_0,
quantize_row_q4_0 },
quantize_row_q4_0, nullptr,
nullptr, nullptr,
load_dequant_table_q4_0 },
{ NPU_DATA_TYPE_Q4_K,
"Q4_K", QUANT_K_BLOCK_SIZE,
sizeof(npu_device_block_q4_k),
true, dequantize_row_q4_K,
quantize_row_q4_K },
quantize_row_q4_K, nullptr,
nullptr, nullptr,
load_dequant_table_q4_k },
};
static_assert(std::size(kDeviceTypeTraits) == NPU_DATA_TYPE_COUNT,

View File

@ -3,6 +3,8 @@
#include "tensor.hpp"
#include "util.hpp"
#include <hexagon_types.h>
namespace hexagon {
using dequant_output_type = npu_device_fp16_t;
@ -10,9 +12,10 @@ using dequant_output_type = npu_device_fp16_t;
bool init_f16_f32_table(float * table, size_t count);
typedef void (*quantize_row_type)(const float * src, void * dst, size_t count);
typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count);
typedef void (*dequantize_row_type)(const void * src, dequant_output_type * dst, size_t count, HVX_Vector table);
typedef float (*vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef bool (*can_use_aligned_vec_dot_type)(const void * src0, const void * src1, size_t count);
typedef HVX_Vector (*load_dequant_table_type)();
struct device_type_traits {
npu_device_tensor_data_type type;
@ -21,11 +24,12 @@ struct device_type_traits {
size_t type_size;
bool is_quantized;
dequantize_row_type to_float;
quantize_row_type from_float;
vec_dot_type vec_dot;
vec_dot_type vec_dot_aligned;
can_use_aligned_vec_dot_type can_use_aligned_vec_dot;
dequantize_row_type to_float = nullptr;
quantize_row_type from_float = nullptr;
vec_dot_type vec_dot = nullptr;
vec_dot_type vec_dot_aligned = nullptr;
can_use_aligned_vec_dot_type can_use_aligned_vec_dot = nullptr;
load_dequant_table_type load_dequant_table = nullptr;
};
const device_type_traits & get_type_traits(npu_device_tensor_data_type type);
@ -49,7 +53,7 @@ namespace hexagon {
inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) {
auto * src0 = op->get_src(0);
auto * src1 = op->get_src(1);
char buffer[1024];
char buffer[512];
if (src1 == nullptr) {
snprintf(buffer,
sizeof(buffer),
@ -96,8 +100,10 @@ inline auto make_scoped_op_perf_timer(tensor * op, size_t tidx) {
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_MULTI_SUB_PROC(op, tidx, tracker_name) \
auto __npu_op_timer_##tracker_name = hexagon::make_scoped_op_perf_timer(op, tidx)
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \
hexagon::npu_sub_process_scoped_timer<decltype(__npu_op_timer_##tracker_name)::kBufferCount, idx> \
# define DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_ONE_SUB_PROC(tracker_name, idx, sub_prefix) \
hexagon::npu_sub_process_scoped_timer< \
std::remove_reference_t<decltype(__npu_op_timer_##tracker_name)>::kBufferCount, \
idx> \
__npu_op_sub_timer##sub_prefix(__npu_op_timer_##tracker_name, #sub_prefix)
#else

View File

@ -8,17 +8,16 @@
#include <HAP_power.h>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>
#define DEVICE_LOG_ERROR(...) FARF(FATAL, __VA_ARGS__)
#define DEVICE_LOG_WARN(...) FARF(ERROR, __VA_ARGS__)
#define DEVICE_LOG_INFO(...) FARF(HIGH, __VA_ARGS__)
#define DEVICE_LOG_ERROR(...) hexagon::log_error(__VA_ARGS__)
#define DEVICE_LOG_WARN(...) hexagon::log_message(__VA_ARGS__)
#define DEVICE_LOG_INFO(...) hexagon::log_message(__VA_ARGS__)
#ifdef _DEBUG
# undef FARF_LOW
# define FARF_LOW 1
# define DEVICE_LOG_DEBUG(...) FARF(LOW, __VA_ARGS__)
# define DEVICE_LOG_DEBUG(...) hexagon::log_message(__VA_ARGS__)
#else
# define DEVICE_LOG_DEBUG(...) (void) 0
#endif
@ -40,6 +39,20 @@
namespace hexagon {
__attribute__((format(printf, 1, 2))) inline void log_error(const char * format, ...) {
va_list args;
va_start(args, format);
std::vfprintf(stderr, format, args);
va_end(args);
}
__attribute__((format(printf, 1, 2))) inline void log_message(const char * format, ...) {
va_list args;
va_start(args, format);
std::vprintf(format, args);
va_end(args);
}
inline constexpr const char * op_get_name(npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
@ -137,23 +150,22 @@ class power_utils {
return;
}
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.set_dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
request.dcvs_v3.set_core_params = TRUE;
if (enable) {
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
/*
* sleep_latency : To request for sleep latency in micro-seconds.
* Sleep latency is the minimum time before which the DSP sleeps
* Set latency to 65535 to reset it to the default value
*/
request.dcvs_v3.set_latency = TRUE;
request.dcvs_v3.latency = 1000;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.min_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.max_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
request.dcvs_v3.set_sleep_disable = TRUE;
request.dcvs_v3.sleep_disable = TRUE;
}
auto ret = HAP_power_set(_context_ptr, &request);
@ -359,7 +371,7 @@ template <size_t _buffer_count, size_t _sub_idx> class npu_sub_process_scoped_ti
inline auto make_scoped_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[1024];
char buffer[512];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return npu_scoped_timer<1024>(buffer);

View File

@ -1120,10 +1120,75 @@ inline HVX_VectorPair hvx_vqf32_convert_vhf(HVX_Vector vxl) {
inline HVX_Vector_x2 hvx_vsf_convert_vhf(HVX_Vector vxl, HVX_Vector one) {
HVX_VectorPair res = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vxl), one);
return {
Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res)),
Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res)),
};
HVX_Vector_x2 ret;
ret.val[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(res));
ret.val[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(res));
return ret;
}
/**
* @brief Calculates exponential (e^x) for vector elements with infinity guard
*
* This function computes the exponential value for each element in the input vector.
* For input values greater than kMaxExp (88.02f), the function returns the provided
* infinity value instead of attempting to calculate an exponential that would overflow.
*
* @param sline The input vector containing values to compute exponential for
* @param inf The vector containing the infinity representation to use for guarded values
* @return HVX_Vector containing exponential values, with values > kMaxExp replaced by inf
*
 * @note For input values greater than 88.02f, the function returns the specified infinity value
*/
inline HVX_Vector qhmath_hvx_exp_vf_guard_inf(HVX_Vector sline, const HVX_Vector inf) {
constexpr float kMaxExp = 88.02f;
const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(kMaxExp));
HVX_VectorPred pred_gt_max_exp = Q6_Q_vcmp_gt_VsfVsf(sline, max_exp);
HVX_Vector out = qhmath_hvx_exp_vf(sline);
out = Q6_V_vmux_QVV(pred_gt_max_exp, inf, out);
return out;
}
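A scalar analogue of the guard, as an illustrative sketch only (it assumes a plain expf baseline rather than the HVX polynomial used by qhmath_hvx_exp_vf):

#include <cmath>

static inline float exp_guard_inf_scalar(float x) {
    constexpr float kMaxExp = 88.02f;
    // Matches the vmux above: inputs beyond the threshold return the supplied infinity.
    return (x > kMaxExp) ? INFINITY : std::exp(x);
}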
/**
* @brief Vectorized division with guard for infinite denominators on HVX.
*
* Performs element-wise division num/denom using qhmath_hvx_div_vf and then
* masks out lanes where denom equals the provided inf value, forcing those
* lanes of the result to zero. This is a temporary guard until proper INF
* handling is implemented in the underlying division routine.
*
* @param num Numerator vector (per-lane).
* @param denom Denominator vector (per-lane); lanes equal to inf are zeroed in the output.
* @param coeffs Coefficients used by qhmath_hvx_div_vf for the reciprocal/division approximation.
* @param inf Lane value representing +INF to compare against denom.
* @return Vector of num/denom with lanes set to zero where denom == inf.
*
* @note NaNs, negative infinity, zero denominators, and subnormals are not explicitly handled.
* @see qhmath_hvx_div_vf
*/
inline HVX_Vector qhmath_hvx_div_vf_guard_inf(HVX_Vector num,
HVX_Vector denom,
HVX_VectorPair_x4 coeffs,
const HVX_Vector inf) {
HVX_VectorPred pred_inf = Q6_Q_vcmp_eq_VwVw(denom, inf);
// TODO: fix the inf in div
HVX_Vector out = qhmath_hvx_div_vf(num, denom, coeffs);
out = Q6_V_vmux_QVV(pred_inf, Q6_V_vzero(), out);
return out;
}
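// Propagates a guarded overflow through an addition: lanes where num0 already equals the
// supplied inf stay at inf, all other lanes get the qf32 sum num0 + num1 converted back to sf.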
inline HVX_Vector Q6_Vsf_vadd_VsfVsf_guard_inf(HVX_Vector num0, HVX_Vector num1, const HVX_Vector inf) {
HVX_VectorPred pred0 = Q6_Q_vcmp_eq_VwVw(num0, inf);
HVX_Vector out = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(num0, num1));
out = Q6_V_vmux_QVV(pred0, inf, out);
return out;
}
} // namespace hexagon::vec::math

View File

@ -8,12 +8,18 @@
namespace hexagon {
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
template <typename T, int N> struct HEXAGON_pack {
T val[N];
};
using HVX_Vector_x2 = std::pair<HVX_Vector, HVX_Vector>;
using HVX_Vector_x2 = HEXAGON_pack<HVX_Vector, 2>;
using HVX_Vector_x3 = HEXAGON_pack<HVX_Vector, 3>;
using HVX_Vector_x4 = HEXAGON_pack<HVX_Vector, 4>;
using HVX_VectorPair_x4 = HEXAGON_pack<HVX_VectorPair, 4>;
using HVX_VectorPred_x3 = HEXAGON_pack<HVX_VectorPred, 3>;
typedef union {
HVX_VectorPair VV;
@ -24,8 +30,14 @@ typedef union {
} V;
} HVX_DV;
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
typedef union {
HVX_Vector v;
float f32[kBytesPerVector / sizeof(float)];
uint32_t u32[kBytesPerVector / sizeof(uint32_t)];
__fp16 f16[kBytesPerVector / sizeof(__fp16)];
uint16_t u16[kBytesPerVector / sizeof(uint16_t)];
uint8_t u8[kBytesPerVector];
} HVX_VectorAlias;
inline size_t get_aligned_size(size_t size) {
return (size + kAlignMask) & ~kAlignMask;
@ -383,22 +395,35 @@ _TReturn type_erase_dot_func(const void * src0, const void * src1, size_t count)
inline HVX_Vector vec_silu_f32_f32(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
HVX_Vector one = Q6_V_vsplat_R(0x3F800000);
constexpr float kMaxExp = 88.02f; // log(INF)
const HVX_Vector max_exp = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(kMaxExp));
HVX_Vector one = Q6_V_vsplat_R(0x3F800000);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x));
HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one));
return qhmath_hvx_div_vf(x, denom, coeff);
HVX_Vector exp_neg_x = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(qhmath_hvx_exp_vf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
inline HVX_Vector vec_silu_f16_f16(HVX_Vector x, HVX_VectorPair_x4 coeff) {
using namespace hexagon::vec::math;
HVX_Vector one = Q6_Vh_vsplat_R(0x3c00);
constexpr __fp16 kMaxExp = 11.0898664f; // log(INF)
const HVX_Vector max_exp = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kMaxExp));
HVX_Vector one = Q6_Vh_vsplat_R(0x3c00);
// x/(1.0f + expf(-x));
HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x));
HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one));
return qhmath_hvx_div_vhf(x, denom, coeff);
HVX_Vector exp_neg_x = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(Q6_V_vzero(), x));
HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VhfVhf(exp_neg_x, max_exp);
HVX_Vector denom = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_VhfVhf(qhmath_hvx_exp_vhf(exp_neg_x), one));
HVX_Vector out = qhmath_hvx_div_vhf(x, denom, coeff);
out = Q6_V_vmux_QVV(pred0, Q6_V_vzero(), out);
return out;
}
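The zero-mux in both SiLU variants follows the limit of the function; a minimal scalar sketch under that assumption (not part of the diff):

#include <cmath>

// When -x exceeds ~log(max representable), exp(-x) would overflow; since
// x / (1 + exp(-x)) tends to 0 for large negative x, those lanes are forced to zero.
static inline float silu_guarded_scalar_ref(float x) {
    constexpr float kMaxExp = 88.02f;
    if (-x > kMaxExp) {
        return 0.0f;
    }
    return x / (1.0f + std::exp(-x));
}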
inline HVX_Vector vec_swiglu_f32_f32(HVX_Vector x, HVX_Vector g, HVX_VectorPair_x4 coeff) {

View File

@ -16,16 +16,18 @@ template <typename _TElem,
inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
@ -33,14 +35,19 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
HVX_Vector l0 = Q6_V_valign_VVR(Q6_V_lo_W(curr0), prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
sum0 = _AddFunc(_MpyFunc(l0, l1), sum0);
HVX_Vector h0 = Q6_V_valign_VVR(Q6_V_hi_W(curr0), Q6_V_lo_W(curr0), (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
sum1 = _AddFunc(_MpyFunc(h0, h1), sum1);
HVX_Vector mpy0 = _MpyFunc(l0, l1);
HVX_Vector mpy1 = _MpyFunc(h0, h1);
prev0 = Q6_V_hi_W(curr0);
prev1 = Q6_V_hi_W(curr1);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 2;
} while (src0_vec_ptr_end - src0_vec_ptr > 1);
@ -73,10 +80,11 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(_MpyFunc(s0, s1), sum);
HVX_Vector mpy0 = _MpyFunc(s0, s1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
if (leftover > 0) {
@ -92,7 +100,7 @@ inline _TRet vec_dot_product_impl(const _TElem * src0, const _TElem * src1, size
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes), sum);
}
return _ReduceFunc(sum);
@ -106,36 +114,38 @@ template <typename _TElem,
inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(_TElem);
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * const src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
{
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
if (src0_vec_ptr_end - src0_vec_ptr > 3) {
HVX_Vector sum2 = Q6_V_vzero();
HVX_Vector sum3 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
while (src0_vec_ptr_end - src0_vec_ptr > 3) {
HVX_VectorPair curr00 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
do {
HVX_VectorPair curr00 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10)), sum0);
sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10)), sum1);
HVX_VectorPair curr01 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[1];
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
HVX_VectorPair curr01 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[1];
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
sum2 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11)), sum2);
sum3 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11)), sum3);
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr00), Q6_V_lo_W(curr10));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr00), Q6_V_hi_W(curr10));
src0_vec_ptr += 4;
src1_vec_ptr += 4;
} while (src0_vec_ptr_end - src0_vec_ptr > 3);
HVX_Vector mpy2 = _MpyFunc(Q6_V_lo_W(curr01), Q6_V_lo_W(curr11));
HVX_Vector mpy3 = _MpyFunc(Q6_V_hi_W(curr01), Q6_V_hi_W(curr11));
sum0 = _AddFunc(sum2, sum0);
sum1 = _AddFunc(sum3, sum1);
}
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 4;
src1_vec_ptr += 4;
};
if (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
@ -143,8 +153,11 @@ inline _TRet vec_dot_product_aligned_impl(const _TElem * src0, const _TElem * sr
src0_vec_ptr += 2;
src1_vec_ptr += 2;
sum0 = _AddFunc(_MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1)), sum0);
sum1 = _AddFunc(_MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1)), sum1);
HVX_Vector mpy0 = _MpyFunc(Q6_V_lo_W(curr0), Q6_V_lo_W(curr1));
HVX_Vector mpy1 = _MpyFunc(Q6_V_hi_W(curr0), Q6_V_hi_W(curr1));
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
sum = _AddFunc(sum0, sum1);
@ -195,6 +208,7 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
const _TElem0 * const src0_ptr_end = src0 + count;
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
@ -202,27 +216,33 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector sum = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
do {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector_x2 s0_pair = _ExpandFunc(s0, kOneV);
HVX_Vector l1 = Q6_V_valign_VVR(Q6_V_lo_W(curr1), prev1, (size_t) src1);
sum0 = _AddFunc(_MpyFunc(s0_pair.first, l1), sum0);
HVX_Vector curr10 = src1_vec_ptr[0];
HVX_Vector curr11 = src1_vec_ptr[1];
HVX_Vector h1 = Q6_V_valign_VVR(Q6_V_hi_W(curr1), Q6_V_lo_W(curr1), (size_t) src1);
sum1 = _AddFunc(_MpyFunc(s0_pair.second, h1), sum1);
HVX_Vector l1 = Q6_V_valign_VVR(curr10, prev1, (size_t) src1);
HVX_Vector h1 = Q6_V_valign_VVR(curr11, curr10, (size_t) src1);
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], l1);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], h1);
prev0 = curr0;
prev1 = Q6_V_hi_W(curr1);
prev1 = curr11;
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
src0_vec_ptr++;
src1_vec_ptr += 2;
} while (src1_vec_ptr_end - src1_vec_ptr > 1);
@ -245,8 +265,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
if (has_remaining_src1_vector) {
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = _AddFunc(_MpyFunc(s0_pair.first, s1), sum);
prev1 = curr1;
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy0, sum);
}
bool should_fetch_src1 = leftover1 != 0 || !hexagon::is_addr_aligned(src1_vec_ptr);
@ -254,9 +277,11 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
src1_vec_ptr += should_fetch_src1 ? 1 : 0;
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
prev0 = curr0;
prev1 = curr1;
sum = _AddFunc(_MpyFunc(has_remaining_src1_vector ? s0_pair.second : s0_pair.first, s1), sum);
HVX_Vector mpy1 = _MpyFunc(has_remaining_src1_vector ? s0_pair.val[1] : s0_pair.val[0], s1);
prev1 = curr1;
sum = _AddFunc(mpy1, sum);
}
if (leftover1 > 0) {
@ -274,8 +299,8 @@ inline _TRet vec_dot_product_mixed_impl(const _TElem0 * src0, const _TElem1 * sr
HVX_Vector_x2 curr0_pair = _ExpandFunc(curr0, kOneV);
curr0 = leftover1 == leftover0 ? curr0_pair.first : curr0_pair.second;
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), Q6_V_vzero(), leftover_bytes1), sum);
curr0 = leftover1 == leftover0 ? curr0_pair.val[0] : curr0_pair.val[1];
sum = _AddFunc(Q6_V_valign_VVR(_MpyFunc(curr0, curr1), kZeroV, leftover_bytes1), sum);
}
return _ReduceFunc(sum);
@ -299,44 +324,55 @@ inline _TRet vec_dot_product_mix_aligned_impl(const _TElem0 * src0, const _TElem
constexpr const __fp16 kOne = 1.0f;
const HVX_Vector kOneV = Q6_Vh_vsplat_R(reinterpret_cast<const uint16_t &>(kOne));
const HVX_Vector kZeroV = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector * const src1_vec_ptr_end = ((HVX_Vector *) src1) + count / kElementsPerVector1;
HVX_Vector sum0 = Q6_V_vzero();
HVX_Vector sum1 = Q6_V_vzero();
HVX_Vector sum0 = kZeroV;
HVX_Vector sum1 = kZeroV;
if (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_Vector sum2 = Q6_V_vzero();
HVX_Vector sum3 = Q6_V_vzero();
while (src1_vec_ptr_end - src1_vec_ptr > 3) {
HVX_Vector curr0_lo = src0_vec_ptr[0];
HVX_Vector curr10_lo = src1_vec_ptr[0];
do {
HVX_VectorPair curr0 = reinterpret_cast<HVX_VectorPair *>(src0_vec_ptr)[0];
HVX_Vector_x2 curr00 = _ExpandFunc(Q6_V_lo_W(curr0), kOneV);
HVX_VectorPair curr10 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(curr00.first, Q6_V_lo_W(curr10)), sum0);
sum1 = _AddFunc(_MpyFunc(curr00.second, Q6_V_hi_W(curr10)), sum1);
HVX_Vector curr0_hi = src0_vec_ptr[1];
HVX_Vector_x2 curr00 = _ExpandFunc(curr0_lo, kOneV);
HVX_Vector_x2 curr01 = _ExpandFunc(Q6_V_hi_W(curr0), kOneV);
HVX_VectorPair curr11 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[1];
sum2 = _AddFunc(_MpyFunc(curr01.first, Q6_V_lo_W(curr11)), sum2);
sum3 = _AddFunc(_MpyFunc(curr01.second, Q6_V_hi_W(curr11)), sum3);
HVX_Vector curr10_hi = src1_vec_ptr[1];
HVX_Vector_x2 curr01 = _ExpandFunc(curr0_hi, kOneV);
src0_vec_ptr += 2;
src1_vec_ptr += 4;
} while (src1_vec_ptr_end - src1_vec_ptr > 3);
HVX_Vector mpy0 = _MpyFunc(curr00.val[0], curr10_lo);
HVX_Vector mpy1 = _MpyFunc(curr00.val[1], curr10_hi);
sum0 = _AddFunc(sum0, sum2);
sum1 = _AddFunc(sum1, sum3);
}
HVX_Vector curr11_lo = src1_vec_ptr[2];
HVX_Vector curr11_hi = src1_vec_ptr[3];
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
HVX_Vector mpy2 = _MpyFunc(curr01.val[0], curr11_lo);
HVX_Vector mpy3 = _MpyFunc(curr01.val[1], curr11_hi);
sum0 = _AddFunc(mpy2, sum0);
sum1 = _AddFunc(mpy3, sum1);
src0_vec_ptr += 2;
src1_vec_ptr += 4;
};
if (src1_vec_ptr_end - src1_vec_ptr > 1) {
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV);
HVX_Vector curr0 = src0_vec_ptr[0];
HVX_Vector curr1_lo = src1_vec_ptr[0];
HVX_VectorPair curr1 = reinterpret_cast<HVX_VectorPair *>(src1_vec_ptr)[0];
sum0 = _AddFunc(_MpyFunc(s0_pair.first, Q6_V_lo_W(curr1)), sum0);
sum1 = _AddFunc(_MpyFunc(s0_pair.second, Q6_V_hi_W(curr1)), sum1);
HVX_Vector_x2 s0_pair = _ExpandFunc(curr0, kOneV);
HVX_Vector curr1_hi = src1_vec_ptr[1];
HVX_Vector mpy0 = _MpyFunc(s0_pair.val[0], curr1_lo);
HVX_Vector mpy1 = _MpyFunc(s0_pair.val[1], curr1_hi);
sum0 = _AddFunc(mpy0, sum0);
sum1 = _AddFunc(mpy1, sum1);
}
return _ReduceFunc(_AddFunc(sum0, sum1));
@ -360,14 +396,14 @@ inline void vec_scale_impl(const _TParam * src, float scale, _TParam * dst, size
HVX_VectorPair curr = reinterpret_cast<HVX_VectorPair *>(src_vec_ptr)[0];
src_vec_ptr += 2;
HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src);
dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec);
HVX_Vector lo = Q6_V_valign_VVR(Q6_V_lo_W(curr), prev, (size_t) src);
HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src);
prev = Q6_V_hi_W(curr);
HVX_Vector hi = Q6_V_valign_VVR(Q6_V_hi_W(curr), Q6_V_lo_W(curr), (size_t) src);
dst_vec_ptr[0] = _Func(lo, dst_vec_ptr, scale_vec);
dst_vec_ptr[1] = _Func(hi, dst_vec_ptr + 1, scale_vec);
dst_vec_ptr += 2;
prev = Q6_V_hi_W(curr);
}
if (src_vec_end - src_vec_ptr > 0) {
@ -405,14 +441,16 @@ template <typename _TData> inline void vec_zero_impl(_TData * src, size_t count)
HVX_UVector * src_vec_ptr = ((HVX_UVector *) src);
HVX_UVector * const src_vec_end = ((HVX_UVector *) src) + (count / kElementsPerVector);
const HVX_Vector kZeroV = Q6_V_vzero();
while (src_vec_end - src_vec_ptr > 1) {
src_vec_ptr[0] = Q6_V_vzero();
src_vec_ptr[1] = Q6_V_vzero();
src_vec_ptr[0] = kZeroV;
src_vec_ptr[1] = kZeroV;
src_vec_ptr += 2;
}
if (src_vec_end - src_vec_ptr > 0) {
src_vec_ptr[0] = Q6_V_vzero();
src_vec_ptr[0] = kZeroV;
src_vec_ptr++;
}
@ -420,7 +458,7 @@ template <typename _TData> inline void vec_zero_impl(_TData * src, size_t count)
if (leftover > 0) {
// handle the leftover elements
const size_t leftover_bytes = leftover * sizeof(_TData);
q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, Q6_V_vzero());
q6op_vstu_variable_ARV(src_vec_ptr, leftover_bytes, kZeroV);
}
}

View File

@ -90,13 +90,16 @@ bool host_graph::compute() {
return false;
}
LOG_DEBUG("[%p]host_graph::compute started\n", (void *) this);
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
auto status = npu_device_graph_compute(_device_handle, _graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);
LOG_DEBUG("[%p]host_graph::compute finished with failure\n", (void *) this);
return false;
}
LOG_DEBUG("[%p]host_graph::compute finished\n", (void *) this);
return true;
}

View File

@ -242,6 +242,7 @@ bool npu_device::init_rpc_mem() {
bool npu_device::init_device_lib() {
if (!_device_handle) {
set_fast_rpc_stack_size(_rpc_interface, _dsp_domain_id, NPU_THREAD_STACK_SIZE);
auto arch = get_dsp_arch(_rpc_interface, _dsp_domain_id);
const auto & device_lib_info = get_device_library_info(arch);
std::string device_lib_uri = device_lib_info.device_lib_uri;

View File

@ -1,14 +1,14 @@
#pragma once
#include <list>
#include <type_traits>
#include <vector>
#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
#include "util.hpp"
#include <list>
#include <type_traits>
#include <vector>
namespace hexagon {
// TODO: merge this with device tensor?
@ -62,7 +62,7 @@ class host_tensor {
~host_tensor() {
LOG_DEBUG("host_tensor(%p) destroy, device_tensor_handle: %p\n", (void *) this, (void *) _device_tensor_handle);
if (_device_tensor_handle) {
if (_device_tensor_handle != npu_device_INVALID_DEVICE_TENSOR_HANDLE) {
npu_device_tensor_free(_device_handle, _device_tensor_handle);
// TODO: figure out why the _ggml_tensor is invalid here
}
@ -113,8 +113,11 @@ class host_tensor {
if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
params_changed = true;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this,
(int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2],
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n",
(void *) this,
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
@ -136,19 +139,29 @@ class host_tensor {
if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
params_changed = true;
memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this,
(void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]);
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n",
(void *) this,
(void *) _info_update.src_handles[0],
(void *) _info_update.src_handles[1]);
}
if (params_changed) {
npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
} else {
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
}
}
@ -174,9 +187,13 @@ class host_tensor {
#endif
}
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n",
(void *) this,
ggml_op_desc(_ggml_tensor),
(int) _info_update.params[0],
(int) _info_update.params[1],
(int) _info_update.params[2],
(int) _info_update.params[3]);
return _info_update;
}
@ -192,11 +209,21 @@ class host_tensor {
}
int get_desc(char * buffer, size_t size) const {
return snprintf(buffer, size, "%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p",
_ggml_tensor->name, (long) _ggml_tensor->ne[0], (long) _ggml_tensor->ne[1],
(long) _ggml_tensor->ne[2], (long) _ggml_tensor->ne[3], (long) _ggml_tensor->nb[0],
(long) _ggml_tensor->nb[1], (long) _ggml_tensor->nb[2], (long) _ggml_tensor->nb[3],
ggml_type_name(_ggml_tensor->type), (void *) this, (void *) _ggml_tensor,
return snprintf(buffer,
size,
"%s[%ldx%ldx%ldx%ld], nb[%ld,%ld,%ld,%ld], %s, addr: %p, ggml: %p, handle:%p",
_ggml_tensor->name,
(long) _ggml_tensor->ne[0],
(long) _ggml_tensor->ne[1],
(long) _ggml_tensor->ne[2],
(long) _ggml_tensor->ne[3],
(long) _ggml_tensor->nb[0],
(long) _ggml_tensor->nb[1],
(long) _ggml_tensor->nb[2],
(long) _ggml_tensor->nb[3],
ggml_type_name(_ggml_tensor->type),
(void *) this,
(void *) _ggml_tensor,
(void *) _device_tensor_handle);
}

View File

@ -149,6 +149,23 @@ void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_
}
}
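// Requests a larger stack for the FastRPC worker threads of this DSP session.
// prio = -1 is assumed to leave the thread priority at its default (a hedged reading of
// the remote_session_control thread-params request, not something stated in this diff).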
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size) {
constexpr const uint32_t FASTRPC_THREAD_PARAMS = 1;
if (!rpc_interface || !rpc_interface->is_valid()) {
return;
}
remote_rpc_thread_params tp = {};
tp.domain = domain_id;
tp.prio = -1;
tp.stack_size = stack_size;
auto ret = rpc_interface->remote_session_control(FASTRPC_THREAD_PARAMS, &tp, sizeof(tp));
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to set fast RPC stack size: 0x%x\n", ret);
}
}
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
if (dst == nullptr) {
snprintf(out, max_len, "null");
@ -161,15 +178,30 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
switch (dims) {
default:
case 4:
snprintf(out, max_len, "%s[%ldx%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2], (long) tensor->ne[3]);
snprintf(out,
max_len,
"%s[%ldx%ldx%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1],
(long) tensor->ne[2],
(long) tensor->ne[3]);
break;
case 3:
snprintf(out, max_len, "%s[%ldx%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
(long) tensor->ne[1], (long) tensor->ne[2]);
snprintf(out,
max_len,
"%s[%ldx%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1],
(long) tensor->ne[2]);
break;
case 2:
snprintf(out, max_len, "%s[%ldx%ld]", ggml_type_name(tensor->type), (long) tensor->ne[0],
snprintf(out,
max_len,
"%s[%ldx%ld]",
ggml_type_name(tensor->type),
(long) tensor->ne[0],
(long) tensor->ne[1]);
break;
case 1:
@ -201,8 +233,14 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
char src3_desc[256];
print_tensor(dst->src[3], src3_desc, sizeof(src3_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s, src3: %s", dst_desc, src0_desc,
src1_desc, src2_desc, src3_desc);
snprintf(out,
max_len,
"dst: %s, src0: %s, src1: %s, src2: %s, src3: %s",
dst_desc,
src0_desc,
src1_desc,
src2_desc,
src3_desc);
return;
}
case 3:
@ -213,8 +251,8 @@ void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len) {
print_tensor(dst->src[1], src1_desc, sizeof(src1_desc));
char src2_desc[256];
print_tensor(dst->src[2], src2_desc, sizeof(src2_desc));
snprintf(out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc,
src2_desc);
snprintf(
out, max_len, "dst: %s, src0: %s, src1: %s, src2: %s", dst_desc, src0_desc, src1_desc, src2_desc);
return;
}
case 2:

View File

@ -23,6 +23,7 @@ hexagon_dsp_arch get_dsp_arch(common::rpc_interface_ptr rpc_interface, uint32_t
const char * get_dsp_arch_desc(hexagon_dsp_arch arch);
void enable_unsigned_dsp_module(common::rpc_interface_ptr rpc_interface, uint32_t domain_id);
void set_fast_rpc_stack_size(common::rpc_interface_ptr rpc_interface, uint32_t domain_id, uint32_t stack_size);
void get_op_tensor_desc(const ggml_tensor * dst, char * out, size_t max_len);

View File

@ -13,6 +13,8 @@ const uint32_t NPU_ROPE_TYPE_NEOX = 2;
const uint32_t NPU_ROPE_TYPE_MROPE = 8;
const uint32_t NPU_ROPE_TYPE_VISION = 24;
const uint32_t NPU_THREAD_STACK_SIZE = 64 * 1024;
interface npu_device : remote_handle64{
typedef int64_t ne_type[DEVICE_TENSOR_MAX_DIMS];