feat: perf opt part4 (#43)

* wip

* refactor: rewrite dequantize_row_q4_0 by intrinsic

* log for debug

* fix q4 intrinsic

* small opt

* wip

* wip

* add vtcm_quota_size

* add perf log for hexagon-npu backend

* wip

* add log

* sync after a specific op

* increase worker thread priority

* fix unbalanced thread slice

* small slice to fit in VTCM cache

* limit the supported row element size

* opt 4_0 dequant

* fix q4 dequant

* add power_utils

* add rms_norm

* wip

* enable rms_norm f32

* fix rms_norm with param

* fix compiling flags

* use float

* fix small row size

* vectorized rms norm

* wip

* read 2 vectors

* rename

* add perf log on update

* also set empty tensor handles

* merge some rpc functions

* opt param update

* wip

* print more log

* add struct for update param config

* add npu_device_graph_set_tensor_with_param

* merge tensor and params update

* wip

* wip

* make as template to reuse

* vectorize dequantize_row_q8_0

* opt

* avoid using union to store q data

* wip

* wip

* wip
Committed by nullname on 2025-05-28 00:00:42 +08:00 (via GitHub)
parent 2306f82a58
commit c23ab465c0
32 changed files with 1014 additions and 397 deletions


@ -231,6 +231,11 @@ else()
build_idl(idl/hexagon_npu.idl hexagon_npu_skel_OBJS)
add_subdirectory(${HEXAGON_SDK_ROOT}/libs/qprintf qprintf_dir)
target_include_directories(hexagon_npu_skel_OBJS PUBLIC
${HEXAGON_SDK_ROOT}/libs/qprintf/inc/
)
# disable warnings for the skel
set_source_files_properties(
${skel_srcs}
@ -239,12 +244,12 @@ else()
)
add_library(hexagon_npu_skel SHARED $<TARGET_OBJECTS:hexagon_npu_skel_OBJS>)
target_link_libraries(hexagon_npu_skel
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++abi.a
${HEXAGON_LIB_DIR}/${HEXAGON_ARCH}/G0/pic/libc++.a
)
set_target_properties(hexagon_npu_skel PROPERTIES OUTPUT_NAME "hexagon_npu_skel_${HEXAGON_ARCH}")
target_link_libraries(hexagon_npu_skel qprintf_static)
copy_binaries(hexagon_npu_skel)
endif()


@ -106,6 +106,7 @@ int npu_device_open(const char * uri, remote_handle64 * h) {
}
*h = reinterpret_cast<remote_handle64>(context);
DEVICE_LOG_INFO("NPU device context created: %p", (void *) *h);
return AEE_SUCCESS;
}
@ -117,6 +118,7 @@ int npu_device_close(remote_handle64 h) {
}
delete context;
DEVICE_LOG_INFO("NPU device context destroyed: %p", (void *) h);
return AEE_SUCCESS;
}
@ -130,6 +132,12 @@ AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tens
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
npu_device_tensor_op op, boolean * is_supported) {
NPU_UNUSED(_h);
if (!src0 || !src1 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
return AEE_EINVARGS;
}
*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
return AEE_SUCCESS;
}
@ -147,28 +155,15 @@ AEEResult npu_device_tensor_init(remote_handle64 _h, const npu_device_tensor_con
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_set_src(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle, uint64_t index,
npu_device_tensor_handle_t src) {
AEEResult npu_device_tensor_update_params(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
const npu_device_tensor_update_config * config) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
if (!tensor || !config) {
return AEE_EINVHANDLE;
}
auto * src_tensor = tensor_from_handle(src);
tensor->set_src(index, src_tensor);
return AEE_SUCCESS;
}
AEEResult npu_device_tensor_set_op(remote_handle64 _h, npu_device_tensor_handle_t tensor_handle,
npu_device_tensor_op op) {
NPU_UNUSED(_h);
auto * tensor = tensor_from_handle(tensor_handle);
if (!tensor) {
return AEE_EINVHANDLE;
}
tensor->set_op(op);
tensor->update_config(*config);
return AEE_SUCCESS;
}
@ -206,6 +201,29 @@ AEEResult npu_device_graph_set_tensor(remote_handle64 _h, npu_device_graph_handl
return AEE_SUCCESS;
}
AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_graph_handle_t graph_handle,
const npu_device_tensor_handle_t * tensor_handles,
int tensor_handlesLen,
const npu_device_tensor_update_config * tensor_params,
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
tensor_handlesLen != tensor_paramsLen) {
return AEE_EINVHANDLE;
}
graph->set_tensor(tensor_handles, tensor_handlesLen);
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}
return AEE_SUCCESS;
}
AEEResult npu_device_graph_compute(remote_handle64 _h, npu_device_graph_handle_t graph_handle) {
auto dev_ctx = device_context_from_handle(_h);
if (!dev_ctx) {


@ -10,7 +10,8 @@
namespace hexagon {
graph::graph() noexcept {
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
}
graph::~graph() noexcept {
@ -45,6 +46,8 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
}
DEVICE_LOG_DEBUG("graph(%p) compute\n", (void *) this);
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
@ -61,6 +64,8 @@ void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size
}
void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };
for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
auto op = dst->get_op();
@ -69,14 +74,14 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d not supported\n", (void *) this, i, op);
return;
}
hexagon::compute_params params = { thread_idx, thread_count, _f16_to_f32_table };
if (!func(dst, &params)) {
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
}
// TODO: figure out which ops need to sync
if (pool) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);
const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
pool->sync_thread();
}
dst->invalidate();


@ -25,6 +25,7 @@ class graph {
std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
size_t _vtcm_quota_size = 0;
const float * _f16_to_f32_table = nullptr;
DISABLE_COPY_AND_MOVE(graph);


@ -5,6 +5,8 @@
#include <hexagon_types.h>
#include <HTP/core/intrinsics.h>
#include <type_traits>
#include "op_mul_mat.hpp"
#include "quants.hpp"
@ -17,7 +19,7 @@ inline void vec_op_impl(const _TyData * src0, const _TyData * src1, size_t count
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector * optr = ((HVX_Vector *) dst);
HVX_Vector * optr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
@ -108,6 +110,12 @@ template <typename _TyData> struct get_data_type<void (*)(const _TyData *, const
using type = _TyData;
};
template <typename _TyData, typename _TyParam>
struct get_data_type<void (*)(const _TyData *, size_t, _TyParam, _TyData *)> {
using type = _TyData;
using param_type = typename std::remove_cv<typename std::remove_reference<_TyData>::type>::type;
};
template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
@ -166,6 +174,16 @@ template <auto _RowFunc> bool element_wise_op(hexagon::tensor * out, hexagon::co
return true;
}
bool is_same_shape(const npu_device_tensor_spec & src, const npu_device_tensor_spec & dst) {
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
if (src.ne[i] != dst.ne[i]) {
return false;
}
}
return true;
}
bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (op != NPU_OP_ADD && op != NPU_OP_SUB && op != NPU_OP_MUL) {
@ -196,12 +214,149 @@ bool is_element_wise_op_supported(const npu_device_tensor_spec & src0, const npu
return false;
}
for (size_t i = 0; i < DEVICE_TENSOR_MAX_DIMS; ++i) {
if (src0.ne[i] != dst.ne[i]) {
DEVICE_LOG_DEBUG("[%s]src0.ne[%zu] and dst.ne[%zu] not match: %lld vs %lld\n", hexagon::op_get_name(op), i,
i, (long long) src0.ne[i], (long long) dst.ne[i]);
return false;
if (!is_same_shape(src0, dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return true;
}
void rms_norm_vec_f32(const float * src, size_t count, float eps, float * dst) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * src_vec_ptr = ((HVX_Vector *) src);
HVX_Vector * src_vec_end = ((HVX_Vector *) src) + (count / kElementsPerVector);
HVX_Vector prev = *src_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
while (src_vec_ptr < src_vec_end) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(s0, s0));
prev = curr;
}
const size_t leftover = count % kElementsPerVector;
const size_t leftover_bytes = leftover * sizeof(float);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum,
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr, curr), Q6_V_vzero(), leftover_bytes));
}
const float mean = hexagon::vec_reduction_f32(sum) / count; // TODO: figure out how to do division in vector
const float scale = 1.0f / sqrtf(mean + eps); // TODO: use built-in BLAS sqrtf?
HVX_Vector scale_vec = Q6_V_vsplat_R(reinterpret_cast<const uint32_t &>(scale));
src_vec_ptr = ((HVX_Vector *) src);
prev = *src_vec_ptr++;
HVX_Vector * dst_vec_ptr = ((HVX_Vector *) dst); // framework will ensure the dst is aligned
while (src_vec_ptr < src_vec_end) {
HVX_Vector curr = *src_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
*dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
prev = curr;
}
if ((src_vec_end - ((HVX_Vector *) src)) > 0) {
// handle the last vector
bool src_ptr_aligned = hexagon::is_addr_aligned(src_vec_ptr);
HVX_Vector curr = src_ptr_aligned ? prev : *src_vec_ptr;
src_vec_ptr = src_ptr_aligned ? src_vec_ptr : src_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr, prev, (size_t) src);
*dst_vec_ptr++ = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, scale_vec));
prev = curr;
}
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr =
(leftover_bytes + hexagon::unaligned_bytes(src_vec_ptr) > hexagon::kBytesPerVector) ? *src_vec_ptr : prev;
curr = Q6_V_valign_VVR(curr, prev, (size_t) src);
q6op_vstu_variable_ARV(dst_vec_ptr, leftover_bytes, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(curr, scale_vec)));
}
}
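The HVX routine above is the vectorized form of plain RMS normalization: each element is scaled by 1/sqrtf(mean(x^2) + eps), computed in two passes over the row. A scalar reference sketch for comparison, assuming the same (src, count, eps, dst) contract (editorial illustration only, not part of the commit):

// Scalar reference for rms_norm_vec_f32 (illustrative sketch, not in the commit).
#include <cmath>
#include <cstddef>
static void rms_norm_ref_f32(const float * src, size_t count, float eps, float * dst) {
    float sum_sq = 0.0f;
    for (size_t i = 0; i < count; ++i) {
        sum_sq += src[i] * src[i];            // pass 1: accumulate x^2 (qf32 adds in the HVX code)
    }
    const float mean  = sum_sq / count;       // mean of squares
    const float scale = 1.0f / sqrtf(mean + eps);
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] * scale;              // pass 2: scale, matching the second HVX loop
    }
}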
// TODO: merge with element_wise_op?
template <auto _RowFunc> bool unary_op(hexagon::tensor * out, hexagon::compute_params * params) {
using data_type = typename get_data_type<decltype(_RowFunc)>::type;
using param_type = typename get_data_type<decltype(_RowFunc)>::param_type;
if (!out) {
return false;
}
static_assert(DEVICE_TENSOR_MAX_DIMS == 4, "element_wise_op requires max dims 4");
auto * src0 = out->get_src(0);
if (!src0) {
return true; // skip if no src
}
const auto * src0_ptr = reinterpret_cast<const uint8_t *>(src0->get_read_buffer());
auto * dst_ptr = reinterpret_cast<uint8_t *>(out->get_write_buffer());
auto total_rows = out->get_ne(3) * out->get_ne(2) * out->get_ne(1);
const auto rows_per_cube = out->get_ne(2) * out->get_ne(1);
const auto start_end = hexagon::get_thread_work_slice(total_rows, params->tidx, params->tcnt);
if (start_end.first >= start_end.second) {
return true;
}
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER(out, params->tidx);
const auto param = out->get_op_param<param_type>(0);
const size_t valid_row_bytes = src0->get_ne(0) * sizeof(data_type);
for (int64_t ir = start_end.first; ir < start_end.second; ++ir) {
const auto i03 = ir / rows_per_cube;
const auto i02 = ir / out->get_ne(1) - i03 * out->get_ne(2);
const auto i01 = ir % out->get_ne(1); // TODO: should we use divide instead of mod?
auto * src0_row = src0_ptr + i03 * src0->get_nb(3) + i02 * src0->get_nb(2) + i01 * src0->get_nb(1);
auto * dst_row = dst_ptr + i03 * out->get_nb(3) + i02 * out->get_nb(2) + i01 * out->get_nb(1);
if (ir + 1 < start_end.second) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), valid_row_bytes);
}
_RowFunc(reinterpret_cast<const data_type *>(src0_row), static_cast<size_t>(out->get_ne(0)), param,
reinterpret_cast<data_type *>(dst_row));
}
return true;
}
bool is_unary_op_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (op != NPU_OP_RMS_NORM) {
DEVICE_LOG_DEBUG("[%s]unsupported\n", hexagon::op_get_name(op));
return false;
}
if (dst.type != src0.type) {
DEVICE_LOG_DEBUG("[%s]src0.type and dst.type mismatch: %s vs %s\n", hexagon::op_get_name(op),
hexagon::get_type_name(src0.type), hexagon::get_type_name(dst.type));
return false;
}
if (dst.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]unsupported data type: %s\n", hexagon::op_get_name(op), hexagon::get_type_name(dst.type));
return false;
}
if (!is_same_shape(src0, dst)) {
DEVICE_LOG_DEBUG("[%s]src0 and dst have different shape\n", hexagon::op_get_name(op));
return false;
}
return true;
@ -211,6 +366,7 @@ struct op_capabilities {
npu_device_tensor_op op;
hexagon::op_is_supported_func_type is_supported;
hexagon::compute_func_type compute_funcs[NPU_DATA_TYPE_COUNT];
bool requires_thread_barrier = false;
};
constexpr const op_capabilities kOpCapabilities[] = {
@ -219,22 +375,36 @@ constexpr const op_capabilities kOpCapabilities[] = {
{
hexagon::mul_mat_f32, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, },
{ NPU_OP_ADD,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
} },
{ NPU_OP_SUB,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
} },
{ NPU_OP_MUL,
is_element_wise_op_supported, {
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
} },
}, true,
},
{
NPU_OP_ADD, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vadd_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vadd_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_SUB, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vsub_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vsub_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_MUL, is_element_wise_op_supported,
{
element_wise_op<vec_op_f32_f32<vmul_f32_f32>>, // NPU_DATA_TYPE_F32
element_wise_op<vec_op_f16_f16<vmul_f16_f16>>, // NPU_DATA_TYPE_F16
}, false,
},
{
NPU_OP_RMS_NORM, is_unary_op_supported,
{
unary_op<rms_norm_vec_f32>, // NPU_DATA_TYPE_F32
nullptr, // NPU_DATA_TYPE_F16
}, false,
},
};
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] == hexagon::mul_mat_f32,
@ -243,6 +413,8 @@ static_assert(kOpCapabilities[NPU_OP_MUL_MAT].compute_funcs[NPU_DATA_TYPE_F32] =
static_assert(std::size(kOpCapabilities) == NPU_OP_COUNT);
static_assert(kOpCapabilities[NPU_OP_MUL_MAT].op == NPU_OP_MUL_MAT, "kOpArray[NPU_OP_MUL_MAT].op != NPU_OP_MUL_MAT");
static_assert(kOpCapabilities[NPU_OP_MUL].op == NPU_OP_MUL, "kOpArray[NPU_OP_MUL].op != NPU_OP_MUL");
static_assert(kOpCapabilities[NPU_OP_RMS_NORM].op == NPU_OP_RMS_NORM,
"kOpArray[NPU_OP_RMS_NORM].op != NPU_OP_RMS_NORM");
hexagon::compute_func_type get_compute_func_impl(npu_device_tensor_op op, npu_device_tensor_data_type type) {
if (op >= NPU_OP_COUNT) {
@ -260,6 +432,14 @@ compute_func_type get_compute_func(tensor * dst) {
return get_compute_func_impl(dst->get_op(), dst->get_type());
}
bool requires_thread_barrier(npu_device_tensor_op op) {
if (op >= NPU_OP_COUNT) {
return false;
}
return kOpCapabilities[op].requires_thread_barrier;
}
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op) {
if (get_compute_func_impl(op, dst.type) == nullptr) {


@ -6,6 +6,8 @@ namespace hexagon {
compute_func_type get_compute_func(tensor * dst);
bool requires_thread_barrier(npu_device_tensor_op op);
bool support_op(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op);


@ -3,44 +3,43 @@
#include <HTP/core/intrinsics.h>
#include "quants.hpp"
#include "thread_pool.hpp" // TODO: remove this dependency
#include "vtcm_mem.hpp"
namespace {
inline float vec_reduction_f32(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");
// TODO: do we have a better way to do the reduction?
switch (kFloatsPerVector) {
default:
case 32:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
// fallthrough
case 16:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
break;
}
return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(sums));
}
inline float vec_dot_product_f32_f32(const float * src0, const float * src1, size_t count) {
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
HVX_Vector sum = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + count / kElementsPerVector;
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum = Q6_V_vzero();
while (iptr0 < iptr0_end) {
HVX_Vector curr0 = *iptr0++;
HVX_Vector curr1 = *iptr1++;
while (src0_vec_ptr_end - src0_vec_ptr > 1) {
HVX_Vector curr0_lo = src0_vec_ptr[0];
HVX_Vector curr0_hi = src0_vec_ptr[1];
HVX_Vector curr1_lo = src1_vec_ptr[0];
HVX_Vector curr1_hi = src1_vec_ptr[1];
HVX_Vector l0 = Q6_V_valign_VVR(curr0_lo, prev0, (size_t) src0);
HVX_Vector l1 = Q6_V_valign_VVR(curr1_lo, prev1, (size_t) src1);
HVX_Vector h0 = Q6_V_valign_VVR(curr0_hi, curr0_lo, (size_t) src0);
HVX_Vector h1 = Q6_V_valign_VVR(curr1_hi, curr1_lo, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(l0, l1), sum);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(h0, h1), sum);
prev0 = curr0_hi;
prev1 = curr1_hi;
src0_vec_ptr += 2;
src1_vec_ptr += 2;
}
if (src0_vec_ptr_end - src0_vec_ptr > 0) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
@ -48,17 +47,17 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
prev1 = curr1;
}
if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(s0, s1), sum);
@ -70,19 +69,21 @@ inline float vec_dot_product_f32_f32(const float * src0, const float * src1, siz
const size_t leftover_bytes = leftover * sizeof(float);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr0 =
(leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 =
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
sum = Q6_Vqf32_vadd_Vqf32Vqf32(
Q6_V_valign_VVR(Q6_Vqf32_vmpy_VsfVsf(curr0, curr1), Q6_V_vzero(), leftover_bytes), sum);
}
return vec_reduction_f32(sum);
return hexagon::vec_reduction_f32(sum);
}
// TODO: merge with vec_dot_product_f32_f32?
@ -90,17 +91,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
constexpr const size_t kElementsPerVector = hexagon::kBytesPerVector / sizeof(npu_device_fp16_t);
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
HVX_Vector * iptr0 = ((HVX_Vector *) src0);
HVX_Vector * iptr0_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * iptr1 = ((HVX_Vector *) src1);
HVX_Vector prev0 = *iptr0++;
HVX_Vector prev1 = *iptr1++;
HVX_Vector sum_hi = Q6_V_vzero();
HVX_Vector sum_lo = Q6_V_vzero();
HVX_Vector * src0_vec_ptr = ((HVX_Vector *) src0);
HVX_Vector * src0_vec_ptr_end = ((HVX_Vector *) src0) + (count / kElementsPerVector);
HVX_Vector * src1_vec_ptr = ((HVX_Vector *) src1);
HVX_Vector prev0 = *src0_vec_ptr++;
HVX_Vector prev1 = *src1_vec_ptr++;
HVX_Vector sum_hi = Q6_V_vzero();
HVX_Vector sum_lo = Q6_V_vzero();
while (iptr0 < iptr0_end) {
HVX_Vector curr0 = *iptr0++;
HVX_Vector curr1 = *iptr1++;
while (src0_vec_ptr < src0_vec_ptr_end) {
HVX_Vector curr0 = *src0_vec_ptr++;
HVX_Vector curr1 = *src1_vec_ptr++;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
@ -110,17 +111,17 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
prev1 = curr1;
}
if ((iptr0_end - ((HVX_Vector *) src0)) > 0) {
if ((src0_vec_ptr_end - ((HVX_Vector *) src0)) > 0) {
// handle the last vector
// see also:
// https://github.com/UbiquitousLearning/mllm/blob/babf4410352ce8730824c87699c025a0d4ce3a6f/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAMul.cpp#L147
// or qualcomm sdk libs\qhl_hvx\src\qhblas_hvx\qhblas_hvx_aw_vector_add_ah.c
bool iptr0_aligned = hexagon::is_addr_aligned(iptr0);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *iptr0;
iptr0 = iptr0_aligned ? iptr0 : iptr0 + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(iptr1);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *iptr1;
iptr1 = iptr1_aligned ? iptr1 : iptr1 + 1;
bool iptr0_aligned = hexagon::is_addr_aligned(src0_vec_ptr);
HVX_Vector curr0 = iptr0_aligned ? prev0 : *src0_vec_ptr;
src0_vec_ptr = iptr0_aligned ? src0_vec_ptr : src0_vec_ptr + 1;
bool iptr1_aligned = hexagon::is_addr_aligned(src1_vec_ptr);
HVX_Vector curr1 = iptr1_aligned ? prev1 : *src1_vec_ptr;
src1_vec_ptr = iptr1_aligned ? src1_vec_ptr : src1_vec_ptr + 1;
HVX_Vector s0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector s1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(s0, s1);
@ -134,13 +135,15 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
const size_t leftover_bytes = leftover * sizeof(npu_device_fp16_t);
if (leftover > 0) {
// handle the leftover elements
HVX_Vector curr0 =
(leftover_bytes + hexagon::unaligned_bytes(iptr0) > hexagon::kBytesPerVector) ? *iptr0 : prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr0 = (leftover_bytes + hexagon::unaligned_bytes(src0_vec_ptr) > hexagon::kBytesPerVector) ?
*src0_vec_ptr :
prev0;
curr0 = Q6_V_valign_VVR(curr0, prev0, (size_t) src0);
HVX_Vector curr1 =
(leftover_bytes + hexagon::unaligned_bytes(iptr1) > hexagon::kBytesPerVector) ? *iptr1 : prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_Vector curr1 = (leftover_bytes + hexagon::unaligned_bytes(src1_vec_ptr) > hexagon::kBytesPerVector) ?
*src1_vec_ptr :
prev1;
curr1 = Q6_V_valign_VVR(curr1, prev1, (size_t) src1);
HVX_VectorPair result = Q6_Wqf32_vmpy_VhfVhf(curr0, curr1);
@ -156,7 +159,7 @@ inline float vec_dot_product_f16_f16(const npu_device_fp16_t * src0, const npu_d
}
}
return vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
return hexagon::vec_reduction_f32(Q6_Vqf32_vadd_Vqf32Vqf32(sum_hi, sum_lo));
}
template <typename T> struct get_data_type {};
@ -208,70 +211,118 @@ void mul_mat_impl(hexagon::tensor * src0, hexagon::tensor * src1, hexagon::tenso
}
// cache the src0 plane in VTCM
const size_t src0_plane_row_count = start_end_element.second - start_end_element.first;
size_t src0_plane_cache_size = 0;
uint8_t * src0_plane_cache_ptr = nullptr;
const uint8_t * last_cached_plane_ptr = nullptr;
size_t src0_plane_slice_row_count = start_end_element.second - start_end_element.first;
size_t src0_plane_cache_size = 0;
uint8_t * src0_plane_cache_ptr = nullptr;
const uint8_t * last_cached_plane_ptr = nullptr;
bool is_mem_cache = false;
if (is_quantized) {
src0_plane_cache_size = src0_actual_row_size * src0_plane_row_count;
src0_plane_cache_ptr = params->get_cache(src0_plane_cache_size, is_quantized);
src0_plane_slice_row_count =
std::min(params->vtcm_quota_size / src0_actual_row_size, src0_plane_slice_row_count);
src0_plane_cache_size = src0_actual_row_size * src0_plane_slice_row_count;
src0_plane_cache_ptr = params->get_vtcm_cache(src0_plane_cache_size);
if (src0_plane_cache_ptr == nullptr) {
DEVICE_LOG_DEBUG(
"mul_mat_impl: failed to get VTCM cache for src0, size: %zu, src0_plane_slice_row_count: %zu, "
"src0_actual_row_size: %zu, will fallback to mem cache\n",
src0_plane_cache_size, src0_plane_slice_row_count, src0_actual_row_size);
src0_plane_cache_ptr = params->get_mem_cache(src0_plane_cache_size);
is_mem_cache = true;
}
}
DEVICE_LOG_DEBUG("mul_mat_impl src0_actual_row_size: %zu, is_quantized: %d, vtcm_mem: %p(%zu)\n",
src0_actual_row_size, is_quantized, (void *) src0_plane_cache_ptr, src0_plane_cache_size);
DEVICE_LOG_DEBUG(
"mul_mat_impl src0_actual_row_size: %zu, src0_plane_slice_row_count: %zu, is_quantized: %d, vtcm_mem: "
"%p(%zu)\n",
src0_actual_row_size, src0_plane_slice_row_count, is_quantized, (void *) src0_plane_cache_ptr,
src0_plane_cache_size);
const size_t valid_row_bytes = src1->get_ne(0) * sizeof(data_type);
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_WITH_SUB_PROC(dst, params->tidx, dequant);
for (int64_t ip = start_end_plane.first; ip < start_end_plane.second; ip++) {
const auto i3 = ip / dst->get_ne(2);
const auto i2 = ip - i3 * dst->get_ne(2);
const auto * src0_plane = src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) +
start_end_element.first * src0->get_nb(1);
const auto * src1_plane = src1_ptr + i3 * src1->get_nb(3) + i2 * src1->get_nb(2);
auto * dst_plane = dst_ptr + i3 * dst->get_nb(3) + i2 * dst->get_nb(2);
for (int64_t col_idx = start_end_element.first; col_idx < start_end_element.second;
col_idx += src0_plane_slice_row_count) {
const auto * src0_plane =
src0_ptr + i3 / r03 * src0->get_nb(3) + i2 / r02 * src0->get_nb(2) + col_idx * src0->get_nb(1);
if (src0_plane_cache_ptr) {
if (last_cached_plane_ptr != src0_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
if (src0_plane_cache_ptr) {
if (last_cached_plane_ptr != src0_plane) {
DEVICE_SCOPED_OP_PERFORMANCE_TRACKER_ADD_SUB_PROC(dequant);
for (int64_t ir = 0; ir < (int64_t) src0_plane_slice_row_count; ir++) {
auto * src0_row = src0_plane + ir * src0->get_nb(1);
if (ir + 1 < src0_plane_slice_row_count) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
}
for (int64_t ir = 0; ir < (int64_t) src0_plane_row_count; ir++) {
auto * src0_row = src0_plane + ir * src0->get_nb(1);
if (ir + 1 < src0_plane_row_count) {
hexagon::l2fetch_row(src0_row + src0->get_nb(1), src0->get_nb(1));
auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
params->f16_to_f32_table);
}
auto * dst_row = reinterpret_cast<float *>(src0_plane_cache_ptr + ir * src0_actual_row_size);
dequantize_row_func(src0_row, reinterpret_cast<float *>(dst_row), src0->get_ne(0),
params->f16_to_f32_table);
last_cached_plane_ptr = src0_plane;
}
last_cached_plane_ptr = src0_plane;
src0_plane = src0_plane_cache_ptr;
}
src0_plane = src0_plane_cache_ptr;
}
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + start_end_element.first;
for (int64_t i0 = 0; i0 < (int64_t) src0_plane_row_count; i0++) {
auto * src0_row = src0_plane + i0 * src0_actual_row_size;
if (i0 + 1 < src0_plane_row_count) {
if (!src0_plane_cache_ptr) {
hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
for (int64_t i1 = start_end_row.first; i1 < start_end_row.second; i1++) {
auto * src1_row = src1_plane + i1 * src1->get_nb(1);
auto * dst_row = reinterpret_cast<float *>(dst_plane + i1 * dst->get_nb(1)) + col_idx;
for (int64_t i0 = 0; i0 < (int64_t) src0_plane_slice_row_count; i0++) {
auto * src0_row = src0_plane + i0 * src0_actual_row_size;
if (i0 + 1 < src0_plane_slice_row_count) {
if (!src0_plane_cache_ptr || is_mem_cache) {
hexagon::l2fetch_row(src0_row + src0_actual_row_size, valid_row_bytes);
}
} else if (ip + 1 < start_end_plane.second) {
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
} else if (ip + 1 < start_end_plane.second) {
hexagon::l2fetch_row(src1_row + src1->get_nb(1), valid_row_bytes);
}
// TODO: figure out how to handle an entire row
dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
// TODO: figure out how to handle an entire row
dst_row[i0] = _DotFunc(reinterpret_cast<const data_type *>(src0_row),
reinterpret_cast<const data_type *>(src1_row), (size_t) src0->get_ne(0));
}
}
}
}
}
bool is_quantized_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1) {
if (src1.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
const auto type_traits = hexagon::get_type_traits(src0.type);
if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return false;
}
if (src0.ne[0] % type_traits.blck_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is not aligned: %ld\n", hexagon::get_type_name(src0.type),
(long) src0.ne[0]);
return false;
}
const auto vtcm_thread_quota_size = hexagon::vtcm_mem::get_total_size() / hexagon::kMaxThreadCount;
if (src0.ne[0] * sizeof(hexagon::dequantized_element_type) > vtcm_thread_quota_size) {
DEVICE_LOG_DEBUG("[MUL_MAT]src0.type(%s) ne[0] is too large: %ld, vtcm_thread_quota_size: %zu\n",
hexagon::get_type_name(src0.type), (long) src0.ne[0], vtcm_thread_quota_size);
return false;
}
DEVICE_LOG_DEBUG("[MUL_MAT]supported quantized src0.type(%s) and src1.type(%s)\n",
hexagon::get_type_name(src0.type), hexagon::get_type_name(src1.type));
return true;
}
} // namespace
namespace hexagon {
@ -319,27 +370,9 @@ bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_
if (src0.type != src1.type) {
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
if (src1.type != NPU_DATA_TYPE_F32) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src1 is not F32\n", op_get_name(op),
get_type_name(src0.type), get_type_name(src1.type));
if (!is_quantized_mul_mat_supported(src0, src1)) {
return false;
}
const auto type_traits = get_type_traits(src0.type);
if (!type_traits.is_quantized || type_traits.dequantize_row == nullptr) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and src0 is not quantized\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));
return false;
}
if (src0.ne[0] % type_traits.blck_size) {
DEVICE_LOG_DEBUG("[%s]src0.type(%s) ne[0] is not aligned: %ld\n", op_get_name(op), get_type_name(src0.type),
(long) src0.ne[0]);
return false;
}
DEVICE_LOG_DEBUG("[%s]supported quantized src0.type(%s) and src1.type(%s)\n", op_get_name(op),
get_type_name(src0.type), get_type_name(src1.type));
#else
DEVICE_LOG_DEBUG("[%s]src0.type(%s) and src1.type(%s) mismatch and quantized tensors are not supported\n",
op_get_name(op), get_type_name(src0.type), get_type_name(src1.type));


@ -7,11 +7,6 @@
namespace hexagon {
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
inline size_t unaligned_bytes(const void * addr) {
return ((size_t) addr) & kAlignMask;
}
@ -43,6 +38,31 @@ inline float get_flt0_from_fltv(HVX_Vector vect) {
return cvt.f;
}
inline HVX_Vector vec_reduction_qf32(HVX_Vector sums) {
constexpr const size_t kFloatsPerVector = hexagon::kBytesPerVector / sizeof(float);
static_assert(kFloatsPerVector == 32 || kFloatsPerVector == 16, "kFloatsPerVector should be 16 or 32");
// TODO: do we have a better way to do the reduction?
switch (kFloatsPerVector) {
default:
case 32:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 16 * sizeof(float)));
// fallthrough
case 16:
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 8 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 4 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, 2 * sizeof(float)));
sums = Q6_Vqf32_vadd_Vqf32Vqf32(sums, Q6_V_vror_VR(sums, sizeof(float)));
break;
}
return sums;
}
inline float vec_reduction_f32(HVX_Vector sums) {
return hexagon::get_flt0_from_fltv(Q6_Vsf_equals_Vqf32(vec_reduction_qf32(sums)));
}
bool mul_mat_f32(tensor * out, compute_params * params);
bool is_mul_mat_supported(const npu_device_tensor_spec & src0, const npu_device_tensor_spec & src1,
const npu_device_tensor_spec & dst, npu_device_tensor_op op);


@ -1,5 +1,7 @@
#pragma once
#include <hexagon_types.h>
#include <algorithm>
#include <cstdint>
#include <memory>
@ -15,26 +17,25 @@ namespace hexagon {
struct compute_params {
const size_t tidx;
const size_t tcnt;
const size_t vtcm_quota_size;
const float * f16_to_f32_table;
std::unique_ptr<hexagon::vtcm_mem> vtcm_cache;
std::unique_ptr<uint8_t[]> mem_cache;
size_t mem_cache_size = 0;
uint8_t * get_cache(size_t size, bool fallback_to_mem) {
uint8_t * get_vtcm_cache(size_t size) {
if (!vtcm_cache || vtcm_cache->get_size() < size) {
vtcm_cache = std::make_unique<hexagon::vtcm_mem>(size, false);
}
if (vtcm_cache->is_valid()) {
return vtcm_cache->get_mem();
}
if (!fallback_to_mem) {
DEVICE_LOG_DEBUG("vtcm_mem not valid, return nullptr\n");
if (!vtcm_cache->is_valid()) {
return nullptr;
}
DEVICE_LOG_DEBUG("vtcm_mem not valid, allocate from mem_cache\n");
return vtcm_cache->get_mem();
}
uint8_t * get_mem_cache(size_t size) {
if (!mem_cache || mem_cache_size < size) {
mem_cache = std::make_unique<uint8_t[]>(size + 256);
mem_cache_size = mem_cache ? size : 0;
@ -49,10 +50,31 @@ typedef bool (*op_is_supported_func_type)(const npu_device_tensor_spec & src0, c
const npu_device_tensor_spec & dst, npu_device_tensor_op op);
inline constexpr std::pair<int64_t, int64_t> get_thread_work_slice(int64_t total, size_t tidx, size_t tcnt) {
const auto elements_per_thread = (total + tcnt - 1) / tcnt;
const auto start = tidx * elements_per_thread;
const auto end = std::min<int64_t>(start + elements_per_thread, total);
return { start, end };
if (total <= 0 || tidx >= tcnt) {
return { 0, 0 }; // No work for this thread
}
const auto elements_per_thread = total / tcnt;
const auto remainder = total % tcnt;
int64_t start = 0;
int64_t end = 0;
if (tidx < remainder) {
// First 'remainder' threads get one extra item
start = tidx * (elements_per_thread + 1);
end = start + elements_per_thread + 1;
} else {
// Remaining threads get the base number of elements
start = remainder * (elements_per_thread + 1) + (tidx - remainder) * elements_per_thread;
end = start + elements_per_thread;
}
return { start, std::min(end, total) };
}
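Worked example for the new slicing (editorial note, not part of the commit): the old code gave every thread ceil(total / tcnt) elements, so 10 rows over 4 threads split as 3, 3, 3, 1; the remainder-aware version above yields 3, 3, 2, 2. A standalone sketch that mirrors the logic:

// Standalone mirror of get_thread_work_slice (illustrative sketch, not in the commit).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<int64_t, int64_t> slice(int64_t total, size_t tidx, size_t tcnt) {
    if (total <= 0 || tidx >= tcnt) {
        return { 0, 0 };
    }
    const int64_t base = total / (int64_t) tcnt;
    const int64_t rem  = total % (int64_t) tcnt;
    const int64_t start = (int64_t) tidx < rem ? (int64_t) tidx * (base + 1)
                                               : rem * (base + 1) + ((int64_t) tidx - rem) * base;
    const int64_t end = start + base + ((int64_t) tidx < rem ? 1 : 0);
    return { start, std::min(end, total) };
}

int main() {
    for (size_t t = 0; t < 4; ++t) {
        const auto s = slice(10, t, 4);
        printf("thread %zu: rows [%lld, %lld)\n", t, (long long) s.first, (long long) s.second);
    }
    return 0; // prints [0,3) [3,6) [6,8) [8,10): 3, 3, 2, 2 rows per thread
}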
constexpr const size_t kBytesPerVector = sizeof(HVX_Vector); // 128 for v73
constexpr const size_t kAlignMask = kBytesPerVector - 1;
constexpr const size_t kL2CacheSize = 8 * 1024; // 8KB L2 cache
constexpr const size_t kL2FetchAheadVectors = kL2CacheSize / kBytesPerVector;
} // namespace hexagon


@ -4,6 +4,8 @@
#include <array>
#include "op_types.hpp" // TODO: remove this include
static_assert(sizeof(npu_device_block_q4_K) ==
2 * sizeof(npu_device_fp16_t) + QUANT_K_SCALE_SIZE + QUANT_K_BLOCK_SIZE / 2,
"wrong q4_K block size/padding");
@ -16,14 +18,34 @@ static_assert(sizeof(npu_device_block_q8_0) == sizeof(npu_device_fp16_t) + QUANT
namespace {
inline float to_float(const npu_device_fp16_t src) {
union {
__fp16 f16;
npu_device_fp16_t u16;
} f16;
inline HVX_Vector vmemu(const void * unaligned_ptr) {
HVX_Vector ret = *reinterpret_cast<const HVX_UVector *>(unaligned_ptr);
return ret;
}
f16.u16 = src;
return f16.f16;
inline float to_float(const npu_device_fp16_t src) {
return reinterpret_cast<const __fp16 &>(src);
}
template <typename _TBlock> inline HVX_Vector load_block_generic(const _TBlock & src) {
uint8_t buffer[hexagon::kBytesPerVector];
static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
static_assert(sizeof(buffer) >= sizeof(src.qs), "wrong q4_0 block size/padding");
memcpy(&buffer[0], src.qs, sizeof(src.qs));
return *reinterpret_cast<HVX_UVector *>(buffer);
}
template <typename _TBlock> inline HVX_Vector load_dual_block_generic(const _TBlock & src1, const _TBlock & src2) {
uint8_t buffer[hexagon::kBytesPerVector];
static_assert(sizeof(buffer) == sizeof(HVX_Vector), "wrong cvt size/padding");
static_assert(sizeof(buffer) >= sizeof(src1.qs) * 2, "wrong q4_0 block size/padding");
memcpy(&buffer[0], src1.qs, sizeof(src1.qs));
memcpy(&buffer[sizeof(src1.qs)], src2.qs, sizeof(src2.qs));
return *reinterpret_cast<HVX_UVector *>(buffer);
}
inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m) {
@ -37,38 +59,78 @@ inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8_t * m)
}
void dequantize_row_q8_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q8_0 *>(src);
HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const float d = f16_to_f32_table[src_ptr[i].d];
const auto & src = src_ptr[i];
HVX_Vector d = Q6_Vh_vsplat_R(src.d);
for (int j = 0; j < qk; ++j) {
dst[i * qk + j] = src_ptr[i].qs[j] * d;
}
HVX_Vector q_lo = load_block_generic(src);
HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
}
}
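The static_assert above pins QUANT_BLOCK_SIZE to kBytesPerVector / sizeof(float), i.e. 32 elements per block on a 128-byte HVX vector, so each q8_0 block dequantizes into exactly one output vector. The per-element math is unchanged from the deleted loop (qs[j] * d); a scalar reference sketch with an assumed block layout, for comparison only (not part of the commit):

// Scalar reference for dequantize_row_q8_0 (illustrative sketch, not in the commit).
// The block layout below is assumed from the static_asserts; the real type is npu_device_block_q8_0.
#include <cstddef>
#include <cstdint>

struct q8_0_block_ref {
    uint16_t d;        // fp16 scale, raw bits (indexes f16_to_f32_table)
    int8_t   qs[32];   // 32 signed 8-bit quantized values
};

static void dequantize_row_q8_0_ref(const q8_0_block_ref * blocks, float * dst, size_t count,
                                    const float * f16_to_f32_table) {
    const size_t nb = count / 32;
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[blocks[i].d];   // same table lookup as the removed scalar code
        for (int j = 0; j < 32; ++j) {
            dst[i * 32 + j] = blocks[i].qs[j] * d;
        }
    }
}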
void dequantize_row_q4_0(const void * src, float * dst, size_t count, const float * f16_to_f32_table) {
constexpr const int qk = QUANT_BLOCK_SIZE;
static_assert(qk % 2 == 0, "qk must be even");
static_assert(QUANT_BLOCK_SIZE == hexagon::kBytesPerVector / sizeof(float));
constexpr const uint32_t kSizeOfQs = sizeof(npu_device_block_q4_0::qs);
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
const int nb = count / qk;
const auto * src_ptr = reinterpret_cast<const npu_device_block_q4_0 *>(src);
HVX_Vector mask = Q6_Vb_vsplat_R(0x0F);
HVX_Vector minus = Q6_Vb_vsplat_R(8);
HVX_UVector * out = ((HVX_UVector *) dst); // TODO: opt for aligned access
// TODO: use intrinsics
for (int i = 0; i < nb; i++) {
const float d = f16_to_f32_table[src_ptr[i].d];
const int loop_count = nb - (nb % 2);
for (int i = 0; i < loop_count; i += 2) {
const auto & src1 = src_ptr[i];
const auto & src2 = src_ptr[i + 1];
for (int j = 0; j < qk / 2; ++j) {
const int x0 = (src_ptr[i].qs[j] & 0x0F) - 8;
const int x1 = ((src_ptr[i].qs[j] >> 4) & 0xF) - 8;
HVX_Vector d1 = Q6_Vh_vsplat_R(src1.d);
HVX_Vector d2 = Q6_Vh_vsplat_R(src2.d);
d1 = Q6_V_valign_VVR(d1, Q6_V_vzero(), hexagon::kBytesPerVector / 2);
d1 = Q6_V_valign_VVR(d2, d1, hexagon::kBytesPerVector / 2);
HVX_Vector d = Q6_Vh_vshuff_Vh(d1);
dst[i * qk + j + 0] = x0 * d;
dst[i * qk + j + qk / 2] = x1 * d;
}
HVX_Vector q_lo = load_dual_block_generic(src1, src2);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
HVX_VectorPair q = Q6_W_vshuff_VVR(q_hi, Q6_V_vand_VV(q_lo, mask), kSizeOfQs);
q_lo = Q6_V_valign_VVR(Q6_V_lo_W(q), Q6_V_vzero(), hexagon::kBytesPerVector / 2);
q_lo = Q6_V_valign_VVR(Q6_V_hi_W(q), q_lo, hexagon::kBytesPerVector / 2);
q_lo = Q6_Vb_vshuff_Vb(q_lo);
q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);
q = Q6_Wh_vunpack_Vb(q_lo);
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[i] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
out[i + 1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(q));
}
if (loop_count < nb) {
const auto & curr_blk = src_ptr[nb - 1];
HVX_Vector d = Q6_Vh_vsplat_R(curr_blk.d);
HVX_Vector q_lo = load_block_generic(curr_blk);
HVX_Vector q_hi = Q6_Vub_vlsr_VubR(q_lo, 4);
q_lo = Q6_V_valign_VVR(Q6_V_vand_VV(q_lo, mask), Q6_V_vzero(), sizeof(curr_blk.qs));
q_lo = Q6_V_valign_VVR(q_hi, q_lo, hexagon::kBytesPerVector - sizeof(curr_blk.qs));
q_lo = Q6_Vb_vsub_VbVb(q_lo, minus);
HVX_VectorPair q = Q6_Wh_vunpack_Vb(q_lo);
q = Q6_Wh_vunpack_Vb(Q6_V_lo_W(q));
q_lo = Q6_Vhf_equals_Vh(Q6_V_lo_W(q));
q = Q6_Wqf32_vmpy_VhfVhf(q_lo, d);
out[nb - 1] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(q));
}
}
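The vector path above handles two q4_0 blocks per iteration (both fp16 scales shuffled into one half-float vector) plus a single-block tail, but the per-element math matches the deleted scalar loop: each byte packs two 4-bit values, offset by 8 and scaled by the block's d. A scalar reference sketch with an assumed block layout (illustration only, not part of the commit):

// Scalar reference for dequantize_row_q4_0 (illustrative sketch, not in the commit).
// Layout assumed: fp16 scale plus 16 packed bytes (32 4-bit values); the real type is npu_device_block_q4_0.
#include <cstddef>
#include <cstdint>

struct q4_0_block_ref {
    uint16_t d;        // fp16 scale, raw bits (indexes f16_to_f32_table)
    uint8_t  qs[16];   // 32 packed 4-bit values
};

static void dequantize_row_q4_0_ref(const q4_0_block_ref * blocks, float * dst, size_t count,
                                    const float * f16_to_f32_table) {
    const size_t nb = count / 32;
    for (size_t i = 0; i < nb; ++i) {
        const float d = f16_to_f32_table[blocks[i].d];
        for (int j = 0; j < 16; ++j) {
            const int x0 = (blocks[i].qs[j] & 0x0F) - 8;   // low nibble -> first half of the block
            const int x1 = (blocks[i].qs[j] >> 4) - 8;     // high nibble -> second half
            dst[i * 32 + j]      = x0 * d;
            dst[i * 32 + j + 16] = x1 * d;
        }
    }
}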


@ -23,13 +23,15 @@ inline bool is_quantized_type(npu_device_tensor_data_type type) {
return get_type_traits(type).is_quantized;
}
inline size_t get_dequantized_row_size(tensor * tensor) {
using dequantized_element_type = float;
inline size_t get_dequantized_row_size(const tensor * tensor) {
if (!is_quantized_type(tensor->get_type())) {
return tensor->get_nb(1); // for f32 and f16
}
auto row_elems_count = tensor->get_ne(0);
return row_elems_count * sizeof(float); // currently only f32 is supported
return row_elems_count * sizeof(dequantized_element_type); // currently only f32 is supported
}
inline const char * get_type_name(npu_device_tensor_data_type type) {


@ -8,7 +8,8 @@
namespace hexagon {
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxTensorSrc = DEVICE_TENSOR_MAX_SRC;
constexpr const size_t kMaxParamsCount = DEVICE_TENSOR_MAX_OP_PARAMS;
class tensor {
public:
@ -50,17 +51,17 @@ class tensor {
}
}
bool set_src(size_t index, tensor * src) {
if (index >= kMaxTensorSrc) {
return false;
void update_config(const npu_device_tensor_update_config & config) {
static_assert(sizeof(_op_params) == sizeof(config.params), "op params size mismatch");
_info.op = config.op;
memcpy(_op_params, config.params, sizeof(_op_params));
for (size_t i = 0; i < DEVICE_TENSOR_MAX_SRC; ++i) {
auto src_handle = config.src_handles[i];
_src[i] = (src_handle ? reinterpret_cast<tensor *>(src_handle) : nullptr);
}
_src[index] = src;
return true;
}
void set_op(npu_device_tensor_op op) { _info.op = op; }
tensor * get_src(size_t index) const {
if (index >= kMaxTensorSrc) {
return nullptr;
@ -77,6 +78,20 @@ class tensor {
npu_device_tensor_op get_op() const { return _info.op; }
template <typename _TyParam> const _TyParam get_op_param(size_t index) const {
static_assert(sizeof(_TyParam) <= sizeof(_op_params), "_op_param type size exceeds op params size");
if (sizeof(_TyParam) * (index + 1) >= sizeof(_op_params)) {
return 0;
}
return reinterpret_cast<const _TyParam *>(_op_params)[index];
}
const int32_t * get_op_params() const { return _op_params; }
const size_t get_op_param_count() const { return kMaxParamsCount; }
npu_device_tensor_data_type get_type() const { return _info.type; }
const uint8_t * get_read_buffer() const {
@ -89,9 +104,10 @@ class tensor {
bool is_valid() const { return _data != nullptr; }
private:
npu_device_tensor_config _info;
tensor * _src[kMaxTensorSrc] = {};
uint8_t * _data = nullptr;
npu_device_tensor_config _info = {};
int32_t _op_params[kMaxParamsCount] = {};
tensor * _src[kMaxTensorSrc] = {};
uint8_t * _data = nullptr;
DISABLE_COPY_AND_MOVE(tensor);
};


@ -12,7 +12,7 @@
namespace hexagon {
constexpr const size_t kMaxThreadCount = 4;
constexpr const size_t kDefaultStackSize = 1024 * 16; // 16KB
constexpr const size_t kDefaultStackSize = 1024 * 32; // 32KB
constexpr const unsigned long long kThreadTaskPendingBit = 1;
template <size_t _stack_size> class qurt_thread {
@ -80,7 +80,7 @@ using qurt_thread_ptr = std::unique_ptr<qurt_thread<kDefaultStackSize>>;
template <size_t _thread_count> class thread_pool {
static_assert(_thread_count > 1, "Thread count must be greater than 1");
constexpr const static size_t kMaxThreadCount = _thread_count - 1;
constexpr const static size_t kMaxSubThreadCount = _thread_count - 1;
public:
typedef qurt_thread<kDefaultStackSize> thread_type;
@ -88,9 +88,10 @@ template <size_t _thread_count> class thread_pool {
thread_pool() {
std::string thread_name_base = "thread_pool_";
qurt_barrier_init(&_pending, kMaxThreadCount + 1);
qurt_barrier_init(&_completed, kMaxThreadCount + 1);
for (size_t i = 0; i < kMaxThreadCount; ++i) {
qurt_barrier_init(&_pending, kMaxSubThreadCount + 1);
qurt_barrier_init(&_completed, kMaxSubThreadCount + 1);
const auto priority = qurt_thread_get_priority(qurt_thread_get_id());
for (size_t i = 0; i < kMaxSubThreadCount; ++i) {
auto & thread_arg = _thread_args[i];
thread_arg.pool = this;
thread_arg.thread_idx = i + 1;
@ -98,7 +99,7 @@ template <size_t _thread_count> class thread_pool {
auto thread = std::make_unique<thread_type>(
thread_name_base + std::to_string(i),
reinterpret_cast<thread_type::qurt_thread_func_type>(&thread_pool::thread_func_impl), &thread_arg,
QURT_THREAD_ATTR_PRIORITY_DEFAULT);
priority);
if (!thread->is_valid()) {
DEVICE_LOG_ERROR("Failed to create thread: %zu", i);
// destroy all barriers and threads at destructor
@ -107,7 +108,7 @@ template <size_t _thread_count> class thread_pool {
_threads[i] = std::move(thread);
}
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxThreadCount);
DEVICE_LOG_DEBUG("thread_pool.created: %zu", kMaxSubThreadCount);
}
~thread_pool() {
@ -133,7 +134,7 @@ template <size_t _thread_count> class thread_pool {
_arg = arg;
qurt_barrier_wait(&_pending);
task(this, 0, kMaxThreadCount + 1, arg);
task(this, 0, kMaxSubThreadCount + 1, arg);
DEVICE_LOG_DEBUG("main_thread.task_completed: 0");
qurt_barrier_wait(&_completed);
@ -166,7 +167,7 @@ template <size_t _thread_count> class thread_pool {
auto task = pool._task;
if (task) {
task(arg->pool, arg->thread_idx, kMaxThreadCount + 1, pool._arg);
task(arg->pool, arg->thread_idx, kMaxSubThreadCount + 1, pool._arg);
}
DEVICE_LOG_DEBUG("thread_func_impl.task_completed: %zu", arg->thread_idx);
@ -176,13 +177,13 @@ template <size_t _thread_count> class thread_pool {
DEVICE_LOG_DEBUG("thread_func_impl.end: %zu", arg->thread_idx);
}
std::atomic_bool _thread_exit = false;
std::array<qurt_thread_ptr, kMaxThreadCount> _threads;
thread_pool_arg _thread_args[kMaxThreadCount] = {};
qurt_barrier_t _pending = {};
qurt_barrier_t _completed = {};
task_type _task = nullptr;
void * _arg = nullptr;
std::atomic_bool _thread_exit = false;
std::array<qurt_thread_ptr, kMaxSubThreadCount> _threads;
thread_pool_arg _thread_args[kMaxSubThreadCount] = {};
qurt_barrier_t _pending = {};
qurt_barrier_t _completed = {};
task_type _task = nullptr;
void * _arg = nullptr;
DISABLE_COPY_AND_MOVE(thread_pool);
};


@ -1,7 +1,9 @@
#pragma once
#include <AEEStdDef.h>
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <HAP_power.h>
#include <cstdint>
#include <cstring>
@ -48,11 +50,114 @@ inline constexpr const char * op_get_name(npu_device_tensor_op op) {
return "SUB";
case NPU_OP_MUL:
return "MUL";
case NPU_OP_RMS_NORM:
return "RMS_NORM";
default:
return "UNKNOWN";
}
}
class power_utils {
public:
power_utils() {
_context_ptr = HAP_utils_create_context();
if (_context_ptr == nullptr) {
DEVICE_LOG_ERROR("Failed to create power context\n");
}
}
~power_utils() {
if (_context_ptr != nullptr) {
HAP_utils_destroy_context(_context_ptr);
}
}
unsigned int get_clock_speed_hz() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return 0;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_clk_Freq;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get clock speed: %d\n", ret);
return 0;
}
return response.clkFreqHz;
}
bool get_dvcs_enabled() const {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return false;
}
HAP_power_response_t response = {};
response.type = HAP_power_get_dcvsEnabled;
auto ret = HAP_power_get(_context_ptr, &response);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to get DVCS enabled: %d\n", ret);
return false;
}
return response.dcvsEnabled;
}
void set_dvcs_performance_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
HAP_power_request_t request = {};
request.type = HAP_power_set_DCVS_v3;
request.dcvs_v3.dcvs_enable = enable ? TRUE : FALSE;
if (enable) {
request.dcvs_v3.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE;
/*
* sleep_latency : To request for sleep latency in micro-seconds.
* Sleep latency is the minimum time before which the DSP sleeps
* Set latency to 65535 to reset it to the default value
*/
request.dcvs_v3.set_latency = TRUE;
request.dcvs_v3.latency = 1000;
request.dcvs_v3.set_bus_params = TRUE;
request.dcvs_v3.bus_params.min_corner = HAP_DCVS_VCORNER_SVS;
request.dcvs_v3.bus_params.max_corner = HAP_DCVS_VCORNER_TURBO;
request.dcvs_v3.bus_params.target_corner = HAP_DCVS_VCORNER_NOM;
}
auto ret = HAP_power_set(_context_ptr, &request);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set DVCS performance mode: %d\n", ret);
}
}
void set_sleep_mode(bool enable) {
if (!is_valid()) {
DEVICE_LOG_ERROR("Power context is not initialized\n");
return;
}
boolean sleep_disable = enable ? FALSE : TRUE;
auto ret = HAP_power_set_sleep_mode(_context_ptr, sleep_disable);
if (ret != AEE_SUCCESS) {
DEVICE_LOG_ERROR("Failed to set sleep mode: %d\n", ret);
}
}
bool is_valid() const { return _context_ptr != nullptr; }
private:
void * _context_ptr = nullptr;
DISABLE_COPY_AND_MOVE(power_utils);
};
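A possible call pattern for the new helper (editorial sketch based only on the methods shown above, not part of the commit): create it once, check that the HAP power context exists, then request performance-mode DCVS and keep the DSP awake during compute. Note that passing false to set_sleep_mode maps to sleep_disable = TRUE above.

// Possible use of hexagon::power_utils at backend init (illustrative sketch, not in the commit).
static void boost_npu_for_compute() {
    static hexagon::power_utils power;        // owns one HAP power context for the session
    if (!power.is_valid()) {
        return;                               // context creation failed, keep default clocks
    }
    power.set_dvcs_performance_mode(true);    // DCVS performance mode, SVS..TURBO corners as configured above
    power.set_sleep_mode(false);              // enable == false -> sleep_disable = TRUE -> DSP stays awake
}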
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
template <size_t _buffer_count> class npu_scoped_timer {


@ -47,7 +47,7 @@ class vtcm_mem {
DEVICE_LOG_DEBUG("VTCM released: %zu bytes at %p\n", _vtcm_size, _vtcm_mem);
}
bool is_valid() const { return _vtcm_mem != nullptr; }
bool is_valid() const { return _vtcm_mem != nullptr && _vtcm_size != 0; }
uint8_t * get_mem() const { return reinterpret_cast<uint8_t *>(_vtcm_mem); }


@ -177,7 +177,7 @@ std::shared_ptr<host_tensor> host_buffer::init_tensor(ggml_tensor * tensor, remo
auto ret = _allocator->fastrpc_mmap((int) _domain_id, _buffer_fd, _data, 0, _size, FASTRPC_MAP_FD);
if (ret != AEE_SUCCESS) {
LOG_ERROR("failed to mmap rpc memory, fd: %d, ret: %d\n", _buffer_fd, ret);
LOG_ERROR("failed to mmap rpc memory, fd: %d, size: %zu, ret: %d\n", _buffer_fd, _size, ret);
return std::shared_ptr<host_tensor>();
}


@ -1,5 +1,6 @@
#include "graph.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
namespace hexagon {
@ -28,8 +29,12 @@ bool host_graph::update(ggml_cgraph * cgraph) {
return false;
}
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]update, handle(%p)", (void *) this, (void *) _graph_handle);
_tensor_handles.clear();
_tensor_update_configs.clear();
_tensor_handles.reserve(cgraph->n_nodes);
_tensor_update_configs.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
auto * node = cgraph->nodes[i];
if (node->op == GGML_OP_NONE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE ||
@ -40,28 +45,38 @@ bool host_graph::update(ggml_cgraph * cgraph) {
continue;
}
// TODO: move to tensor?
auto * tensor_obj = host_tensor::from_ggml_tensor(node);
if (!tensor_obj) {
LOG_DEBUG("Unable to get host tensor from ggml tensor: %p\n", (void *) node);
continue;
}
tensor_obj->set_op(node->op);
_tensor_handles.push_back(tensor_obj->get_device_tensor_handle());
LOG_DEBUG("node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", i, ggml_get_name(node), ggml_op_desc(node),
(void *) node, ggml_type_name(node->type), (void *) tensor_obj->get_device_tensor_handle());
for (size_t j = 0; j < GGML_MAX_SRC && node->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(node->src[j]);
tensor_obj->set_src(j, src);
}
_tensor_update_configs.push_back(tensor_obj->update_hosts_params_only(node));
LOG_DEBUG("[%p]node[%d]%s(%s), addr: %p, type: %s, tensor_handle: %p\n", (void *) this, i, ggml_get_name(node),
ggml_op_desc(node), (void *) node, ggml_type_name(node->type),
(void *) tensor_obj->get_device_tensor_handle());
}
LOG_DEBUG("host_graph::update, host_graph(%p), handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
(void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
if (!_tensor_handles.empty()) {
npu_device_graph_set_tensor(_device_handle, _graph_handle, _tensor_handles.data(),
(int) _tensor_handles.size());
GGML_ASSERT(_tensor_handles.size() == _tensor_update_configs.size());
constexpr const npu_device_tensor_handle_t kEmptyTensorHandle = 0;
constexpr const npu_device_tensor_update_config kEmptyUpdateConfig = {};
auto ret = npu_device_graph_set_tensor_with_param(
_device_handle, _graph_handle, _tensor_handles.size() ? _tensor_handles.data() : &kEmptyTensorHandle,
(int) _tensor_handles.size(),
_tensor_update_configs.size() ? _tensor_update_configs.data() : &kEmptyUpdateConfig,
(int) _tensor_update_configs.size());
if (ret != AEE_SUCCESS) {
LOG_ERROR("Failed to set tensors in host_graph: 0x%x\n", (int) ret);
return false;
}
LOG_DEBUG("[%p]host_graph::update, handle(%p), ggml_cgraph(%p), tensor count(%zu)\n", (void *) this,
(void *) _graph_handle, (void *) cgraph, _tensor_handles.size());
return true;
}
@ -71,6 +86,7 @@ bool host_graph::compute() {
return false;
}
SCOPED_PERFORMANCE_TRACKER("[hexagon-npu][%p]compute, handle(%p)", (void *) this, (void *) _graph_handle);
auto status = npu_device_graph_compute(_device_handle, _graph_handle);
if (status != AEE_SUCCESS) {
LOG_ERROR("Failed to compute host_graph: 0x%x\n", (int) status);

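Note on the update path above: the host now collects one device tensor handle and one update config per compute node into two parallel vectors and ships them to the NPU in a single RPC, falling back to a dummy element when the graph is empty. A minimal, self-contained sketch of that batching pattern, with stand-in types in place of the generated npu_device_* ones:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using tensor_handle_t = uint64_t;             // stand-in for npu_device_tensor_handle_t
    struct update_config { int32_t params[4]; };  // trimmed stand-in for npu_device_tensor_update_config

    // Stand-in for the generated stub: both sequences arrive as pointer/length pairs and
    // are expected to carry the same element count.
    int set_tensors_with_params(const tensor_handle_t * handles, int n_handles,
                                const update_config * configs, int n_configs) {
        std::printf("sending %d handles, %d configs\n", n_handles, n_configs);
        return 0;
    }

    int main() {
        std::vector<tensor_handle_t> handles;
        std::vector<update_config>   configs;
        // one entry per compute node, skipping no-op nodes just like host_graph::update does
        handles.push_back(1);
        configs.push_back({ { 0, 0, 0, 0 } });

        // mirror the kEmptyTensorHandle/kEmptyUpdateConfig fallback so the stub never sees null pointers
        static constexpr tensor_handle_t kEmptyHandle = 0;
        static constexpr update_config   kEmptyConfig = {};
        return set_tensors_with_params(handles.empty() ? &kEmptyHandle : handles.data(), (int) handles.size(),
                                       configs.empty() ? &kEmptyConfig : configs.data(), (int) configs.size());
    }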

@ -21,9 +21,10 @@ class host_graph {
bool compute();
private:
remote_handle64 _device_handle = 0;
npu_device_graph_handle_t _graph_handle = 0;
std::vector<npu_device_tensor_handle_t> _tensor_handles;
remote_handle64 _device_handle = 0;
npu_device_graph_handle_t _graph_handle = 0;
std::vector<npu_device_tensor_handle_t> _tensor_handles;
std::vector<npu_device_tensor_update_config> _tensor_update_configs;
DISABLE_COPY(host_graph);
DISABLE_MOVE(host_graph);


@ -151,7 +151,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
auto * src0 = op->src[0];
if (!src0) {
LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_name(op->op));
LOG_DEBUG("[%s]Unsupported inplace op: %s\n", get_name(), ggml_op_desc(op));
return false;
}
@ -168,7 +168,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
auto npu_op = op_to_npu_op(op->op);
if (npu_op == NPU_OP_COUNT) {
LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_name(op->op));
LOG_DEBUG("[%s]Unsupported op: %s\n", get_name(), ggml_op_desc(op));
return false;
}
@ -179,7 +179,7 @@ bool npu_device::supports_op_impl(const ggml_tensor * op) {
constexpr const auto get_spec = [](const ggml_tensor * tensor) -> npu_device_tensor_spec {
if (!tensor) {
return npu_device_tensor_spec{};
return npu_device_tensor_spec{ {}, NPU_DATA_TYPE_COUNT };
}
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");


@ -1,5 +1,7 @@
#pragma once
#include <type_traits>
#include "common.hpp"
#include "ggml-impl.h"
#include "hexagon_npu.h"
@ -19,11 +21,15 @@ class host_tensor {
explicit host_tensor(ggml_tensor * tensor, int buffer_fd, uint64_t offset, remote_handle64 device_handle) :
_device_handle(device_handle) {
// TODO: figure out why the npu_device_tensor_config can't be larger than 100 bytes
static_assert(sizeof(npu_device_tensor_config) < 100, "npu_device_tensor_config size too large");
_info.buffer_fd = buffer_fd;
_info.offset = offset;
_info.type = type_to_npu_type(tensor->type);
_info.op = op_to_npu_op(tensor->op);
_info.size = ggml_nbytes(tensor);
// _info.op will be updated in update_params()
static_assert(DEVICE_TENSOR_MAX_DIMS == GGML_MAX_DIMS, "tensor dimensions mismatch");
static_assert(sizeof(_info.ne) == sizeof(tensor->ne), "tensor ne size mismatch");
@ -56,28 +62,96 @@ class host_tensor {
npu_device_tensor_handle_t get_device_tensor_handle() const { return _device_tensor_handle; }
void set_src(size_t index, host_tensor * src) {
if (index >= DEVICE_TENSOR_MAX_SRC) {
LOG_ERROR("host_tensor(%p) set_src[%zu] out of range\n", (void *) this, index);
void update_params(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(_ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
if (!_ggml_tensor) {
LOG_DEBUG("host_tensor(%p) _ggml_tensor is null\n", (void *) this);
return;
}
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, index, (void *) src);
npu_device_tensor_set_src(_device_handle, _device_tensor_handle, index, src->get_device_tensor_handle());
auto new_op = op_to_npu_op(_ggml_tensor->op);
bool params_changed = new_op != _info_update.op;
if (params_changed) {
LOG_DEBUG("host_tensor(%p) op changed: %s -> %s\n", (void *) this, get_npu_op_desc(_info.op),
get_npu_op_desc(new_op));
}
_info.op = new_op;
_info_update.op = new_op;
if (memcmp(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params)) != 0) {
params_changed = true;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
LOG_DEBUG("host_tensor(%p) op_params changed: [%x, %x, %x, %x]\n", (void *) this,
(int) _info_update.params[0], (int) _info_update.params[1], (int) _info_update.params[2],
(int) _info_update.params[3]);
}
npu_device_tensor_handle_t src_tensor_handles[DEVICE_TENSOR_MAX_SRC] = {};
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
src_tensor_handles[j] = src->get_device_tensor_handle();
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
}
static_assert(std::is_same<decltype(_info_update.src_handles), decltype(src_tensor_handles)>::value,
"src tensor handles type mismatch");
if (memcmp(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles)) != 0) {
params_changed = true;
memcpy(_info_update.src_handles, src_tensor_handles, sizeof(_info_update.src_handles));
LOG_DEBUG("host_tensor(%p) src changed, handles: [%p, %p]\n", (void *) this,
(void *) _info_update.src_handles[0], (void *) _info_update.src_handles[1]);
}
if (params_changed) {
npu_device_tensor_update_params(_device_handle, _device_tensor_handle, &_info_update);
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
} else {
LOG_DEBUG("host_tensor(%p) update_params, no changes, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
}
}
void set_op(ggml_op op) {
_info.op = op_to_npu_op(op);
npu_device_tensor_set_op(_device_handle, _device_tensor_handle, _info.op);
const npu_device_tensor_update_config & update_hosts_params_only(ggml_tensor * ggml_tensor) {
static_assert(sizeof(_info_update.params) <= sizeof(ggml_tensor->op_params),
"device tensor params size mismatch");
static_assert(DEVICE_TENSOR_MAX_SRC <= GGML_MAX_SRC, "device tensor src size mismatch");
GGML_ASSERT(ggml_tensor == _ggml_tensor);
auto new_op = op_to_npu_op(_ggml_tensor->op);
_info.op = new_op;
_info_update.op = new_op;
memcpy(_info_update.params, _ggml_tensor->op_params, sizeof(_info_update.params));
for (size_t j = 0; j < DEVICE_TENSOR_MAX_SRC && _ggml_tensor->src[j]; ++j) {
auto * src = host_tensor::from_ggml_tensor(_ggml_tensor->src[j]);
_info_update.src_handles[j] = src->get_device_tensor_handle();
LOG_DEBUG("host_tensor(%p) set_src[%zu]: %p\n", (void *) this, j, (void *) src);
}
LOG_DEBUG("host_tensor(%p) update_params, op: %s, params: [%x, %x, %x, %x]\n", (void *) this,
ggml_op_desc(_ggml_tensor), (int) _info_update.params[0], (int) _info_update.params[1],
(int) _info_update.params[2], (int) _info_update.params[3]);
return _info_update;
}
bool is_valid() const { return _device_tensor_handle != 0; }
private:
remote_handle64 _device_handle = 0;
npu_device_tensor_handle_t _device_tensor_handle = 0;
npu_device_tensor_config _info = {};
ggml_tensor * _ggml_tensor = nullptr;
remote_handle64 _device_handle = 0;
npu_device_tensor_handle_t _device_tensor_handle = 0;
npu_device_tensor_config _info = {};
npu_device_tensor_update_config _info_update = {};
ggml_tensor * _ggml_tensor = nullptr;
DISABLE_COPY(host_tensor);
DISABLE_MOVE(host_tensor);

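The update_params() path above avoids redundant device traffic: the host keeps the last pushed op, op_params and source handles in _info_update and only issues npu_device_tensor_update_params when one of them actually changed. A minimal, self-contained sketch of that dirty-checking idea (struct shape simplified; the real one is generated from the IDL):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct update_config {
        int32_t  op;
        int32_t  params[4];
        uint64_t src_handles[2];
    };

    // Returns true (and refreshes the cached copy) only when something changed,
    // i.e. only then would the caller pay for the RPC round-trip.
    bool refresh_if_changed(update_config & cached, const update_config & fresh) {
        bool changed = cached.op != fresh.op;
        changed = changed || std::memcmp(cached.params, fresh.params, sizeof(fresh.params)) != 0;
        changed = changed || std::memcmp(cached.src_handles, fresh.src_handles, sizeof(fresh.src_handles)) != 0;
        if (changed) {
            cached = fresh;
        }
        return changed;
    }

    int main() {
        update_config cached = {};
        update_config fresh  = { 1, { 0, 0, 0, 0 }, { 0x10, 0 } };
        std::printf("first push needed: %d\n", (int) refresh_if_changed(cached, fresh));   // prints 1
        std::printf("second push needed: %d\n", (int) refresh_if_changed(cached, fresh));  // prints 0
        return 0;
    }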

@ -25,11 +25,30 @@ enum npu_device_tensor_op op_to_npu_op(ggml_op op) {
return NPU_OP_SUB;
case GGML_OP_MUL:
return NPU_OP_MUL;
case GGML_OP_RMS_NORM:
return NPU_OP_RMS_NORM;
default:
return NPU_OP_COUNT;
}
}
const char * get_npu_op_desc(enum npu_device_tensor_op op) {
switch (op) {
case NPU_OP_MUL_MAT:
return ggml_op_name(GGML_OP_MUL_MAT);
case NPU_OP_ADD:
return ggml_op_name(GGML_OP_ADD);
case NPU_OP_SUB:
return ggml_op_name(GGML_OP_SUB);
case NPU_OP_MUL:
return ggml_op_name(GGML_OP_MUL);
case NPU_OP_RMS_NORM:
return ggml_op_name(GGML_OP_RMS_NORM);
default:
return "UNKNOWN";
}
}
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:


@ -5,6 +5,7 @@
namespace hexagon {
enum npu_device_tensor_op op_to_npu_op(ggml_op op);
const char * get_npu_op_desc(enum npu_device_tensor_op op);
enum npu_device_tensor_data_type type_to_npu_type(ggml_type type);
// TODO: merge with qcom_htp_arch


@ -4,6 +4,7 @@
const uint32_t DEVICE_TENSOR_MAX_DIMS = 4;
const uint32_t DEVICE_TENSOR_MAX_SRC = 2;
const uint32_t DEVICE_TENSOR_MAX_OP_PARAMS = 4;
const uint32_t QUANT_BLOCK_SIZE = 32;
const uint32_t QUANT_K_BLOCK_SIZE = 256;
const uint32_t QUANT_K_SCALE_SIZE = 12;
@ -38,6 +39,7 @@ interface npu_device : remote_handle64{
NPU_OP_ADD,
NPU_OP_SUB,
NPU_OP_MUL,
NPU_OP_RMS_NORM,
NPU_OP_COUNT
};
@ -55,6 +57,12 @@ interface npu_device : remote_handle64{
tensor_data_type type;
};
struct tensor_update_config {
tensor_op op;
int32_t params[DEVICE_TENSOR_MAX_OP_PARAMS];
tensor_handle_t src_handles[DEVICE_TENSOR_MAX_SRC];
};
struct tensor_config {
ne_type ne;
uint64_t nb[DEVICE_TENSOR_MAX_DIMS];
@ -82,15 +90,9 @@ interface npu_device : remote_handle64{
rout tensor_handle_t tensor_handle
);
AEEResult tensor_set_src(
AEEResult tensor_update_params(
in tensor_handle_t tensor_handle,
in uint64_t index,
in tensor_handle_t src
);
AEEResult tensor_set_op(
in tensor_handle_t tensor_handle,
in tensor_op op
in tensor_update_config config
);
AEEResult tensor_free(
@ -106,6 +108,12 @@ interface npu_device : remote_handle64{
in sequence<tensor_handle_t> tensor_handles
);
AEEResult graph_set_tensor_with_param(
in graph_handle_t graph_handle,
in sequence<tensor_handle_t> tensor_handles,
in sequence<tensor_update_config> tensor_params
);
AEEResult graph_compute(
in graph_handle_t graph_handle
);

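For orientation, the sequence and struct parameters in the IDL above flatten into pointer/length pairs in the QAIC-generated C stubs; judging from the call sites in this change, the prototypes come out roughly as sketched below (stand-in typedefs, not the generated header):

    #include <cstdint>

    // Stand-ins so the sketch parses on its own; the real definitions live in the generated hexagon_npu.h.
    typedef int      AEEResult;
    typedef uint64_t remote_handle64;
    typedef uint64_t npu_device_graph_handle_t;
    typedef uint64_t npu_device_tensor_handle_t;
    struct npu_device_tensor_update_config;  // generated from tensor_update_config

    AEEResult npu_device_tensor_update_params(remote_handle64 h, npu_device_tensor_handle_t tensor_handle,
                                              const npu_device_tensor_update_config * config);

    AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 h, npu_device_graph_handle_t graph_handle,
                                                     const npu_device_tensor_handle_t * tensor_handles,
                                                     int tensor_handles_len,
                                                     const npu_device_tensor_update_config * tensor_params,
                                                     int tensor_params_len);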

@ -1,5 +1,5 @@
#include "profiler.hpp"
#include "event_tracer.hpp"
#include <HTP/QnnHtpProfile.h>
#include <QnnProfile.h>


@ -0,0 +1,45 @@
#pragma once
#include <QnnCommon.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "logger.hpp"
#include "profiler.hpp"
#include "qnn-types.hpp"
namespace qnn {
// forward declaration of qnn_interface
class qnn_interface;
class qnn_event_tracer {
public:
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
~qnn_event_tracer();
Qnn_ProfileHandle_t get_handle() const { return _handle; }
void print_profile_events();
private:
std::shared_ptr<qnn_interface> _interface;
Qnn_ProfileHandle_t _handle = nullptr;
std::string _prefix;
DISABLE_COPY(qnn_event_tracer);
DISABLE_MOVE(qnn_event_tracer);
};
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
} // namespace qnn


@ -4,10 +4,10 @@
#include <algorithm>
#include <unordered_map>
#include "event_tracer.hpp"
#include "ggml-impl.h"
#include "logger.hpp"
#include "op-config.hpp"
#include "profiler.hpp"
#include "tensor.hpp"
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
@ -411,8 +411,8 @@ bool qnn_graph::build_graph_from_ggml_graph(const ggml_cgraph * cgraph) {
GGML_TYPE_COUNT > GGML_TYPE_Q8_0 && GGML_TYPE_Q8_0 > GGML_TYPE_F16 && GGML_TYPE_F16 > GGML_TYPE_F32,
"GGML_TYPE enum order is not correct");
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
_graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]build_graph_from_ggml_graph", get_backend_name(_device),
_graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
@ -466,8 +466,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
ggml_tensor_array_t inputs;
ggml_tensor_array_t outputs;
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device),
_graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]get_io_tensors_from_graph", get_backend_name(_device), _graph_name.c_str());
#ifdef NDEBUG
get_io_tensors_from_graph(cgraph, inputs, outputs);
#else
@ -478,7 +477,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]bind_tensors", get_backend_name(_device), _graph_name.c_str());
auto override_data_type = get_override_data_type(inputs, outputs);
if (override_data_type != GGML_TYPE_COUNT) {
QNN_LOG_DEBUG("[%s][%s]override_data_type: %s\n", get_backend_name(_device), _graph_name.c_str(),
@ -502,7 +501,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
{
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]execute", get_backend_name(_device), _graph_name.c_str());
auto & qnn_tensor_inputs = _qnn_tensor_inputs;
auto & qnn_tensor_outputs = _qnn_tensor_outputs;
auto error = _qnn_interface->qnn_graph_execute(_graph_handle, qnn_tensor_inputs.data(),
@ -529,7 +528,7 @@ bool qnn_graph::execute(const ggml_cgraph * cgraph, std::shared_ptr<qnn_convert_
}
bool qnn_graph::finalize() {
QNN_SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
SCOPED_PERFORMANCE_TRACKER("[%s][%s]finalize", get_backend_name(_device), _graph_name.c_str());
if (!qnn::add_op_to_graph(_graph_handle, _operations)) {
QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str());


@ -6,9 +6,9 @@
#include <vector>
#include "convert.hpp"
#include "event_tracer.hpp"
#include "ggml-qnn.h"
#include "op-config.hpp"
#include "profiler.hpp"
#include "qnn-lib.hpp"
namespace qnn {


@ -1,100 +0,0 @@
#pragma once
#include <QnnCommon.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "logger.hpp"
#include "qnn-types.hpp"
namespace qnn {
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
class qnn_scoped_timer {
public:
qnn_scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) {
_begin_us = ggml_time_us();
}
qnn_scoped_timer(qnn_scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
~qnn_scoped_timer() { print(); }
void operator=(qnn_scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
void print() const {
auto duration = (ggml_time_us() - _begin_us) / 1000.0;
QNN_LOG_INFO("[profiler]%s, duration: %.4f ms\n", _log_prefix.c_str(), duration);
}
private:
int64_t _begin_us = 0LL;
std::string _log_prefix;
qnn_scoped_timer(const qnn_scoped_timer &) = delete;
void operator=(const qnn_scoped_timer &) = delete;
};
inline qnn_scoped_timer make_scope_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[4096];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return qnn_scoped_timer(buffer);
}
#else
inline void make_scope_perf_timer(const char *, ...) {}
#endif
// forward declaration of qnn_interface
class qnn_interface;
class qnn_event_tracer {
public:
// ref:
// https://github.com/pytorch/executorch/blob/ae3d558d5e6aa04fc52a3065399fe6a773702f52/backends/qualcomm/serialization/qc_schema.py#L53
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/overview.html#supported-snapdragon-devices
enum sdk_profile_level { PROFILE_OFF = 0, PROFILE_BASIC, PROFILE_DETAIL, PROFILE_OP_TRACE };
explicit qnn_event_tracer(const std::string & prefix, std::shared_ptr<qnn_interface> interface,
Qnn_BackendHandle_t backend_handle, sdk_profile_level level);
~qnn_event_tracer();
Qnn_ProfileHandle_t get_handle() const { return _handle; }
void print_profile_events();
private:
std::shared_ptr<qnn_interface> _interface;
Qnn_ProfileHandle_t _handle = nullptr;
std::string _prefix;
DISABLE_COPY(qnn_event_tracer);
DISABLE_MOVE(qnn_event_tracer);
};
using qnn_event_tracer_ptr = std::shared_ptr<qnn_event_tracer>;
} // namespace qnn
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto __qnn_timer_##__LINE__ = qnn::make_scope_perf_timer(fmt, __VA_ARGS__)
#else
# define QNN_SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif


@ -34,21 +34,36 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
kQnnCpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL, (1L << GGML_TYPE_I8) | (1L << GGML_TYPE_F32),
0xFFFFFE, // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
0, // 0 for no limitation
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the CPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE,
#else
0,
#endif
0, // 0 for no limitation
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
kQnnGpuLibName, GGML_BACKEND_DEVICE_TYPE_GPU, (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
// all quantized types can be offloaded to the GPU backend; in the current implementation they are dequantized into float32 on the CPU
0xFFFFFE, (128256L * 4096 *
0xFFFFFE,
#else
0,
#endif
(128256L * 4096 *
sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
},
{
// https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
kQnnNpuLibName, GGML_BACKEND_DEVICE_TYPE_ACCEL,
#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
(1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
(1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
#else
0,
#endif
(8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value
},
};

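The capability table above now gates the quantized-type mask behind GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS. A small sketch of how such per-type bitmasks are typically consulted (field names here are illustrative, not the actual qnn::device_caps layout):

    #include <cstdint>

    struct device_caps_stub {
        uint64_t supported_types;        // bit i set => ggml type i is handled natively
        uint64_t supported_quant_types;  // bit i set => quantized ggml type i can be offloaded
    };

    inline bool type_supported(const device_caps_stub & caps, int ggml_type_index) {
        const uint64_t bit = 1ULL << ggml_type_index;
        return ((caps.supported_types | caps.supported_quant_types) & bit) != 0;
    }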

@ -45,6 +45,10 @@ size_t get_system_free_memory_in_bytes();
class_name(class_name &&) = delete; \
void operator=(class_name &&) = delete
#define DISABLE_COPY_AND_MOVE(class_name) \
DISABLE_COPY(class_name); \
DISABLE_MOVE(class_name)
#define LOG_ERROR(...) (GGML_LOG_ERROR(__VA_ARGS__))
#define LOG_WARN(...) (GGML_LOG_WARN(__VA_ARGS__))
#define LOG_INFO(...) (GGML_LOG_INFO(__VA_ARGS__))

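DISABLE_COPY_AND_MOVE simply stacks the two existing macros, matching how power_utils uses it earlier in this change. A usage sketch with an illustrative class name:

    #include "common.hpp"  // assumed home of the DISABLE_* macros in this backend

    class scoped_resource {
    public:
        scoped_resource() = default;

    private:
        DISABLE_COPY_AND_MOVE(scoped_resource);  // deletes copy constructor/assignment and move constructor/assignment
    };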

@ -0,0 +1,61 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <string>
#include "common.hpp"
#include "ggml-impl.h"
namespace profiler {
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
class scoped_timer {
public:
scoped_timer(const std::string & log_prefix) : _log_prefix(std::move(log_prefix)) { _begin_us = ggml_time_us(); }
scoped_timer(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
~scoped_timer() { print(); }
void operator=(scoped_timer && other) {
_begin_us = other._begin_us;
_log_prefix = std::move(other._log_prefix);
}
void print() const {
auto duration = ggml_time_us() - _begin_us;
GGML_LOG_INFO("[profiler]%s, dur: %lld us\n", _log_prefix.c_str(), (long long) duration);
}
private:
int64_t _begin_us = 0LL;
std::string _log_prefix;
DISABLE_COPY(scoped_timer);
};
inline scoped_timer make_scope_perf_timer(const char * format, ...) {
va_list args;
va_start(args, format);
char buffer[4096];
vsnprintf(buffer, sizeof(buffer), format, args);
va_end(args);
return scoped_timer(buffer);
}
#endif
} // namespace profiler
#ifdef GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) \
auto __scoped_timer_##__LINE__ = profiler::make_scope_perf_timer(fmt, __VA_ARGS__)
#else
# define SCOPED_PERFORMANCE_TRACKER(fmt, ...) ((void) 0)
#endif

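Usage is the same as the QNN-side tracker this replaces: declare the macro at the top of a scope and the elapsed time is logged when the scope exits. A small usage sketch, assuming profiler.hpp is included and GGML_HEXAGON_ENABLE_PERFORMANCE_TRACKING is defined (the function here is illustrative):

    void compute_once(void * graph_handle) {
        // the underlying profiler::scoped_timer lives until the closing brace, then logs its duration
        SCOPED_PERFORMANCE_TRACKER("[hexagon-npu]compute_once, handle(%p)", graph_handle);
        // ... work being measured ...
    }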

@ -64,8 +64,10 @@ class rpc_mem {
void * buf = nullptr;
if (_rpc_interface->is_alloc2_available()) {
LOG_DEBUG("rpcmem_alloc2 available, using it\n");
buf = _rpc_interface->rpcmem_alloc2(heapid, flags, size);
} else {
LOG_DEBUG("rpcmem_alloc2 not available, using rpcmem_alloc\n");
buf = _rpc_interface->rpcmem_alloc(heapid, flags, size);
}
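The allocator now probes for rpcmem_alloc2 and only falls back to the legacy rpcmem_alloc when it is unavailable. A minimal sketch of that probe-and-fallback pattern with stand-in function-pointer types (the real signatures come from the SDK's rpcmem headers):

    #include <cstddef>
    #include <cstdio>

    // Stand-ins for the two allocator entry points resolved from the RPC library at runtime.
    using alloc2_fn = void * (*)(int heapid, unsigned flags, size_t size);
    using alloc_fn  = void * (*)(int heapid, unsigned flags, int size);

    void * alloc_rpc_buffer(alloc2_fn alloc2, alloc_fn alloc, int heapid, unsigned flags, size_t size) {
        if (alloc2 != nullptr) {
            std::printf("rpcmem_alloc2 available, using it\n");
            return alloc2(heapid, flags, size);   // size_t-sized allocations
        }
        std::printf("rpcmem_alloc2 not available, using rpcmem_alloc\n");
        return alloc(heapid, flags, (int) size);  // legacy path, int-sized
    }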