qnn fix: update device capabilities for quantized types in qnn-lib to improve compatibility

hongruichen 2025-06-20 20:16:23 +08:00
parent af620a12f7
commit 332514cd5c
2 changed files with 6 additions and 3 deletions


@@ -283,8 +283,11 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
                   qnn::get_backend_desc(dev_ctx->device));
         dev_ctx->description = buffer;
     }
 
+#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
+    // TODO: remove npu from here if hardware quantization is supported
     dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU;
+#endif
 
     ggml_backend_t qnn_backend = new ggml_backend{
         /* .guid = */ ggml_backend_qnn_guid(),

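The effect of the first hunk: when quantized-tensor support is compiled in, only the CPU backend opts into dequantizing quantized tensors on the CPU, and the TODO notes that the NPU should stop relying on this path once hardware quantization works. Below is a minimal sketch of how such a flag could feed a support check; the QNN_BACKEND_* values and the enable_cpu_dequantize field come from the diff, but the context struct layout and the helper itself are assumptions, not code from this commit.

    // Sketch only: qnn_device_context_sketch and can_accept_quantized are
    // hypothetical; QNN_BACKEND_* and enable_cpu_dequantize appear in the diff.
    enum QNNBackend { QNN_BACKEND_CPU, QNN_BACKEND_GPU, QNN_BACKEND_NPU };

    struct qnn_device_context_sketch {
        QNNBackend device;
        bool       enable_cpu_dequantize = false;  // set only for QNN_BACKEND_CPU above
    };

    // A quantized type passes either because the device caps list it directly
    // or because the backend may dequantize it to float32 on the CPU first.
    static bool can_accept_quantized(const qnn_device_context_sketch & ctx, bool listed_in_caps) {
        return listed_in_caps || ctx.enable_cpu_dequantize;
    }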

@@ -38,7 +38,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
         0xFFFFFE,
 #else
-        0,
+        (1L << GGML_TYPE_F32),
 #endif
         0, // 0 for no limitation
     },
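A note on the mask encoding in kDeviceCaps: each bit position corresponds to a ggml_type enum value, so GGML_TYPE_F32 (value 0) is bit 0. 0xFFFFFE sets bits 1 through 23 and leaves bit 0 clear, which is what the "all quantized types" comment means in practice, and the replacement value (1L << GGML_TYPE_F32) advertises exactly F32 instead of nothing. A sketch of the membership test this convention implies (the helper is illustrative, not taken from the repository):

    #include <cstdint>
    #include "ggml.h"  // ggml_type, GGML_TYPE_* enum values

    // Bit i of the mask set <=> the ggml type with enum value i is supported.
    static bool type_in_mask(uint64_t mask, ggml_type type) {
        return (mask & (uint64_t(1) << type)) != 0;
    }

    // type_in_mask(1L << GGML_TYPE_F32, GGML_TYPE_F32) -> true
    // type_in_mask(0xFFFFFE, GGML_TYPE_F32)            -> false (bit 0 clear)
    // type_in_mask(0xFFFFFE, GGML_TYPE_Q4_0)           -> true  (bit 2 set)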
@@ -50,7 +50,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu
         0xFFFFFE,
 #else
-        0,
+        (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
 #endif
         (128256L * 4096 *
          sizeof(float)), // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
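For scale, the GPU size cap works out to just under 2 GiB, and 128256 x 4096 matches the token-embedding shape of a Llama-3-family model (128256 is that tokenizer's vocabulary size), presumably the tensor that failed on the 8 Gen 2:

    // 128256 * 4096 * sizeof(float) = 128256 * 4096 * 4
    //                               = 2,101,346,304 bytes  (~1.96 GiB)
    static_assert(128256L * 4096 * sizeof(float) == 2101346304ULL,
                  "GPU max-tensor-size cap from the hunk above");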
@@ -62,7 +62,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
         (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
 #else
-        0,
+        (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
 #endif
         (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float), // TODO: should have a better way to get this value
     },
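Putting the table to use: each entry pairs a supported-type mask with a per-tensor byte budget, where 0 means no limit. The sketch below shows how a backend could consult such an entry when deciding whether to offload a tensor; the field names and can_offload are assumptions, since the real qnn::device_caps layout is not visible in this diff.

    #include <cstdint>
    #include "ggml.h"

    struct device_caps_sketch {
        uint64_t supported_types;  // bit i set => ggml type i is supported
        uint64_t max_tensor_size;  // bytes; 0 means no limitation
    };

    static bool can_offload(const device_caps_sketch & caps, const ggml_tensor * t) {
        if ((caps.supported_types & (uint64_t(1) << t->type)) == 0) {
            return false;  // element type not handled by this device
        }
        // Respect the empirically determined allocation ceiling.
        return caps.max_tensor_size == 0 || ggml_nbytes(t) <= caps.max_tensor_size;
    }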