diff --git a/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp
index e559cfdb28..4a13f3ec05 100644
--- a/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp
+++ b/ggml/src/ggml-qnn/qnn/ggml-qnn.cpp
@@ -283,8 +283,11 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
                  qnn::get_backend_desc(dev_ctx->device));
         dev_ctx->description = buffer;
     }
+
+#ifdef GGML_HEXAGON_ENABLE_QUANTIZED_TENSORS
     // TODO: remove npu from here if hardware quantization is supported
     dev_ctx->enable_cpu_dequantize = device == QNN_BACKEND_CPU;
+#endif
 
     ggml_backend_t qnn_backend = new ggml_backend{
         /* .guid      = */ ggml_backend_qnn_guid(),
diff --git a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp
index e32bab5f92..7dbcaf968e 100644
--- a/ggml/src/ggml-qnn/qnn/qnn-lib.cpp
+++ b/ggml/src/ggml-qnn/qnn/qnn-lib.cpp
@@ -38,7 +38,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         // all quantized types can be offload to CPU, at current implementation, those types will be dequantized into float32 on cpu
         0xFFFFFE,
 #else
-        0,
+        (1L << GGML_TYPE_F32),
 #endif
         0,  // 0 for no limitation
     },
@@ -50,7 +50,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         // all quantized types can be offload to GPU, at current implementation, those types will be dequantized into float32 on cpu
         0xFFFFFE,
 #else
-        0,
+        (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
 #endif
         (128256L * 4096 * sizeof(float)),  // tested on 8 gen 2, failed to allocate tensor with size 128256x4096 and float32
     },
@@ -62,7 +62,7 @@ constexpr const qnn::device_caps kDeviceCaps[] = {
         (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16) | (1L << GGML_TYPE_I16),
         (1L << GGML_TYPE_Q2_K) | (1L << GGML_TYPE_Q3_K) | (1L << GGML_TYPE_Q4_K) | (1L << GGML_TYPE_Q8_K),
 #else
-        0,
+        (1L << GGML_TYPE_F32) | (1L << GGML_TYPE_F16),
 #endif
         (8192L * 2048 + 8192 * 512 + 2048 * 512) * sizeof(float),  // TODO: should have a better way to get this value
     },
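
For context on what the replaced values mean: each qnn::device_caps entry encodes the ggml types a backend accepts as a bitmask, with one bit per ggml_type enum value, so (1L << GGML_TYPE_F32) sets the bit for float32, and 0xFFFFFE sets bits 1 through 23 (the quantized types, per the comment in the diff). The actual field names of qnn::device_caps are not visible in this diff; the standalone sketch below uses assumed names (supported_types, max_tensor_size, supports) purely to illustrate how such a mask is typically checked.

// Minimal standalone sketch, NOT the real qnn::device_caps layout:
// bit i of the mask corresponds to the ggml type whose enum value is i.
#include <cstdint>
#include <cstdio>

// Reproduced subset of the real ggml_type values (F32 = 0, F16 = 1) so the
// sketch compiles without the ggml headers.
enum : int { GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1 };

struct device_caps_sketch {
    uint64_t supported_types;  // assumed name: bitmask of accepted ggml types
    uint64_t max_tensor_size;  // assumed name: 0 means no limitation
};

// A device accepts a type when the corresponding bit is set in its mask.
static bool supports(const device_caps_sketch & caps, int ggml_type) {
    return (caps.supported_types & (1ULL << ggml_type)) != 0;
}

int main() {
    // Mirrors the GPU entry after this patch: F32 and F16 allowed, with the
    // allocation ceiling observed on the 8 Gen 2 device.
    device_caps_sketch gpu = {
        (1ULL << GGML_TYPE_F32) | (1ULL << GGML_TYPE_F16),
        128256ULL * 4096 * sizeof(float),
    };
    std::printf("F32: %d, F16: %d\n", supports(gpu, GGML_TYPE_F32), supports(gpu, GGML_TYPE_F16));
    return 0;
}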