Add experimental ggml-hexagon backend for the Hexagon NPU (#16547)
* model: add support for extra bufs for all devices

* hexagon: add experimental ggml-hexagon backend for the Hexagon NPU

  This commit introduces a new experimental backend `ggml-hexagon` with support for the Hexagon NPU.

  Highlights:
  - Supports Hexagon versions: v73, v75, v79, and v81
  - Targets Android devices based on Snapdragon SoCs: Gen3, 8-Elite, and 8-Elite Gen5
  - Supports Q4_0, Q8_0, MXFP4, and FP32 data types
  - Implements core LLM ops: MUL_MAT/MUL_MAT_ID, ADD/SUB/MUL/ADD_ID, RMS_NORM, ROPE, GLU/SWIGLU, SOFTMAX

  **Note:** This backend is experimental and may exhibit instability or limited performance across supported devices. It is intended for early testing and feedback from the llama.cpp/ggml developer and user community.

  Co-Authored-By: Rajdeep Ganguly <rganguly@qti.qualcomm.com>
  Co-Authored-By: Todor Boinovski <todorb@qti.qualcomm.com>

* hexagon: fix format checker errors
* hexagon: update readme and cmake presets
* ci: add android-ndk-build jobs that build plain ARM64 and Snapdragon versions
* hexagon: add simple graph optimizer for stacking MUL_MAT ops with the same input
* hexagon: move ADB helper scripts into scripts/snapdragon/adb
* hexagon: replace all f/printfs with GGML_LOG_...
* readme: add hexagon to the list of supported backends
* hexagon: stack matmuls with quantized inputs only
* hexagon: add TODO for fixing issues in hexagon_graph_optimize
* hexagon: update to hex-sdk 6.4.0 and add scripts for running on QDC
* scripts: fix lint errors
* scripts: update qdc pytest script to make linter happy
* hexagon: add reduce sum in fp32
* hexagon: reduce number of vector stores in matmul output
* hexagon: remove the need for vdelta in reduce-multiply-x8
* hexagon: consistent use of reduce_sum_fp32 for row_sums
* hexagon: some more matmul optimizations and comments

  Optimize cases where tensor dims are not a multiple of 1024 (e.g. in Qwen models). We've handled those cases already, but at a higher overhead.

* hexagon: update cmake presets
* hexagon: add OPMASK support for run-bench.sh wrapper
* hexagon: update to use GGML_BACKEND_API
* hexagon: remove unused logic for setting tensor flags for the views
* hexagon: add asserts to set/get_tensor to make sure we handle complete tensors

  Same asserts as the CPU backend.

* hexagon: use cpy_tensor slow path for non-host buffers
* hexagon: error checks in the buffer allocator
* cmake: move include(extProj) under ggml-hexagon
* hexagon: don't forget to delete the backend on free
* hexagon: set/get_tensor size assert applies only to quantized tensors
* hexagon: reintroduce HEX_VERBOSE wrapper for GGML_LOG_DEBUG for now

  GGML_LOG_DEBUG is always enabled for test-backend-ops and the output gets in the way. Ideally we need somewhat finer log levels.

* docs: typos in hexagon developer docs (libggm-...)
* hexagon: overhaul error handling in the session/device allocation

  This should handle all failure paths in the session allocation.

* hexagon: update cmake presets to enable fp16 vectors
* hexagon: remove unused time_usec function
* hexagon: don't forget to release buffer contexts
* hexagon: fixed indents in hvx-utils (missed clang-format auto-format failure)
* hexagon: remove custom can_repeat function and use ggml_can_repeat

---------

Co-authored-by: Rajdeep Ganguly <rganguly@qti.qualcomm.com>
Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
This commit is contained in:
parent a2e0088d92
commit 63d2fc46e1
@@ -1305,6 +1305,81 @@ jobs:
          cd examples/llama.android

          ./gradlew build --no-daemon

  android-ndk-build:
    runs-on: ubuntu-latest

    env:
      OPENCL_VERSION: 2025.07.22

    strategy:
      matrix:
        include:
          - build: 'arm64-cpu'
            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_CURL=OFF -D GGML_OPENMP=OFF'
          - build: 'arm64-snapdragon'
            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Install OpenCL Headers and Libs
        id: install_opencl
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        run: |
          mkdir opencl
          curl -L -o opencl/clhpp.tar.gz https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          curl -L -o opencl/headers.tar.gz https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          tar -xaf opencl/headers.tar.gz -C opencl
          tar -xaf opencl/clhpp.tar.gz -C opencl
          tar -xaf opencl/icd-loader.tar.gz -C opencl
          sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
          sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
          cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
          cmake --build build
          sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
          rm -rf opencl

      - name: Install Hexagon SDK
        id: install_hexsdk
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        env:
          HEXSDK_VER: 6.4.0.2
          HEXTLS_VER: 19.0.04
        run: |
          curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
          mkdir hex-sdk
          tar -xaf hex-sdk.tar.gz -C hex-sdk
          ls -l hex-sdk
          sudo mv hex-sdk /opt/hexagon
          echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER" >> "$GITHUB_ENV"
          echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER" >> "$GITHUB_ENV"
          echo "DEFAULT_HLOS_ARCH=64" >> "$GITHUB_ENV"
          echo "DEFAULT_TOOLS_VARIANT=toolv19" >> "$GITHUB_ENV"
          echo "DEFAULT_NO_QURT_INC=0" >> "$GITHUB_ENV"
          echo "DEFAULT_DSP_ARCH=v73" >> "$GITHUB_ENV"

      - name: Update CMake presets
        id: update_presets
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        run: |
          cp docs/backend/hexagon/CMakeUserPresets.json .

      - name: Build
        id: ndk_build
        run: |
          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

      - name: Test
        id: cmake_test
        run: |
          echo "FIXME: test on devices"

  openEuler-latest-cmake-cann:
    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
    defaults:
@@ -65,6 +65,7 @@
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
@@ -280,6 +280,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |

## Obtaining and quantizing models
@@ -0,0 +1,49 @@
{
    "version": 4,
    "configurePresets": [
        {
            "name": "arm64-android-snapdragon",
            "hidden": true,
            "architecture": { "value": "arm64", "strategy": "external" },
            "toolset": { "value": "host=x86_64", "strategy": "external" },
            "cacheVariables": {
                "ANDROID_ABI": "arm64-v8a",
                "ANDROID_PLATFORM": "android-31",
                "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
                "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
                "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
                "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG",
                "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG",
                "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
                "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
                "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
                "PREBUILT_LIB_DIR": "android_aarch64",
                "GGML_OPENMP": "OFF",
                "GGML_LLAMAFILE": "OFF",
                "GGML_OPENCL": "ON",
                "GGML_HEXAGON": "ON",
                "LLAMA_CURL": "OFF"
            }
        },

        {
            "name": "arm64-windows-snapdragon",
            "inherits": [ "base", "arm64-windows-llvm" ],
            "cacheVariables": {
                "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
                "PREBUILT_LIB_DIR": "windows_aarch64",
                "GGML_OPENMP": "OFF",
                "GGML_LLAMAFILE": "OFF",
                "GGML_OPENCL": "ON",
                "GGML_HEXAGON": "ON",
                "LLAMA_CURL": "OFF"
            }
        },

        { "name": "arm64-android-snapdragon-debug"  , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
        { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },

        { "name": "arm64-windows-snapdragon-debug"  , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
        { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
    ]
}
@@ -0,0 +1,239 @@
# Snapdragon-based Android devices

## How to Build

The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.

This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.

```
~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
[d]/> cd /workspace
```

The rest of the Android build process assumes that you're running inside the toolchain container.
Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:

```
[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .

[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
Preset CMake variables:
  ANDROID_ABI="arm64-v8a"
  ...
  CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
  GGML_HEXAGON="ON"
  GGML_OPENCL="ON"
  GGML_OPENMP="OFF"
  HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
  ...
-- Including OpenCL backend
-- Including Hexagon backend
...
-- Build files have been written to: /workspace/build-snapdragon

[d]/workspace> cmake --build build-snapdragon
...
[144/356] Performing build step for 'htp-v73'
[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
...
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
...
```

To generate an installable "package", simply use `cmake --install`:

```
[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
-- Install configuration: "Release"
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
...
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
...
```

## How to Install

For this step, your device needs to be configured for on-device development.
Please see https://developer.android.com/studio/debug/dev-options for details.

Once ADB is enabled, use `adb push` to install the `pkg-adb/llama.cpp` package on the device.
**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**

```
~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
```

At this point, you should also install some models:

```
~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
...
2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]

~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
```

## How to Run

The easiest way to run the llama.cpp CLI tools is via the provided wrapper scripts, which set up all required environment variables.

llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (OpenCL), and Hexagon NPU (HTP0-4).
You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.

The Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.

Here are some examples of running various llama.cpp tools via ADB.

Simple question for Llama-3.2-1B:

```
~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79
ggml-hex: allocating new session: HTP0
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
...
load_tensors: offloading output layer to GPU
load_tensors: offloaded 17/17 layers to GPU
load_tensors: CPU model buffer size = 225.49 MiB
load_tensors: HTP0 model buffer size = 0.26 MiB
load_tensors: HTP0-REPACK model buffer size = 504.00 MiB
...
I hope this helps you understand the world's most popular cookies! [end of text]
...
llama_perf_sampler_print: sampling time = 30.08 ms / 487 runs ( 0.06 ms per token, 16191.77 tokens per second)
llama_perf_context_print: load time = 617.94 ms
llama_perf_context_print: prompt eval time = 80.76 ms / 11 tokens ( 7.34 ms per token, 136.21 tokens per second)
llama_perf_context_print: eval time = 9210.59 ms / 475 runs ( 19.39 ms per token, 51.57 tokens per second)
llama_perf_context_print: total time = 9454.92 ms / 486 tokens
llama_perf_context_print: graphs reused = 473
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 439 = 225 + 136 + 77 |
llama_memory_breakdown_print: | - HTP0-REPACK | 504 = 504 + 0 + 0 |
```

Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices:

```
~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v81
ggml-hex: allocating new session: HTP0
ggml-hex: allocating new session: HTP1
...
load_tensors: offloading output layer to GPU
load_tensors: offloaded 17/17 layers to GPU
load_tensors: CPU model buffer size = 143.86 MiB
load_tensors: HTP1 model buffer size = 0.23 MiB
load_tensors: HTP1-REPACK model buffer size = 1575.00 MiB
load_tensors: HTP0 model buffer size = 0.28 MiB
load_tensors: HTP0-REPACK model buffer size = 2025.00 MiB
...
llama_context: CPU output buffer size = 0.19 MiB
llama_kv_cache: HTP1 KV buffer size = 238.00 MiB
llama_kv_cache: HTP0 KV buffer size = 306.00 MiB
llama_kv_cache: size = 544.00 MiB ( 8192 cells, 16 layers, 1/1 seqs), K (q8_0): 272.00 MiB, V (q8_0): 272.00 MiB
llama_context: HTP0 compute buffer size = 15.00 MiB
llama_context: HTP1 compute buffer size = 15.00 MiB
llama_context: CPU compute buffer size = 24.56 MiB
...
llama_perf_context_print: prompt eval time = 1730.57 ms / 212 tokens ( 8.16 ms per token, 122.50 tokens per second)
llama_perf_context_print: eval time = 5624.75 ms / 257 runs ( 21.89 ms per token, 45.69 tokens per second)
llama_perf_context_print: total time = 7377.33 ms / 469 tokens
llama_perf_context_print: graphs reused = 255
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 742 = 144 + 544 + 54 |
llama_memory_breakdown_print: | - HTP1-REPACK | 1575 = 1575 + 0 + 0 |
llama_memory_breakdown_print: | - HTP0-REPACK | 2025 = 2025 + 0 + 0 |
```

Op test for MUL_MAT, and a quick llama-bench run:

```
~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
...
Backend 2/3: HTP0
  Device description: Hexagon
  Device memory: 2048 MB (2048 MB free)
  MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
  MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
  MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK

~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
...
ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
ggml-hex: Hexagon Arch version v79
ggml-hex: allocating new session: HTP0
ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090

| model         |       size |  params | backend    | ngl | threads | n_batch | mmap |  test |           t/s |
| ------------- | ---------: | ------: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
| llama 1B Q4_0 | 729.75 MiB |  1.24 B | HTP        |  99 |       4 |     128 |    0 | pp128 | 169.42 ± 1.75 |
| llama 1B Q4_0 | 729.75 MiB |  1.24 B | HTP        |  99 |       4 |     128 |    0 | tg64  |  51.54 ± 1.13 |

build: 6a8cf8914 (6733)
```

## Environment variables

- `GGML_HEXAGON_NDEV=1`
  Controls the number of devices/sessions to allocate. The default is 1.
  Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.

- `GGML_HEXAGON_NHVX=0`
  Controls the number of HVX hardware threads to use. The default is all of them (the actual number varies depending on the hardware version).

- `GGML_HEXAGON_HOSTBUF=1`
  Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
  This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).

- `GGML_HEXAGON_VERBOSE=1`
  Enables verbose logging of Ops from the backend. Example output:

  ```
  ggml-hex: HTP0 graph-compute n_nodes 2
  ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
  ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
  ggml-hex: HTP0 graph-compute n_nodes 1
  ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
  ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
  ```

- `GGML_HEXAGON_PROFILE=1`
  Generates a host-side profile for the ggml-hexagon Ops.

- `GGML_HEXAGON_OPMASK=0x0`
  Allows enabling specific stages of the processing pipeline:

  - `0x1` Enable Op Queue (i.e., queuing Ops into the NPU)
  - `0x2` Enable Dynamic Quantizer (if needed for the Op)
  - `0x4` Enable Op Compute (MUL_MAT, etc.)

  Examples:

  `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
  `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - the NPU performs dynamic quantization and skips the rest
  `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - full queuing and processing of Ops (default)
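For reference, a minimal C sketch of how such a stage mask can be read and tested (the helper and the default handling are illustrative assumptions, not the backend's actual parsing code):

```
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

// Stage bits as documented above.
#define OPMASK_QUEUE   0x1  // queue Ops into the NPU
#define OPMASK_QUANT   0x2  // dynamic quantizer
#define OPMASK_COMPUTE 0x4  // Op compute (MUL_MAT, etc.)

// Hypothetical helper: read GGML_HEXAGON_OPMASK and test one stage bit.
// An unset or 0x0 mask is treated here as "all stages enabled", matching
// the 0x7 default shown in the examples above.
static bool opmask_stage_enabled(uint32_t stage_bit) {
    const char * s    = getenv("GGML_HEXAGON_OPMASK");
    uint32_t     mask = s ? (uint32_t) strtoul(s, NULL, 0) : 0;  // base 0 accepts "0x.." forms
    if (mask == 0) {
        mask = OPMASK_QUEUE | OPMASK_QUANT | OPMASK_COMPUTE;
    }
    return (mask & stage_bit) != 0;
}
```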
@@ -0,0 +1,109 @@
# Hexagon backend developer details

## Backend libraries

The Hexagon backend consists of two parts:

- `libggml-hexagon`
  This is the regular CPU-side GGML backend library, either shared or statically linked.

- `libggml-htp-vNN`
  This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
  The correct library is selected automatically at runtime based on the HW version.

Here is an example of the build artifacts:

```
~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
pkg-adb/llama.cpp/lib/libggml-base.so
pkg-adb/llama.cpp/lib/libggml-cpu.so
pkg-adb/llama.cpp/lib/libggml-hexagon.so  <<< CPU library
pkg-adb/llama.cpp/lib/libggml-htp-v73.so  <<< HTP op/kernels for Hexagon v73
pkg-adb/llama.cpp/lib/libggml-htp-v75.so
pkg-adb/llama.cpp/lib/libggml-htp-v79.so
pkg-adb/llama.cpp/lib/libggml-htp-v81.so
```

## Memory buffers

The Hexagon NPU backend takes advantage of Snapdragon's unified memory model, where all buffers are fully accessible by the CPU and the NPU.
The NPU does have a dedicated tightly-coupled memory called VTCM, but that memory is used only for intermediate data (e.g. dynamically quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).

Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those to the NPU at this point.

The backend does allocate non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
From the MMU perspective these buffers are still regular buffers (normal access by the CPU); they are marked as non-host simply to force the repacking.
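A minimal sketch of that rule (the helper name is hypothetical; the actual decision is made inside the backend's buffer handling):

```
#include <stdbool.h>

#include "ggml.h"

// Hypothetical illustration: the datatypes the text above says are placed
// in non-host (REPACK) buffers.
static bool hexagon_type_needs_repack(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_MXFP4:
            return true;   // repacked into an NPU-friendly layout
        default:
            return false;  // stays in a regular host buffer
    }
}
```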
## Large model handling

A Hexagon NPU session (aka a Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).

In order to map models larger than 3.5GB, we need to allocate multiple devices and split the model.
For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
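A back-of-the-envelope sketch of this sizing rule (illustrative only; a real split also has to leave room for the KV cache and compute buffers):

```
#include <stdint.h>

// Illustrative only: estimate GGML_HEXAGON_NDEV for a given amount of
// offloaded weights, assuming the ~3.5GB per-session mapping limit
// described above. The GPT-OSS-20B example below uses NDEV=4.
static int hexagon_estimate_ndev(uint64_t model_bytes) {
    const uint64_t per_session = 3500ull * 1024 * 1024;  // ~3.5GB
    return (int) ((model_bytes + per_session - 1) / per_session);
}
```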
Here is an example of running the GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.

```
M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
...
LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
  -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
...
llama_model_loader: - type f32: 289 tensors
llama_model_loader: - type q4_0: 96 tensors
llama_model_loader: - type q8_0: 2 tensors
llama_model_loader: - type mxfp4: 72 tensors
...
load_tensors: offloaded 25/25 layers to GPU
load_tensors: CPU model buffer size = 1182.09 MiB
load_tensors: HTP1 model buffer size = 6.64 MiB
load_tensors: HTP1-REPACK model buffer size = 2505.94 MiB
load_tensors: HTP3 model buffer size = 5.55 MiB
load_tensors: HTP3-REPACK model buffer size = 2088.28 MiB
load_tensors: HTP0 model buffer size = 7.75 MiB
load_tensors: HTP0-REPACK model buffer size = 2923.59 MiB
load_tensors: HTP2 model buffer size = 6.64 MiB
load_tensors: HTP2-REPACK model buffer size = 2505.94 MiB
...
llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_context: CPU output buffer size = 0.77 MiB
llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
llama_kv_cache: HTP1 KV buffer size = 25.50 MiB
llama_kv_cache: HTP3 KV buffer size = 25.50 MiB
llama_kv_cache: HTP0 KV buffer size = 25.50 MiB
llama_kv_cache: HTP2 KV buffer size = 25.50 MiB
llama_kv_cache: size = 102.00 MiB ( 8192 cells, 12 layers, 1/1 seqs), K (q8_0): 51.00 MiB, V (q8_0): 51.00 MiB
llama_kv_cache_iswa: creating SWA KV cache, size = 256 cells
llama_kv_cache: HTP1 KV buffer size = 0.80 MiB
llama_kv_cache: HTP3 KV buffer size = 0.53 MiB
llama_kv_cache: HTP0 KV buffer size = 1.06 MiB
llama_kv_cache: HTP2 KV buffer size = 0.80 MiB
llama_kv_cache: size = 3.19 MiB ( 256 cells, 12 layers, 1/1 seqs), K (q8_0): 1.59 MiB, V (q8_0): 1.59 MiB
llama_context: HTP0 compute buffer size = 16.06 MiB
llama_context: HTP1 compute buffer size = 16.06 MiB
llama_context: HTP2 compute buffer size = 16.06 MiB
llama_context: HTP3 compute buffer size = 16.06 MiB
llama_context: CPU compute buffer size = 98.19 MiB
...
llama_perf_context_print: prompt eval time = 3843.67 ms / 197 tokens ( 19.51 ms per token, 51.25 tokens per second)
llama_perf_context_print: eval time = 1686.13 ms / 31 runs ( 54.39 ms per token, 18.39 tokens per second)
llama_perf_context_print: total time = 6266.30 ms / 228 tokens
llama_perf_context_print: graphs reused = 30
llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
llama_memory_breakdown_print: | - HTP0 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP1 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP2 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - HTP3 (Hexagon) | 2048 = 2048 + ( 0 = 0 + 0 + 0) + 0 |
llama_memory_breakdown_print: | - Host | 1476 = 1208 + 105 + 162 |
llama_memory_breakdown_print: | - HTP1-REPACK | 2505 = 2505 + 0 + 0 |
llama_memory_breakdown_print: | - HTP3-REPACK | 2088 = 2088 + 0 + 0 |
llama_memory_breakdown_print: | - HTP0-REPACK | 2923 = 2923 + 0 + 0 |
llama_memory_breakdown_print: | - HTP2-REPACK | 2505 = 2505 + 0 + 0 |
```
@@ -251,6 +251,8 @@ option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adr
set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                "ggml: OpenCL API version to target")

option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF)

# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
@@ -0,0 +1,19 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void);

GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void);

#ifdef __cplusplus
}
#endif
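A minimal usage sketch for this API (a hypothetical standalone program; `ggml_backend_name` and `ggml_backend_free` come from `ggml-backend.h`, and the program assumes the backend was compiled in):

```
#include <stdio.h>

#include "ggml-backend.h"
#include "ggml-hexagon.h"

int main(void) {
    // Directly create a Hexagon backend instance on the default device.
    ggml_backend_t backend = ggml_backend_hexagon_init();
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize the Hexagon backend\n");
        return 1;
    }

    printf("backend: %s (is-hexagon: %d)\n",
           ggml_backend_name(backend), (int) ggml_backend_is_hexagon(backend));

    ggml_backend_free(backend);
    return 0;
}
```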
@@ -402,6 +402,7 @@ ggml_add_backend(Vulkan)
ggml_add_backend(WebGPU)
ggml_add_backend(zDNN)
ggml_add_backend(OpenCL)
ggml_add_backend(Hexagon)

foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
@@ -57,6 +57,10 @@
#include "ggml-opencl.h"
#endif

#ifdef GGML_USE_HEXAGON
#include "ggml-hexagon.h"
#endif

#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@@ -199,6 +203,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_HEXAGON
        register_backend(ggml_backend_hexagon_reg());
#endif
#ifdef GGML_USE_CANN
        register_backend(ggml_backend_cann_reg());
#endif
@@ -598,6 +605,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("sycl", silent, dir_path);
    ggml_backend_load_best("vulkan", silent, dir_path);
    ggml_backend_load_best("opencl", silent, dir_path);
    ggml_backend_load_best("hexagon", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
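Since the backend now participates in the generic registry, applications can also discover it without the Hexagon-specific header. A hedged sketch using the generic device API from `ggml-backend.h` (the `HTP0` device name follows the docs above and is an assumption here):

```
#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    // Load all dynamically available backends, including "hexagon" if present.
    ggml_backend_load_all();

    // Look up the first Hexagon device by its documented name.
    ggml_backend_dev_t dev = ggml_backend_dev_by_name("HTP0");
    if (dev == NULL) {
        fprintf(stderr, "no HTP0 device found\n");
        return 1;
    }

    ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize HTP0\n");
        return 1;
    }

    printf("initialized: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```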
@@ -0,0 +1,68 @@
include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)
include(ExternalProject)

option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF)

add_library(htp_iface OBJECT
    ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c)

set_target_properties(htp_iface PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(htp_iface PUBLIC
    ${HEXAGON_SDK_ROOT}/incs
    ${HEXAGON_SDK_ROOT}/incs/stddef
    ${HEXAGON_SDK_ROOT}/utils/examples
    ${CMAKE_CURRENT_SOURCE_DIR}/htp
    ${CMAKE_CURRENT_BINARY_DIR})

build_idl(htp/htp_iface.idl htp_iface)

if (CMAKE_SYSTEM_NAME MATCHES Android)
    target_link_options(htp_iface PUBLIC -llog -ldl)
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
    target_precompile_headers(htp_iface PUBLIC <sal.h>)
else()
    target_link_options(htp_iface PUBLIC -ldl)
endif()

link_custom_library(htp_iface cdsprpc)
link_custom_library(htp_iface rpcmem)

set(TARGET_NAME ggml-hexagon)
ggml_add_backend_library(${TARGET_NAME}
    ggml-hexagon.cpp htp-utils.c htp-utils.h ../../include/ggml-hexagon.h)

target_link_libraries(${TARGET_NAME} PRIVATE htp_iface)
target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/htp ${CMAKE_CURRENT_BINARY_DIR})

# Build HTP bits
set(HTP_CMAKE_ARGS
    -DCMAKE_TOOLCHAIN_FILE=${CMAKE_CURRENT_SOURCE_DIR}/htp/cmake-toolchain.cmake
    -DCMAKE_BUILD_TYPE=Release
    -DCMAKE_INSTALL_LIBDIR=${CMAKE_CURRENT_BINARY_DIR}
    -DHEXAGON_SDK_ROOT=$ENV{HEXAGON_SDK_ROOT}
    -DHEXAGON_TOOLS_ROOT=$ENV{HEXAGON_TOOLS_ROOT}
    -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG})

ExternalProject_Add(htp-v73
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v73 -DPREBUILT_LIB_DIR="toolv19_v73")

ExternalProject_Add(htp-v75
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v75 -DPREBUILT_LIB_DIR="toolv19_v75")

ExternalProject_Add(htp-v79
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v79 -DPREBUILT_LIB_DIR="toolv19_v79")

ExternalProject_Add(htp-v81
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/htp BUILD_ALWAYS ON
    CMAKE_ARGS ${HTP_CMAKE_ARGS} -DDSP_VERSION=v81 -DPREBUILT_LIB_DIR="toolv19_v81")

# Install Hexagon skels required at runtime
install(FILES
    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v73.so
    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v75.so
    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v79.so
    ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-v81.so
    TYPE LIB)

File diff suppressed because it is too large
@@ -0,0 +1,448 @@
#pragma clang diagnostic ignored "-Wgnu-anonymous-struct"
#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wsign-compare"

#define GGML_COMMON_IMPL_C
#include "ggml-backend-impl.h"
#include "ggml-common.h"
#include "ggml-hexagon.h"
#include "ggml-impl.h"

#include "htp-utils.h"

#include <domain.h>
#include <remote.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

domain * get_domain(int domain_id) {
    int i    = 0;
    int size = sizeof(supported_domains) / sizeof(domain);

    for (i = 0; i < size; i++) {
        if (supported_domains[i].id == domain_id) {
            return &supported_domains[i];
        }
    }

    return NULL;
}

bool is_valid_domain_id(int domain_id, int compute_only) {
    int i    = 0;
    int size = sizeof(supported_domains) / sizeof(domain);

    if (compute_only) {
        return is_CDSP(domain_id);
    }

    for (i = 0; i < size; i++) {
        if (supported_domains[i].id == domain_id) {
            return true;
        }
    }

    return false;
}

int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info) {
    int nErr    = AEE_SUCCESS;
    int ss_info = 0;
    if (domain_type != NULL) {
        if (strcmp(domain_type, "LPASS") == 0) {
            ss_info = FASTRPC_LPASS;
        } else if (strcmp(domain_type, "HPASS") == 0) {
            ss_info = FASTRPC_HPASS;
        } else {
            ss_info = FASTRPC_NSP;
        }
    }
    system_req_payload req = { 0 };
    req.id          = FASTRPC_GET_DOMAINS;
    req.sys.domains = NULL;
    fastrpc_domain * domain = NULL;
    if (ss_info != 0) {
        req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info);
    } else {
        req.sys.flags = 0;
    }
#ifdef _WIN32
    nErr = AEE_EUNSUPPORTED;
    goto bail;
#endif
    if (remote_system_request) {
        nErr = remote_system_request(&req);
        if (nErr != AEE_SUCCESS) {
            GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
            goto bail;
        }
        // Allocate memory for domain-info array
        req.sys.max_domains = req.sys.num_domains;
        if ((req.sys.domains = calloc(req.sys.num_domains, sizeof(fastrpc_domain))) == NULL) {
            nErr = AEE_ENOMEMORY;
            GGML_LOG_ERROR("Unable to allocate memory for req.sys.domains");
            goto bail;
        }

        nErr = remote_system_request(&req);
        if (nErr != AEE_SUCCESS) {
            GGML_LOG_ERROR("Failure in remote_system_request call: %d.\n", nErr);
            goto bail;
        }

        for (int i = 0; i < req.sys.num_domains; i++) {
            // Verify that only requested type domains were returned
            domain = &req.sys.domains[i];
            if (domain->type != ss_info && domain_type != NULL) {
                nErr = -1;
                GGML_LOG_ERROR("Incorrect data received from remote_system_request.\n");
                goto bail;
            }
        }
        *domains_info = req.sys.domains;
        *num_domains  = req.sys.num_domains;
    } else {
        nErr = AEE_EUNSUPPORTED;
        goto bail;
    }
bail:
    // Release the partially filled array on error (do not leak it).
    if (nErr && req.sys.domains) {
        free(req.sys.domains);
    }
    return nErr;
}
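// Hypothetical usage sketch for get_domains_info() above (illustrative, not
// upstream code): count the FastRPC NSP domains visible to this process.
// The caller owns the returned array and must free() it; a NULL domain_type
// selects the default NSP path, as in the implementation above.
static void print_nsp_domain_count(void) {
    int              num_domains = 0;
    fastrpc_domain * domains     = NULL;

    if (get_domains_info(NULL, &num_domains, &domains) == AEE_SUCCESS) {
        GGML_LOG_INFO("found %d NSP domain(s)\n", num_domains);
        free(domains);  // ownership was transferred to the caller
    }
}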
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id) {
    int err = 0;
    remote_rpc_effective_domain_id_t sess = { 0 };

    sess.domain_name     = domain_name;
    sess.domain_name_len = strlen(domain_name);
    sess.session_id      = session_id;

    err = remote_session_control(FASTRPC_GET_EFFECTIVE_DOMAIN_ID, &sess, sizeof(sess));
    if (err) {
        GGML_LOG_ERROR("Error 0x%x: failed to get effective domain id for %s, session id %d\n", err, sess.domain_name,
                       session_id);
        return err;
    }

    *effec_domain_id = sess.effective_domain_id;
    return err;
}

int get_dsp_support(int * domain) {
    int nErr = AEE_SUCCESS;
    *domain  = CDSP_DOMAIN_ID;  // DSP domain default value is CDSP_DOMAIN_ID

    if (remote_handle_control) {
        struct remote_dsp_capability dsp_capability_domain = { CDSP_DOMAIN_ID, DOMAIN_SUPPORT, 0 };
        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
            goto bail;
        }

        if (dsp_capability_domain.capability == 0) {
            dsp_capability_domain.domain       = ADSP_DOMAIN_ID;  // Check for ADSP support.
            dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT;
            dsp_capability_domain.capability   = 0;
            nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain,
                                         sizeof(struct remote_dsp_capability));
            if (dsp_capability_domain.capability) {
                *domain = ADSP_DOMAIN_ID;  // For targets like Agatti (not having cDSP), domain is ADSP_DOMAIN_ID
            }
        }

        if (nErr != AEE_SUCCESS) {
            GGML_LOG_ERROR("\nget_dsp_support failed with Error 0x%x\n", nErr);
            goto bail;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
    }

bail:
    return nErr;
}

int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) {
    int nErr    = AEE_SUCCESS;
    *capability = 0;

    if (attr != VTCM_PAGE && attr != VTCM_COUNT) {
        nErr = AEE_EBADPARM;
        GGML_LOG_ERROR("Unsupported attr. Only VTCM_PAGE and VTCM_COUNT supported\n");
        goto bail;
    }
    if (remote_handle_control) {
        if (domain == ADSP_DOMAIN_ID || domain == CDSP_DOMAIN_ID) {
            /*
             * Query the DSP for VTCM information
             * Since the ADSP does not have a dedicated VTCM, we expect the output to be 0
             */
            struct remote_dsp_capability dsp_capability_vtcm_dsp;
            dsp_capability_vtcm_dsp.domain       = (uint32_t) domain;
            dsp_capability_vtcm_dsp.attribute_ID = attr;
            dsp_capability_vtcm_dsp.capability   = (uint32_t) 0;
            nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp,
                                         sizeof(struct remote_dsp_capability));
            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
                nErr = AEE_SUCCESS;
                goto bail;
            } else if (nErr == AEE_SUCCESS) {
                *capability = dsp_capability_vtcm_dsp.capability;
            } else {
                GGML_LOG_ERROR("\nget_vtcm_info failed with Error 0x%x\n", nErr);
                goto bail;
            }
        } else {
            nErr = AEE_EUNSUPPORTED;
            GGML_LOG_ERROR("Unsupported domain %d\n", domain);
            goto bail;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
    }

bail:
    return nErr;
}

bool is_unsignedpd_supported(int domain_id) {
    int nErr = AEE_SUCCESS;
    if (remote_handle_control) {
        struct remote_dsp_capability dsp_capability_domain = { domain_id, UNSIGNED_PD_SUPPORT, 0 };
        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability));
        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device. Falling back to signed pd.\n");
            return false;
        }
        if (nErr) {
            GGML_LOG_ERROR("\nERROR 0x%x: FastRPC Capability API failed. Falling back to signed pd.", nErr);
            return false;
        }
        if (dsp_capability_domain.capability == 1) {
            return true;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device. Falling back to signed pd.\n");
        return false;
    }
    return false;
}

bool get_unsignedpd_support(void) {
    return is_unsignedpd_supported(CDSP_DOMAIN_ID);
}

bool is_async_fastrpc_supported(int domain) {
    int nErr = AEE_SUCCESS;
    if (remote_handle_control) {
        if (domain == CDSP_DOMAIN_ID) {
            /*
             * Query the DSP for ASYNC_FASTRPC_SUPPORT information
             * Async fastrpc is supported only on CDSP
             */
            struct remote_dsp_capability dsp_capability_async_support;
            dsp_capability_async_support.domain       = (uint32_t) domain;
            dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT;
            dsp_capability_async_support.capability   = (uint32_t) 0;
            nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support,
                                         sizeof(struct remote_dsp_capability));
            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
                nErr = AEE_SUCCESS;
                goto bail;
            } else if (dsp_capability_async_support.capability == 1) {
                return true;
            }
            if (nErr != AEE_SUCCESS) {
                GGML_LOG_ERROR("\nis_async_fastrpc_supported failed with Error 0x%x\n", nErr);
                goto bail;
            }
        } else {
            nErr = AEE_EUNSUPPORTED;
            GGML_LOG_ERROR("Async fastrpc is not supported on domain %d\n", domain);
            goto bail;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
    }

bail:
    return false;
}

bool is_status_notification_supported(int domain) {
    int nErr = AEE_SUCCESS;

    if (remote_handle_control) {
        /*
         * Query the DSP for STATUS_NOTIFICATION_SUPPORT information
         * DSP User PD status notification Support
         */
        struct remote_dsp_capability dsp_capability_status_notification_support;
        dsp_capability_status_notification_support.domain       = (uint32_t) domain;
        dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT;
        dsp_capability_status_notification_support.capability   = (uint32_t) 0;
        nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support,
                                     sizeof(struct remote_dsp_capability));
        if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
            GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
            GGML_LOG_ERROR("Running the usecase without checking the capability\n");
            nErr = AEE_SUCCESS;
            goto bail;
        } else if (dsp_capability_status_notification_support.capability == 1) {
            return true;
        }
        if (nErr != AEE_SUCCESS) {
            GGML_LOG_ERROR("\nis_status_notification_supported failed with Error 0x%x\n", nErr);
            goto bail;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
    }

bail:
    return false;
}

int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) {
    int nErr    = AEE_SUCCESS;
    *capability = 0;

    if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) {
        nErr = AEE_EBADPARM;
        GGML_LOG_ERROR("Unsupported attr. Only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported\n");
        goto bail;
    }
    if (remote_handle_control) {
        if (domain == CDSP_DOMAIN_ID) {
            /*
             * Query the DSP for HMX SUPPORT information
             * HMX is supported on CDSP only
             */
            struct remote_dsp_capability dsp_capability_hmx_dsp;
            dsp_capability_hmx_dsp.domain       = (uint32_t) domain;
            dsp_capability_hmx_dsp.attribute_ID = attr;
            dsp_capability_hmx_dsp.capability   = (uint32_t) 0;
            nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp,
                                         sizeof(struct remote_dsp_capability));
            if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
                GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
                GGML_LOG_ERROR("Running the usecase without checking the capability\n");
                nErr = AEE_SUCCESS;
                goto bail;
            } else if (nErr == AEE_SUCCESS) {
                *capability = dsp_capability_hmx_dsp.capability;
            } else {
                GGML_LOG_ERROR("\nget_hmx_support_info failed with Error 0x%x\n", nErr);
                goto bail;
            }
        } else {
            nErr = AEE_EUNSUPPORTED;
            GGML_LOG_ERROR("HMX support is not there for domain %d\n", domain);
            goto bail;
        }
    } else {
        nErr = AEE_EUNSUPPORTEDAPI;
        GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
    }

bail:
    return nErr;
}

int get_hex_arch_ver(int domain, int * arch) {
    if (!remote_handle_control) {
        GGML_LOG_ERROR("ggml-hex: remote_handle_control is not supported on this device\n");
        return AEE_EUNSUPPORTEDAPI;
    }

    struct remote_dsp_capability arch_ver;
    arch_ver.domain       = (uint32_t) domain;
    arch_ver.attribute_ID = ARCH_VER;
    arch_ver.capability   = (uint32_t) 0;

    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &arch_ver, sizeof(arch_ver));
    if ((err & 0xff) == (AEE_EUNSUPPORTEDAPI & 0xff)) {
        GGML_LOG_ERROR("ggml-hex: FastRPC capability API is not supported on this device\n");
        return AEE_EUNSUPPORTEDAPI;
    }

    if (err != AEE_SUCCESS) {
        GGML_LOG_ERROR("ggml-hex: FastRPC capability query failed (err %d)\n", err);
        return err;
    }

    switch (arch_ver.capability & 0xff) {
        case 0x73:
            *arch = 73;
            return 0;
        case 0x75:
            *arch = 75;
            return 0;
        case 0x79:
            *arch = 79;
            return 0;
        case 0x81:
            *arch = 81;
            return 0;
    }
    return -1;
}
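// Hypothetical helper (illustrative sketch, not the backend's actual code):
// map the arch reported by get_hex_arch_ver() to the matching NPU library
// name, e.g. 79 -> "libggml-htp-v79.so". This mirrors the runtime library
// selection described in the developer docs above.
static int htp_lib_name(int domain, char * buf, size_t buf_size) {
    int arch = 0;
    int err  = get_hex_arch_ver(domain, &arch);
    if (err != 0) {
        return err;
    }
    snprintf(buf, buf_size, "libggml-htp-v%d.so", arch);
    return 0;
}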
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) {
|
||||||
|
int nErr = AEE_SUCCESS;
|
||||||
|
*capability = 0;
|
||||||
|
|
||||||
|
if (remote_handle_control) {
|
||||||
|
if (domain == CDSP_DOMAIN_ID) {
|
||||||
|
/*
|
||||||
|
* Query the DSP for HVX SUPPORT information
|
||||||
|
* HVX is supported on CDSP only
|
||||||
|
*/
|
||||||
|
struct remote_dsp_capability dsp_capability_hvx_dsp;
|
||||||
|
dsp_capability_hvx_dsp.domain = (uint32_t) domain;
|
||||||
|
dsp_capability_hvx_dsp.attribute_ID = attr;
|
||||||
|
dsp_capability_hvx_dsp.capability = (uint32_t) 0;
|
||||||
|
nErr = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp,
|
||||||
|
sizeof(struct remote_dsp_capability));
|
||||||
|
if ((nErr & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
|
||||||
|
GGML_LOG_ERROR("\nFastRPC Capability API is not supported on this device\n");
|
||||||
|
GGML_LOG_ERROR("Running the usecase without checking the capability\n");
|
||||||
|
nErr = AEE_SUCCESS;
|
||||||
|
goto bail;
|
||||||
|
} else if (nErr == AEE_SUCCESS) {
|
||||||
|
*capability = dsp_capability_hvx_dsp.capability;
|
||||||
|
} else {
|
||||||
|
GGML_LOG_ERROR("\nget_hvx_support_info failed with Error 0x%x\n", nErr);
|
||||||
|
goto bail;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nErr = AEE_EUNSUPPORTED;
|
||||||
|
GGML_LOG_ERROR("HVX support is not available on domain %d\n", domain);
|
||||||
|
goto bail;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nErr = AEE_EUNSUPPORTEDAPI;
|
||||||
|
GGML_LOG_ERROR("remote_dsp_capability interface is not supported on this device\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
bail:
|
||||||
|
return nErr;
|
||||||
|
}
|
||||||
|
|
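/*
 * Usage sketch (illustrative, not part of this patch): how a host-side caller
 * might combine the wrappers above to validate a compute DSP before opening a
 * session. probe_cdsp() is a hypothetical helper; CDSP_DOMAIN_ID and the AEE_*
 * codes come from the FastRPC headers already used in this file.
 */
#if 0
static int probe_cdsp(void) {
    int arch = 0;
    int err  = get_hex_arch_ver(CDSP_DOMAIN_ID, &arch);
    if (err != 0) {
        return err;  // capability API missing or the query failed
    }
    if (arch < 73) {
        return AEE_EUNSUPPORTED;  // the backend targets v73 and newer
    }
    // unsigned PDs are the typical deployment mode for user binaries
    return is_unsignedpd_supported(CDSP_DOMAIN_ID) ? 0 : AEE_EUNSUPPORTED;
}
#endif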
@ -0,0 +1,219 @@
#ifndef HTP_UTILS_H
#define HTP_UTILS_H

#ifdef __cplusplus
extern "C" {
#endif

#include <AEEStdErr.h>
#include <inttypes.h>
#include <remote.h>
#include <stdbool.h>

/* Offset to differentiate HLOS and Hexagon error codes.
   Stores the value of AEE_EOFFSET for Hexagon. */
#ifndef DSP_OFFSET
# define DSP_OFFSET 0x80000400
#endif

/* Errno for connection reset by peer. */
#ifndef ECONNRESET
# ifdef __hexagon__
#  define ECONNRESET 104
# endif
#endif

/* Abstraction of different OS specific sleep APIs.
   SLEEP accepts input in seconds. */
#ifndef SLEEP
# ifdef __hexagon__
#  define SLEEP(x) \
      { /* Do nothing for simulator. */ \
      }
# else
#  ifdef _WINDOWS
#   define SLEEP(x) Sleep(1000 * x) /* Sleep accepts input in milliseconds. */
#  else
#   define SLEEP(x) sleep(x) /* sleep accepts input in seconds. */
#  endif
# endif
#endif

/* Include windows specific header files. */
#ifdef _WINDOWS
# include <sysinfoapi.h>
# include <windows.h>
# define _CRT_SECURE_NO_WARNINGS 1
# define _WINSOCK_DEPRECATED_NO_WARNINGS 1
/* Including this file for custom implementation of getopt function. */
# include "getopt_custom.h"
#endif

/* Includes and defines for all HLOS except windows */
#if !defined(__hexagon__) && !defined(_WINDOWS)
# include "unistd.h"

# include <sys/time.h>
#endif

/* Includes and defines for Hexagon and all HLOS except Windows. */
#if !defined(_WINDOWS)
/* Weak reference to remote symbol for compilation. */
# pragma weak remote_session_control
# pragma weak remote_handle_control
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
#endif

#if !defined(_WINDOWS)
# pragma weak remote_system_request
#endif

/**
 * Wrapper for FastRPC Capability API: query DSP support.
 *
 * @param[out] domain pointer to supported domain.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_dsp_support(int * domain);

/**
 * Wrapper for FastRPC Capability API: query VTCM information.
 *
 * @param[in] domain value of the domain to be queried.
 * @param[out] capability capability value of the attribute queried.
 * @param[in] attr value of the attribute to be queried.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_vtcm_info(int domain, uint32_t * capability, uint32_t attr);

/**
 * Wrapper for FastRPC Capability API: query unsigned pd support on the CDSP domain.
 *
 * @return true if unsigned pd is supported.
 *         false if unsigned pd is not supported or the capability query failed.
 */
bool get_unsignedpd_support(void);

/**
 * Wrapper for FastRPC Capability API: query unsigned pd support.
 *
 * @param[in] domain_id value of the domain to be queried.
 * @return true if unsigned pd is supported.
 *         false if unsigned pd is not supported or the capability query failed.
 */
bool is_unsignedpd_supported(int domain_id);

/**
 * is_valid_domain_id API: query whether a domain id is valid.
 *
 * @param[in] domain_id value of the domain to be queried.
 * @param[in] compute_only when enabled, the domain is only compared against the CDSP domains supported by the target.
 * @return true if the domain id is valid.
 *         false if the domain id is not valid.
 */
bool is_valid_domain_id(int domain_id, int compute_only);

/**
 * get_domain API: get domain struct from domain value.
 *
 * @param[in] domain_id value of a domain
 * @return the domain struct of the domain if it is supported, NULL otherwise.
 */
domain * get_domain(int domain_id);

/**
 * get_domains_info API: get information for all the domains available on the device
 *
 * @param[in] domain_type pointer to domain type
 * @param[in] num_domains pointer to number of domains
 * @param[in] domains_info pointer to save discovered domains information.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 *
 * It is the user's responsibility to free the memory used to store the domains info, whose address is present in domains_info, before closing the application.
 */
int get_domains_info(char * domain_type, int * num_domains, fastrpc_domain ** domains_info);

/**
 * get_effective_domain_id API: get effective domain id for given session id
 *
 * @param[in] domain_name pointer to domain name
 * @param[in] session_id
 * @param[in] effec_domain_id pointer to save obtained effective domain id.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_effective_domain_id(char * domain_name, int session_id, int * effec_domain_id);

/**
 * is_async_fastrpc_supported API: query whether a domain supports async FastRPC
 *
 * @param[in] domain_id value of a domain
 * @return true or false stating support of async FastRPC
 */
bool is_async_fastrpc_supported(int domain_id);

/**
 * is_status_notification_supported API: query the DSP for STATUS_NOTIFICATION_SUPPORT information
 *
 * @param[in] domain_id value of a domain
 * @return true or false stating status notification support information
 */
bool is_status_notification_supported(int domain_id);

/**
 * get_hmx_support_info API: query the DSP for HMX SUPPORT information
 *
 * @param[in] domain value of a domain
 * @param[out] capability capability value of the attribute queried.
 * @param[in] attr value of the attribute to be queried.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr);

/**
 * get_hex_arch_ver API: query the Hexagon processor architecture version information
 *
 * @param[in] domain value of a domain
 * @param[out] arch architecture version (73, 75, ...)
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_hex_arch_ver(int domain, int * arch);

/**
 * get_hvx_support_info API: query the DSP for HVX SUPPORT information
 *
 * @param[in] domain value of a domain
 * @param[out] capability capability value of the attribute queried.
 * @param[in] attr value of the attribute to be queried.
 * @return 0 if query is successful.
 *         non-zero if error, return value points to the error.
 */
int get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr);

#ifdef __cplusplus
}
#endif

#endif /* HTP_UTILS_H */

@ -0,0 +1,40 @@
cmake_minimum_required(VERSION 3.22.2)
project(ggml-htp C CXX ASM)

include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_fun.cmake)

include_directories(
    ${HEXAGON_SDK_ROOT}/incs
    ${HEXAGON_SDK_ROOT}/incs/stddef
    ${CMAKE_CURRENT_SOURCE_DIR}/../..
    ${CMAKE_CURRENT_SOURCE_DIR}/..
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR})

set(HTP_LIB ggml-htp-${DSP_VERSION})

add_library(${HTP_LIB} SHARED
    main.c
    htp_iface_skel.c
    worker-pool.c
    htp-dma.c
    hvx-sigmoid.c
    hvx-inverse.c
    hvx-exp.c
    hvx-utils.c
    matmul-ops.c
    binary-ops.c
    unary-ops.c
    softmax-ops.c
    act-ops.c
    rope-ops.c
)

target_compile_definitions(${HTP_LIB} PRIVATE
    $<IF:$<BOOL:${HEXAGON_HTP_DEBUG}>,HTP_DEBUG=1,NDEBUG=1>)

build_idl(htp_iface.idl ${HTP_LIB})

set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON)

install(TARGETS ${HTP_LIB})

@ -0,0 +1,448 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#define htp_act_preamble3              \
    const uint32_t ne00 = src0->ne[0]; \
    const uint32_t ne01 = src0->ne[1]; \
    const uint32_t ne02 = src0->ne[2]; \
    const uint32_t ne03 = src0->ne[3]; \
                                       \
    const uint32_t ne10 = src1->ne[0]; \
    const uint32_t ne11 = src1->ne[1]; \
    const uint32_t ne12 = src1->ne[2]; \
    const uint32_t ne13 = src1->ne[3]; \
                                       \
    const uint32_t ne0 = dst->ne[0];   \
    const uint32_t ne1 = dst->ne[1];   \
    const uint32_t ne2 = dst->ne[2];   \
    const uint32_t ne3 = dst->ne[3];   \
                                       \
    const uint32_t nb00 = src0->nb[0]; \
    const uint32_t nb01 = src0->nb[1]; \
    const uint32_t nb02 = src0->nb[2]; \
    const uint32_t nb03 = src0->nb[3]; \
                                       \
    const uint32_t nb10 = src1->nb[0]; \
    const uint32_t nb11 = src1->nb[1]; \
    const uint32_t nb12 = src1->nb[2]; \
    const uint32_t nb13 = src1->nb[3]; \
                                       \
    const uint32_t nb0 = dst->nb[0];   \
    const uint32_t nb1 = dst->nb[1];   \
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];

#define htp_act_preamble2              \
    const uint32_t ne00 = src0->ne[0]; \
    const uint32_t ne01 = src0->ne[1]; \
    const uint32_t ne02 = src0->ne[2]; \
    const uint32_t ne03 = src0->ne[3]; \
                                       \
    const uint32_t ne0 = dst->ne[0];   \
    const uint32_t ne1 = dst->ne[1];   \
    const uint32_t ne2 = dst->ne[2];   \
    const uint32_t ne3 = dst->ne[3];   \
                                       \
    const uint32_t nb00 = src0->nb[0]; \
    const uint32_t nb01 = src0->nb[1]; \
    const uint32_t nb02 = src0->nb[2]; \
    const uint32_t nb03 = src0->nb[3]; \
                                       \
    const uint32_t nb0 = dst->nb[0];   \
    const uint32_t nb1 = dst->nb[1];   \
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];

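/*
 * Scalar reference for the SwiGLU kernels below (an illustrative sketch, not
 * used by the backend): for each row element, out = silu(x) * g with
 * silu(v) = v / (1 + expf(-v)). The HVX paths compute the same expression
 * with vectorized exp / inverse / multiply primitives.
 */
static inline void swiglu_ref_f32(const float * x, const float * g, float * out, int n) {
    for (int i = 0; i < n; i++) {
        const float s = 1.0f / (1.0f + expf(-x[i]));  // sigmoid(x)
        out[i]        = x[i] * s * g[i];
    }
}
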
static void glu_swiglu_fp32_per_thread(const struct htp_tensor * src0,
                                       const struct htp_tensor * src1,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         src1_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread) {
    htp_act_preamble3;

    size_t src0_row_size = nb01;
    size_t src1_row_size = nb11;
    size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
        is_aligned = 0;
        FARF(HIGH, "swiglu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;

    bool src1_valid = src1->ne[0];
    if (!src1_valid) {
        data_src1     = data_src0;
        src1_row_size = src0_row_size;
    }

    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);

    const int32_t swapped = op_params[1];

    const int nc = (src1_valid) ? ne0 : ne0 / 2;

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
        const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));

        if (ir + 1 < src0_end_row) {
            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
        }

        if (!src1_valid) {
            src0 += swapped ? nc : 0;
            src1 += swapped ? 0 : nc;
        }

        if (1 == opt_path) {
            hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, nc);
            hvx_mul_mul_f32_opt((const uint8_t *) src0, (const uint8_t *) src0_spad_data, (const uint8_t *) src1,
                                (uint8_t *) dst, nc);
        } else {
            hvx_exp_f32((const uint8_t *) src0, src0_spad_data, nc, true);
            hvx_add_scalar_f32(src0_spad_data, 1.0, src1_spad_data, nc);
            hvx_inverse_f32(src1_spad_data, src0_spad_data, nc);

            hvx_mul_f32((const uint8_t *) src0, src0_spad_data, dst_spad_data, nc);
            hvx_mul_f32(dst_spad_data, (const uint8_t *) src1, (uint8_t *) dst, nc);
        }
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "swiglu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
         ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void glu_swiglu_oai_fp32_per_thread(const struct htp_tensor * src0,
                                           const struct htp_tensor * src1,
                                           struct htp_tensor *       dst,
                                           const int32_t *           op_params,
                                           struct htp_spad *         src0_spad,
                                           struct htp_spad *         src1_spad,
                                           struct htp_spad *         dst_spad,
                                           uint32_t                  nth,
                                           uint32_t                  ith,
                                           uint32_t                  src0_nrows_per_thread) {
    htp_act_preamble3;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    const size_t src0_row_size = nb01;
    const size_t src1_row_size = nb11;
    const size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
        FARF(HIGH, "act-f32: unaligned addresses in activations op, possibly slower execution\n");
    }

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;

    bool src1_valid = src1->ne[0];
    if (!src1_valid) {
        data_src1 = data_src0;
    }

    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
    uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_row_size);
    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);

    const int32_t swapped = op_params[1];
    const float   alpha   = ((const float *) (op_params))[2];
    const float   limit   = ((const float *) (op_params))[3];

    const int nc = (src1_valid) ? ne0 : ne0 / 2;

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
        const float * restrict src1 = (float *) (data_src1 + (ir * src1_row_size));
        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));

        if (ir + 1 < src0_end_row) {
            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
        }

        if (!src1_valid) {
            src0 += swapped ? nc : 0;
            src1 += swapped ? 0 : nc;
        }

        // x (src0_spad_data) = std::min(src0_p[k], limit);
        hvx_min_scalar_f32((const uint8_t *) src0, limit, src0_spad_data, nc);
        // y1 (src1_spad_data) = std::clamp(src1_p[k], -limit, limit);
        hvx_clamp_scalar_f32((const uint8_t *) src1, limit, limit, src1_spad_data, nc);
        // y (src1_spad_data) = y1 + 1.f
        hvx_add_scalar_f32(src1_spad_data, 1.0, src1_spad_data, nc);
        // x1 (dst_spad_data) = alpha * (x)
        hvx_mul_scalar_f32(src0_spad_data, alpha, dst_spad_data, nc);
        // x2 (dst_spad_data) = expf(-x1)
        hvx_exp_f32(dst_spad_data, dst_spad_data, nc, true);
        // x3 (dst_spad_data) = x2 + 1.f
        hvx_add_scalar_f32(dst_spad_data, 1.0, dst_spad_data, nc);
        // x4 (dst_spad_data) = 1 / x3
        hvx_inverse_f32(dst_spad_data, dst_spad_data, nc);
        // out_glu (dst_spad_data) = x * x4
        hvx_mul_f32(src0_spad_data, dst_spad_data, dst_spad_data, nc);
        // out = out_glu * (y + 1.f);
        hvx_mul_f32(dst_spad_data, src1_spad_data, (uint8_t *) dst, nc);
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "swiglu-oai-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, src0->ne[0],
         src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], src1->ne[2],
         src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

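/*
 * Scalar reference for the clamped SwiGLU variant above (illustrative): with
 * x = min(x, limit) and y = clamp(y, -limit, limit), the kernel computes
 * out = (x / (1 + expf(-alpha * x))) * (y + 1.0f), matching the step-by-step
 * comments in the HVX sequence.
 */
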
static void unary_silu_fp32_per_thread(const struct htp_tensor * src0,
                                       struct htp_tensor *       dst,
                                       const int32_t *           op_params,
                                       struct htp_spad *         src0_spad,
                                       struct htp_spad *         dst_spad,
                                       uint32_t                  nth,
                                       uint32_t                  ith,
                                       uint32_t                  src0_nrows_per_thread) {
    htp_act_preamble2;

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    int is_aligned = 1;
    int opt_path   = 0;
    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
        is_aligned = 0;
        FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;

    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
        const float * restrict src0 = (float *) (data_src0 + (ir * src0_row_size));
        float * restrict       dst  = (float *) (data_dst + (ir * dst_row_size));

        if (ir + 1 < src0_end_row) {
            htp_l2fetch(src0 + src0_row_size, 1, src0_row_size, src0_row_size);
        }

        if (1 == opt_path) {
            hvx_fast_sigmoid_f32((const uint8_t *) src0, (uint8_t *) src0_spad_data, ne0);
            hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
        } else {
            hvx_exp_f32((const uint8_t *) src0, src0_spad_data, ne0, true);
            hvx_add_scalar_f32(src0_spad_data, 1.0, dst_spad_data, ne0);
            hvx_inverse_f32(dst_spad_data, src0_spad_data, ne0);

            hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
        }
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "silu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
         ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void unary_silu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    unary_silu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
                               octx->src0_nrows_per_thread);
}

static void glu_swiglu_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    glu_swiglu_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
                               &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}

static void glu_swiglu_oai_fp32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;
    glu_swiglu_oai_fp32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad,
                                   &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread);
}

static int execute_op_activations_fp32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor *       dst  = &octx->dst;

    if (((src0->ne[0] * SIZEOF_FP32) != src0->nb[1]) || ((dst->ne[0] * SIZEOF_FP32) != dst->nb[1])) {
        FARF(ERROR, "Non-contiguous tensors are not supported at this time\n");
        return HTP_STATUS_NO_SUPPORT;
    }

    worker_callback_t act_op_func;
    const char *      op_type = NULL;

    switch (octx->op) {
        case HTP_OP_UNARY_SILU:
            act_op_func = unary_silu_fp32;
            op_type     = "silu-f32";
            break;

        case HTP_OP_GLU_SWIGLU:
            act_op_func = glu_swiglu_fp32;
            op_type     = "swiglu-f32";
            break;

        case HTP_OP_GLU_SWIGLU_OAI:
            act_op_func = glu_swiglu_oai_fp32;
            op_type     = "swiglu-oai-f32";
            break;

        default:
            FARF(ERROR, "Unsupported activations Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const uint32_t n_threads  = octx->n_threads;
    const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    const size_t src0_row_size = src0->nb[1];
    const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    // N rows per thread, padded to HVX vector size
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * octx->n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

    if (src1->ne[0]) {
        FARF(HIGH,
             "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
             octx->dst_spad.size);
    } else {
        FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
             src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
             octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
    }

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
             spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs = MIN(n_threads, src0_nrows);

        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
        worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs);
    }

    return err;
}

int op_activations(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_activations_fp32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}

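/*
 * Worked example (illustrative) for the row split used above: with
 * src0_nrows = 10 and n_threads = 4, n_jobs = MIN(4, 10) = 4 and
 * src0_nrows_per_thread = (10 + 4 - 1) / 4 = 3, so the workers cover rows
 * [0..3), [3..6), [6..9), [9..10); the MIN() in each per-thread function
 * clips the tail range.
 */
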
@ -0,0 +1,344 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif

#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

typedef void (*hvx_elemwise_f32_func)(const uint8_t * src0,
                                      const uint8_t * src1,
                                      uint8_t *       data_dst,
                                      const int       num_elems);

static hvx_elemwise_f32_func func_table_HVX[]     = { hvx_mul_f32, hvx_add_f32, hvx_sub_f32 };
static hvx_elemwise_f32_func func_table_HVX_opt[] = { hvx_mul_f32_opt, hvx_add_f32_opt, hvx_sub_f32_opt };

#define htp_binary_preamble            \
    const uint32_t ne00 = src0->ne[0]; \
    const uint32_t ne01 = src0->ne[1]; \
    const uint32_t ne02 = src0->ne[2]; \
    const uint32_t ne03 = src0->ne[3]; \
                                       \
    const uint32_t ne10 = src1->ne[0]; \
    const uint32_t ne11 = src1->ne[1]; \
    const uint32_t ne12 = src1->ne[2]; \
    const uint32_t ne13 = src1->ne[3]; \
                                       \
    const uint32_t ne0 = dst->ne[0];   \
    const uint32_t ne1 = dst->ne[1];   \
    const uint32_t ne2 = dst->ne[2];   \
    const uint32_t ne3 = dst->ne[3];   \
                                       \
    const uint32_t nb00 = src0->nb[0]; \
    const uint32_t nb01 = src0->nb[1]; \
    const uint32_t nb02 = src0->nb[2]; \
    const uint32_t nb03 = src0->nb[3]; \
                                       \
    const uint32_t nb10 = src1->nb[0]; \
    const uint32_t nb11 = src1->nb[1]; \
    const uint32_t nb12 = src1->nb[2]; \
    const uint32_t nb13 = src1->nb[3]; \
                                       \
    const uint32_t nb0 = dst->nb[0];   \
    const uint32_t nb1 = dst->nb[1];   \
    const uint32_t nb2 = dst->nb[2];   \
    const uint32_t nb3 = dst->nb[3];

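/*
 * Note (illustrative): func_table_HVX / func_table_HVX_opt above are indexed
 * directly with enum htp_op, which assumes HTP_OP_MUL, HTP_OP_ADD and
 * HTP_OP_SUB are the first three enumerators (0, 1, 2) in that order.
 */
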
static void binary_job_f32_per_thread(const struct htp_tensor * src0,
                                      const struct htp_tensor * src1,
                                      struct htp_tensor *       dst,
                                      uint8_t *                 spad_data,
                                      uint32_t                  nth,
                                      uint32_t                  ith,
                                      uint32_t                  src0_nrows_per_thread,
                                      enum htp_op               op) {
    htp_binary_preamble;

    const size_t src0_row_size = nb01;
    const size_t src1_row_size = nb11;
    const size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows
    const uint32_t src1_nrows = ne11 * ne12 * ne13;  // src1 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
        FARF(HIGH, "binary-f32: unaligned addresses in elementwise op, possibly slower execution\n");
        is_aligned = 0;
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    hvx_elemwise_f32_func func_HVX = (1 == opt_path) ? func_table_HVX_opt[op] : func_table_HVX[op];

    uint8_t * restrict spad_data_th = spad_data + (ith * src0_row_size);

    const uint32_t nr0 = ne00 / ne10;

    const uint8_t * restrict src0_ptr = (const uint8_t *) src0->data + (src0_start_row * src0_row_size);
    uint8_t * restrict       dst_ptr  = (uint8_t *) dst->data + (src0_start_row * dst_row_size);

    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    const uint8_t * restrict src1_ptr  = NULL;

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
        src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;

        if (ir + 1 < src0_end_row) {
            htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
            if (src1_row_size == src0_row_size) {
                htp_l2fetch(src1_ptr, 1, src1_row_size, src1_row_size);
            }
        }

        if (nr0 > 1) {
            if ((1 == is_aligned) && (nr0 == ne00)) {
                hvx_bcast_fp32_a(spad_data_th, *(float *) src1_ptr, nr0);
            } else {
                for (uint32_t r = 0; r < nr0; r++) {
                    memcpy(spad_data_th + r * nb11, (const uint8_t *) src1_ptr, nb11);
                }
            }
            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data_th, (uint8_t *) dst_ptr, ne00);
        } else {
            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
        }

        src0_ptr += src0_row_size;
        dst_ptr  += dst_row_size;
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "binary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path,
         ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

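/*
 * Broadcast example (illustrative): with ne00 = 4096 and ne10 = 1024,
 * nr0 = 4, so each src1 row is replicated 4 times into the per-thread
 * scratchpad before the element-wise kernel runs over the full 4096-wide
 * src0 row; with ne10 == 1 the aligned path broadcasts the scalar directly.
 */
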
static void binary_add_id_job_f32_per_thread(const struct htp_tensor * src0,
                                             const struct htp_tensor * src1,
                                             const struct htp_tensor * src2,
                                             struct htp_tensor *       dst,
                                             uint8_t *                 spad_data,
                                             uint32_t                  nth,
                                             uint32_t                  ith,
                                             uint32_t                  src0_nrows_per_thread,
                                             hvx_elemwise_f32_func     func_HVX) {
    htp_binary_preamble;

    const size_t src0_row_size = nb01;
    const size_t src1_row_size = nb11;
    const size_t dst_row_size  = nb1;

    const uint32_t ne02_ne01  = ne02 * ne01;
    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
        FARF(HIGH, "add-id-f32: unaligned addresses, possibly slower execution\n");
    }

    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
    const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;

    for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
        // src0 indices
        const uint32_t i03 = ir / ne02_ne01;
        const uint32_t i02 = (ir - i03 * ne02_ne01) / ne01;
        const uint32_t i01 = (ir - i03 * ne02_ne01 - i02 * ne01);

        // src1 indices
        const int i11 = *(int32_t *) ((char *) src2->data + i01 * src2->nb[0] + i02 * src2->nb[1]);
        assert(i11 >= 0 && i11 < ne11);

        float * restrict       dst_ptr  = (float *) (data_dst + i03 * nb3 + i02 * nb2 + i01 * nb1);
        const float * restrict src0_ptr = (const float *) (data_src0 + i03 * nb03 + i02 * nb02 + i01 * nb01);
        const float * restrict src1_ptr = (const float *) (data_src1 + 0 + 0 + i11 * nb11);

        if (ir + 1 < src0_end_row) {
            htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
            if (src1_row_size == src0_row_size) {
                htp_l2fetch(src1_ptr + ne10, 1, src1_row_size, src1_row_size);
            }
        }

        const uint32_t nr0 = ne00 / ne10;
        if (nr0 > 1) {
            for (uint32_t r = 0; r < nr0; r++) {
                memcpy(spad_data + r * nb10, (const uint8_t *) src1_ptr, nb10);
            }
            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) spad_data, (uint8_t *) dst_ptr, ne00);
        } else {
            func_HVX((const uint8_t *) src0_ptr, (const uint8_t *) src1_ptr, (uint8_t *) dst_ptr, ne00);
        }
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "add-id-f32 %d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", ith, nth,
         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1],
         src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1],
         dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

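/*
 * Index example (illustrative): for ADD_ID with ne01 = 8 and ne02 = 4,
 * ne02_ne01 = 32, so row ir = 45 decomposes into i03 = 1, i02 = 1, i01 = 5;
 * the id stored at src2[i01, i02] then selects which src1 row gets added to
 * that src0 row.
 */
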
static void binary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;

    switch (octx->op) {
        case HTP_OP_MUL:
        case HTP_OP_ADD:
        case HTP_OP_SUB:
            binary_job_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->src1_spad.data, n, i,
                                      octx->src0_nrows_per_thread, octx->op);
            break;

        case HTP_OP_ADD_ID:
            binary_add_id_job_f32_per_thread(&octx->src0, &octx->src1, &octx->src2, &octx->dst, octx->src0_spad.data, n,
                                             i, octx->src0_nrows_per_thread, hvx_add_f32);
            break;

        default:
            FARF(ERROR, "Unknown Binary Op %u", octx->op);
            break;
    }
}

static int execute_op_binary_f32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor *       dst  = &octx->dst;

    worker_callback_t binary_op_func;
    const char *      op_type = NULL;

    switch (octx->op) {
        case HTP_OP_MUL:
            binary_op_func = binary_job_dispatcher_f32;
            op_type        = "mul-f32";
            break;

        case HTP_OP_ADD:
            binary_op_func = binary_job_dispatcher_f32;
            op_type        = "add-f32";
            break;

        case HTP_OP_SUB:
            binary_op_func = binary_job_dispatcher_f32;
            op_type        = "sub-f32";
            break;

        case HTP_OP_ADD_ID:
            binary_op_func = binary_job_dispatcher_f32;
            op_type        = "add-id-f32";
            break;

        default:
            FARF(ERROR, "Unsupported binary-Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const int      n_threads  = octx->n_threads;
    const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    const size_t src0_row_size = src0->nb[1];
    const size_t src1_row_size = src1->nb[1];
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

    FARF(HIGH,
         "%s: (%ux%ux%ux%u) * (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
         op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
         src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
         octx->dst_spad.size);

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "binary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type,
             octx->ctx->vtcm_size, spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs = MIN(n_threads, src0_nrows);

        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;

        worker_pool_run_func(octx->ctx->worker_pool, binary_op_func, octx, n_jobs);
    }

    return err;
}

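/*
 * VTCM sizing example (illustrative): for f32 rows with ne[0] = 4096 the row
 * size is 16384 bytes, htp_round_up(16384, 128) = 16384, and with
 * n_threads = 4 each of the three scratchpads reserves 64 KiB, so the op
 * needs 192 KiB of the reserved VTCM in total.
 */
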
int op_binary(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_binary_f32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}

@ -0,0 +1,157 @@
if (HEXAGON_TOOLCHAIN_INCLUDED)
    return()
endif()
set(HEXAGON_TOOLCHAIN_INCLUDED true)

#Cross compiling for Hexagon
set(HEXAGON TRUE)
set(CMAKE_SYSTEM_NAME QURT)
set(CMAKE_SYSTEM_PROCESSOR Hexagon)
set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CUSTOM_RUNELF_PATH "")

#To keep backward compatibility with the EAI addon.
if (NOT HEXAGON_SDK_ROOT)
    set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
endif()

if (NOT HEXAGON_TOOLS_ROOT)
    if (DEFINED ENV{HEXAGON_TOOLS_ROOT})
        set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT})
    endif()
    if(NOT HEXAGON_TOOLS_ROOT)
        set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT})
    endif()
endif()

file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT)
file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT)

#Get the binary extension of the Hexagon toolchain
if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows)
    set(HEXAGON_TOOLCHAIN_SUFFIX .exe)
endif()
message(DEBUG "CMAKE_HOST_SYSTEM_NAME:${CMAKE_HOST_SYSTEM_NAME}")

include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake)

set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT})
set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib")
set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss)

set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES
    HEXAGON_SDK_ROOT
    HEXAGON_TOOLS_ROOT
)

#QURT related includes and linker flags
set(V_ARCH ${HEXAGON_ARCH})
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}")
set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}")

if( ${TREE} MATCHES PAKMAN )
    set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}")
endif()
message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}")
set(RTOS_DIR ${_QURT_INSTALL_DIR})
set(QCC_DIR "${HEXAGON_QCC_DIR}/${V_ARCH}/G0")
set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0")

include_directories(
    ${_QURT_INSTALL_DIR}/include
    ${_QURT_INSTALL_DIR}/include/qurt
    ${_QURT_INSTALL_DIR}/include/posix
)

set(QURT_START_LINK_LIBS)
set(QURT_START_LINK_LIBS
    "${TARGET_DIR}/init.o"
    "${RTOS_DIR}/lib/crt1.o"
    "${RTOS_DIR}/lib/debugmon.o"
    "${RTOS_DIR}/lib/libqurt.a"
    "${TARGET_DIR}/libc.a"
    "${TARGET_DIR}/libqcc.a"
    "${TARGET_DIR}/libhexagon.a"
    "${RTOS_DIR}/lib/libqurtcfs.a"
    "${RTOS_DIR}/lib/libtimer_island.a"
    "${RTOS_DIR}/lib/libtimer_main.a"
    "${RTOS_DIR}/lib/libposix.a"
)
STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}")

set(QURT_END_LINK_LIBS
    ${TARGET_DIR}/fini.o
)

#Non-QURT related includes and linker flags

set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}")

if (NOT NO_WRAP_MEM_API)
    set(WRAP_MALLOC   -Wl,--wrap=malloc)
    set(WRAP_CALLOC   -Wl,--wrap=calloc)
    set(WRAP_FREE     -Wl,--wrap=free)
    set(WRAP_REALLOC  -Wl,--wrap=realloc)
    set(WRAP_MEMALIGN -Wl,--wrap=memalign)
endif()

set(PIC_SHARED_LD_FLAGS
    -mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH}
    -G0
    -fpic
    -Wl,-Bsymbolic
    -Wl,-L${TARGET_DIR_NOOS}/G0/pic
    -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/
    -Wl,--no-threads ${WRAP_MALLOC} ${WRAP_CALLOC} ${WRAP_FREE} ${WRAP_REALLOC} ${WRAP_MEMALIGN}
    -shared
    "-o <TARGET> <SONAME_FLAG><TARGET_SONAME>"
    "<LINK_FLAGS>"
    -Wl,--start-group
    "<OBJECTS>"
    "<LINK_LIBRARIES>"
    -Wl,--end-group
    -lc
)
STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}")

set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}")

#System include paths
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef)
include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs)

#LLVM toolchain setup
#Compiler paths, options and architecture
set(CMAKE_C_COMPILER   ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_AR           ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX})
set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX})
set(HEXAGON_LINKER     ${CMAKE_C_COMPILER})
set(CMAKE_PREFIX_PATH  ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon)

set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG   "-Wl,-soname,")
set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,")

#Compiler options
set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}")

set(CMAKE_CXX_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_CXX_FLAGS_RELEASE        "${COMMON_FLAGS} -O3")

set(CMAKE_C_FLAGS_DEBUG          "${COMMON_FLAGS} -O0 -D_DEBUG -g")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g")
set(CMAKE_C_FLAGS_RELEASE        "${COMMON_FLAGS} -O3")

set(CMAKE_ASM_FLAGS_DEBUG          "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_ASM_FLAGS_RELEASE        "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_ASM_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")

#Linker options
set(CMAKE_C_CREATE_SHARED_LIBRARY   "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")
set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}")

@ -0,0 +1,40 @@
#ifndef HTP_CTX_H
#define HTP_CTX_H

#include "htp-dma.h"
#include "worker-pool.h"

#include <assert.h>
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>

#define HTP_MAX_NTHREADS 10

// FIXME: move these into matmul-ops
#define HTP_SPAD_SRC0_NROWS 16
#define HTP_SPAD_SRC1_NROWS 16
#define HTP_SPAD_DST_NROWS  2

// Main context for the htp DSP backend
struct htp_context {
    dspqueue_t            queue;
    dma_queue *           dma[HTP_MAX_NTHREADS];
    worker_pool_context_t worker_pool;
    uint32_t              n_threads;

    int thread_id;
    int thread_prio;

    uint8_t * vtcm_base;
    size_t    vtcm_size;
    uint32_t  vtcm_rctx;

    atomic_bool vtcm_valid;
    atomic_bool vtcm_inuse;
    atomic_bool vtcm_needs_release;

    uint32_t opmask;
};

#endif /* HTP_CTX_H */

@ -0,0 +1,69 @@
#include "htp-dma.h"

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#pragma clang diagnostic ignored "-Wunused-function"

static inline uint32_t pow2_ceil(uint32_t x) {
    if (x <= 1) {
        return 1;
    }
    int p = 2;
    x--;
    while (x >>= 1) {
        p <<= 1;
    }
    return p;
}

dma_queue * dma_queue_create(size_t capacity) {
    dma_queue * q = (dma_queue *) memalign(32, sizeof(dma_queue));
    if (q == NULL) {
        FARF(ERROR, "%s: failed to allocate DMA queue\n", __FUNCTION__);
        return NULL;
    }

    capacity = pow2_ceil(capacity);

    memset(q, 0, sizeof(dma_queue));
    q->capacity = capacity;
    q->idx_mask = capacity - 1;

    q->desc = (hexagon_udma_descriptor_type1_t *) memalign(64, capacity * sizeof(hexagon_udma_descriptor_type1_t));
    q->dst  = (void **) memalign(4, capacity * sizeof(void *));

    // fail if either allocation failed (checked before touching the buffers)
    if (!q->desc || !q->dst) {
        FARF(ERROR, "%s: failed to allocate DMA queue items\n", __FUNCTION__);
        return NULL;
    }

    memset(q->desc, 0, capacity * sizeof(hexagon_udma_descriptor_type1_t));
    memset(q->dst, 0, capacity * sizeof(void *));

    q->tail = &q->desc[capacity - 1];

    FARF(HIGH, "dma-queue: capacity %u\n", capacity);

    return q;
}

void dma_queue_delete(dma_queue * q) {
    if (!q) {
        return;
    }
    free(q->desc);
    free(q->dst);
    free(q);
}

void dma_queue_flush(dma_queue * q) {
    while (1) {
        uint32_t s = dmwait() & 0x3;
        if (s == HEXAGON_UDMA_DM0_STATUS_IDLE) {
            break;
        }
    }
    q->tail = NULL;
}

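/*
 * Sizing note (illustrative): dma_queue_create rounds the capacity up to a
 * power of two, e.g. pow2_ceil(5) = 8, so idx_mask = 7 and the ring indices
 * in push/pop wrap with a single AND instead of a modulo.
 */
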
@ -0,0 +1,119 @@
#ifndef HTP_DMA_H
#define HTP_DMA_H

#include <HAP_farf.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef struct {
    hexagon_udma_descriptor_type1_t * desc;  // descriptor pointers
    hexagon_udma_descriptor_type1_t * tail;  // tail pointer
    void **  dst;                            // dst pointers
    uint32_t push_idx;
    uint32_t pop_idx;
    uint32_t capacity;
    uint32_t idx_mask;
} dma_queue;

dma_queue * dma_queue_create(size_t capacity);
void        dma_queue_delete(dma_queue * q);
void        dma_queue_flush(dma_queue * q);

// TODO: technically we don't need these and could use Q6_dmstart/wait/etc instead
// but those do not seem to always compile properly.
static inline void dmstart(void * next) {
    asm volatile(" release(%0):at" : : "r"(next));
    asm volatile(" dmstart(%0)" : : "r"(next));
}

static inline void dmlink(void * cur, void * next) {
    asm volatile(" release(%0):at" : : "r"(next));
    asm volatile(" dmlink(%0, %1)" : : "r"(cur), "r"(next));
}

static inline unsigned int dmpoll(void) {
    unsigned int ret = 0;
    asm volatile(" %0 = dmpoll" : "=r"(ret) : : "memory");
    return ret;
}

static inline unsigned int dmwait(void) {
    unsigned int ret = 0;
    asm volatile(" %0 = dmwait" : "=r"(ret) : : "memory");
    return ret;
}

static inline bool dma_queue_push(dma_queue * q,
                                  void *       dst,
                                  const void * src,
                                  size_t       dst_row_size,
                                  size_t       src_row_size,
                                  size_t       nrows) {
    if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
        return false;
    }

    hexagon_udma_descriptor_type1_t * desc = &q->desc[q->push_idx];

    desc->next           = NULL;
    desc->length         = 0;
    desc->desctype       = HEXAGON_UDMA_DESC_DESCTYPE_TYPE1;
    desc->dstbypass      = 1;
    desc->srcbypass      = 1;
    desc->order          = 0;
    desc->dstate         = HEXAGON_UDMA_DESC_DSTATE_INCOMPLETE;
    desc->src            = (void *) src;
    desc->dst            = (void *) dst;
    desc->allocation     = 0;
    desc->padding        = 0;
    desc->roiwidth       = src_row_size;
    desc->roiheight      = nrows;
    desc->srcstride      = src_row_size;
    desc->dststride      = dst_row_size;
    desc->srcwidthoffset = 0;
    desc->dstwidthoffset = 0;

    q->dst[q->push_idx] = dst;

    dmlink(q->tail, desc);
    q->tail = desc;

    // FARF(ERROR, "dma-push: i %u len %u dst %p src %p\n", q->push_idx, len, dst, src);
    q->push_idx = (q->push_idx + 1) & q->idx_mask;
    return true;
}

static inline uint8_t * dma_queue_pop(dma_queue * q) {
    if (q->push_idx == q->pop_idx) {
        return NULL;
    }

    hexagon_udma_descriptor_type1_t * desc = &q->desc[q->pop_idx];

    // Wait for desc to complete
    while (1) {
        dmpoll();
        if (desc->dstate == HEXAGON_UDMA_DESC_DSTATE_COMPLETE) {
            break;
        }
        // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
    }

    uint8_t * dst = (uint8_t *) q->dst[q->pop_idx];

    // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst);
    q->pop_idx = (q->pop_idx + 1) & q->idx_mask;
    return dst;
}

#ifdef __cplusplus
}  // extern "C"
#endif

#endif /* HTP_DMA_H */
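
// A minimal usage sketch of the dma_queue API above (editor's illustration;
// the buffer names and transfer sizes are assumptions, not part of the patch):
//
//   dma_queue * q = dma_queue_create(16);  // capacity is rounded up to a power of two
//   for (uint32_t r = 0; r < nrows; r += rows_per_xfer) {
//       while (!dma_queue_push(q, vtcm_dst + r * dst_row_size, ddr_src + r * src_row_size,
//                              dst_row_size, src_row_size, rows_per_xfer)) {
//           dma_queue_pop(q);  // queue full: retire one finished transfer
//       }
//   }
//   while (dma_queue_pop(q)) { }  // drain the remaining transfers
//   dma_queue_flush(q);
//   dma_queue_delete(q);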
@ -0,0 +1,156 @@
#ifndef HTP_MSG_H
#define HTP_MSG_H

#include <assert.h>

// ggml-common.h must be included prior to this header

// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum {
    HTP_OPMASK_QUEUE    = (1 << 0),  // Enable Queueing (i.e. calls into the DSP)
    HTP_OPMASK_QUANTIZE = (1 << 1),  // Enable Quantize
    HTP_OPMASK_COMPUTE  = (1 << 2),  // Enable Compute
};

// Op flags
enum {
    HTP_OPFLAGS_SKIP_QUANTIZE = (1 << 0),  // Skip dynamic quantization (reuse quantized tensors)
    HTP_OPFLAGS_SKIP_COMPUTE  = (1 << 1),  // Skip actual computation (used for profiling)
    HTP_OPFLAGS_EARLY_WAKEUP  = (1 << 2)   // Send early wakeup notification
};

enum htp_status {
    HTP_STATUS_OK             = 1,
    HTP_STATUS_INTERNAL_ERR   = 2,
    HTP_STATUS_NO_SUPPORT     = 3,
    HTP_STATUS_INVAL_PARAMS   = 4,
    HTP_STATUS_VTCM_TOO_SMALL = 5,
};

// The values must match the ggml_type.
// Duplicated here because we can't include full ggml.h in the htp build.
// We have some static_asserts in the cpp code to ensure things are in sync.
enum htp_data_type {
    HTP_TYPE_F32   = 0,
    HTP_TYPE_F16   = 1,
    HTP_TYPE_Q4_0  = 2,
    HTP_TYPE_Q8_0  = 8,
    HTP_TYPE_MXFP4 = 39,
    HTP_TYPE_COUNT
};
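
// The "static_asserts in the cpp code" mentioned above presumably look like
// this sketch (exact form and location in ggml-hexagon.cpp may differ):
//
//   static_assert((int) HTP_TYPE_F32   == (int) GGML_TYPE_F32,   "htp/ggml type mismatch");
//   static_assert((int) HTP_TYPE_Q4_0  == (int) GGML_TYPE_Q4_0,  "htp/ggml type mismatch");
//   static_assert((int) HTP_TYPE_Q8_0  == (int) GGML_TYPE_Q8_0,  "htp/ggml type mismatch");
//   static_assert((int) HTP_TYPE_MXFP4 == (int) GGML_TYPE_MXFP4, "htp/ggml type mismatch");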

// These values are manually translated over to HTP
// !!!! DO NOT ALTER THE ORDER OF THE FIRST FOUR ENUMS !!!!
enum htp_op {
    HTP_OP_MUL            = 0,
    HTP_OP_ADD            = 1,
    HTP_OP_SUB            = 2,
    HTP_OP_DIV            = 3,
    HTP_OP_MUL_MAT        = 4,
    HTP_OP_MUL_MAT_ID     = 5,
    HTP_OP_RMS_NORM       = 6,
    HTP_OP_UNARY_SILU     = 7,
    HTP_OP_GLU_SWIGLU     = 8,
    HTP_OP_GLU_SWIGLU_OAI = 9,
    HTP_OP_SOFTMAX        = 10,
    HTP_OP_ADD_ID         = 11,
    HTP_OP_ROPE           = 12,
    INVALID
};

static inline size_t htp_type_block_size(uint32_t t) {
    switch (t) {
        case HTP_TYPE_F32:
            return 1;
        case HTP_TYPE_F16:
            return 1;
        case HTP_TYPE_Q4_0:
            return QK4_0;
        case HTP_TYPE_Q8_0:
            return QK8_0;
        case HTP_TYPE_MXFP4:
            return QK_MXFP4;
        default:
            assert(0 && "unsupported HTP data type");
    }
    return 0;
}

static inline size_t htp_type_nbytes(uint32_t t) {
    switch (t) {
        case HTP_TYPE_F32:
            return 4;
        case HTP_TYPE_F16:
            return 2;
        case HTP_TYPE_Q4_0:
            return sizeof(block_q4_0);
        case HTP_TYPE_Q8_0:
            return sizeof(block_q8_0);
        case HTP_TYPE_MXFP4:
            return sizeof(block_mxfp4);
        default:
            assert(0 && "unsupported HTP data type");
    }
    return 0;
}
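
// Editor's sketch: the two helpers above combine into the usual row-size
// computation (htp_row_size is a hypothetical name, not used elsewhere in the
// patch). For example, a Q4_0 row with ne0 = 4096 takes
// 4096 / QK4_0 * sizeof(block_q4_0) = 128 * 18 = 2304 bytes.
static inline size_t htp_row_size(uint32_t t, uint32_t ne0) {
    return (size_t) ne0 / htp_type_block_size(t) * htp_type_nbytes(t);
}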

static const char * htp_type_name(uint32_t t) {
    switch (t) {
        case HTP_TYPE_F32:
            return "fp32";
        case HTP_TYPE_F16:
            return "fp16";
        case HTP_TYPE_Q4_0:
            return "q4_0";
        case HTP_TYPE_Q8_0:
            return "q8_0";
        case HTP_TYPE_MXFP4:
            return "mxfp4";
    }
    return NULL;
}

// Internal types
#define QK_Q4_0x4x2  256  // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2  256  // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
#define QK_MXFP4x4x2 256  // 4x MXFP4 blocks concat with next 4x MXFP4 blocks

#define HTP_MAX_DIMS 4

struct htp_tensor {
    uint32_t data;              // Buffer offset in the messages, and data pointer on the NSP
    uint32_t type;              // Data type
    uint32_t ne[HTP_MAX_DIMS];  // Number of elements
    uint32_t nb[HTP_MAX_DIMS];  // Stride in bytes (see ggml.h ggml_tensor)
};

#define HTP_MAX_OP_PARAMS 64

struct htp_general_req {
    uint32_t op;                                              // GGML/HTP Op
    int32_t  op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];  // Params for the op, e.g. epsilon of RMS norm
    uint32_t flags;                                           // Request flags

    struct htp_tensor src0;  // Input0 tensor
    struct htp_tensor src1;  // Input1 tensor
    struct htp_tensor src2;  // Input2 tensor
    struct htp_tensor dst;   // Output tensor

    // should be multiple of 64 bytes (cacheline)
};

struct htp_general_rsp {
    uint32_t op;           // GGML/HTP Op
    uint32_t status;       // HTP_STATUS_...
    uint32_t prof_usecs;   // Number of usec per request
    uint32_t prof_cycles;  // Number of cycles per request
    uint32_t prof_pkts;    // Number of instruction packets per request
    uint8_t  unused[44];   // Pad to 64 bytes
};

#define HTP_MAX_MESSAGE_SIZE   sizeof(struct htp_general_req)
#define HTP_MAX_PACKET_BUFFERS 4

#endif /* HTP_MSG_H */
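
// The response above is padded to exactly one 64-byte cacheline
// (5 * 4 + 44 = 64); a compile-time check one could add (editor's sketch):
//
//   static_assert(sizeof(struct htp_general_rsp) == 64, "htp_general_rsp must be one cacheline");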
@ -0,0 +1,53 @@
#ifndef HTP_OPS_H
#define HTP_OPS_H

#include "htp-ctx.h"
#include "htp-msg.h"
#include "worker-pool.h"

#include <assert.h>
#include <stdint.h>

// ggml-common.h must be included prior to this header

struct htp_spad {
    uint8_t * data;
    size_t    size;
    size_t    size_per_thread;
};

struct htp_ops_context {
    struct htp_context * ctx;

    enum htp_op op;
    int32_t     op_params[HTP_MAX_OP_PARAMS / sizeof(int32_t)];

    struct htp_tensor src0;
    struct htp_tensor src1;
    struct htp_tensor src2;
    struct htp_tensor dst;

    struct htp_spad src0_spad;
    struct htp_spad src1_spad;
    struct htp_spad src2_spad;
    struct htp_spad dst_spad;

    worker_pool_context_t * wpool;      // worker pool
    uint32_t                n_threads;  // num threads

    uint32_t src0_nrows_per_thread;
    uint32_t src1_nrows_per_thread;

    uint32_t flags;
};

int op_matmul(struct htp_ops_context * octx);
int op_matmul_id(struct htp_ops_context * octx);
int op_binary(struct htp_ops_context * octx);
int op_unary(struct htp_ops_context * octx);
int op_activations(struct htp_ops_context * octx);
int op_softmax(struct htp_ops_context * octx);
int op_add_id(struct htp_ops_context * octx);
int op_rope(struct htp_ops_context * octx);

#endif /* HTP_OPS_H */
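
// The *_nrows_per_thread fields split rows evenly across the worker pool;
// presumably derived with a ceiling division along these lines (assumption):
//
//   octx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads;
//   // worker i then handles rows [i * per_thread, min((i + 1) * per_thread, src0_nrows))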
@ -0,0 +1,16 @@
// FastRPC IDL interface for GGML HTP

#ifndef HTP_IDL
#define HTP_IDL

#include "AEEStdDef.idl"
#include "remote.idl"

interface htp_iface : remote_handle64 {
    AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx);
    AEEResult stop();
    AEEResult enable_etm();
    AEEResult disable_etm();
};

#endif /* HTP_IDL */
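
// The QAIC IDL compiler generates C stubs for this interface; host-side usage
// presumably follows the usual <interface>_<method> naming (editor's sketch
// under that assumption; the URI string is illustrative):
//
//   remote_handle64 h = 0;
//   htp_iface_open(htp_iface_URI "&_dom=cdsp", &h);
//   htp_iface_start(h, sess_id, dsp_queue_id, n_hvx);
//   ...
//   htp_iface_stop(h);
//   htp_iface_close(h);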
@ -0,0 +1,80 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_exp_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }
    // assert((0 == unaligned_addr) || (0 == num_elems_whole));
    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_exp_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector vec_out = Q6_V_vzero();

    if (0 == unaligned_loop) {
        HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
        HVX_Vector * p_vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            if (true == negate) {
                HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
                *p_vec_out++          = hvx_vec_exp_fp32(neg_vec_in);
            } else {
                *p_vec_out++ = hvx_vec_exp_fp32(*p_vec_in1++);
            }
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            if (true == negate) {
                HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(neg_vec_in);
            } else {
                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32(in);
            }
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        if (true == negate) {
            HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);

            vec_out = hvx_vec_exp_fp32(neg_vec_in);
        } else {
            vec_out = hvx_vec_exp_fp32(in);
        }

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
    }
}
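
// Worked example of the whole/leftover split used above: VLEN_FP32 is 32 fp32
// lanes per 128-byte vector, so num_elems = 100 gives left_over = 100 & 31 = 4
// and num_elems_whole = 96 (three full vectors); the last 4 elements are
// handled with one unaligned load and a masked 16-byte hvx_vec_store_u.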
@ -0,0 +1,60 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_inverse_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }
    // assert((0 == unaligned_addr) || (0 == num_elems_whole));
    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * p_vec_in  = (HVX_Vector *) src;
        HVX_Vector * p_vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            *p_vec_out++ = hvx_vec_inverse_fp32(*p_vec_in++);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32(in);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in  = *(HVX_UVector *) srcf;
        HVX_Vector out = hvx_vec_inverse_fp32(in);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
    }
}
@ -0,0 +1,49 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#if 0
// Reference algo used in hvx-utils
static void fast_sigmoid_f32(const float * restrict src, float * restrict dst, const int num_elems)
{
    const float c1      = 0.03138777;
    const float c2      = 0.276281267;
    const float c_log2f = 1.442695022;

    int32_t store_ints[32];
    float   store_floats[3][32];

    for (int i = 0; i < num_elems; i++)
    {
        float v = src[i];

        v *= c_log2f * 0.5;
        int   intPart = (int) v;
        float x       = (v - intPart);
        float xx      = x * x;
        float v1      = c_log2f + c2 * xx;
        float v2      = x + xx * c1 * x;
        float v3      = (v2 + v1);
        *((int *) &v3) += intPart << 24;
        float v4  = v2 - v1;
        float v5  = v3 - v4;
        float res = v3 / v5;

        dst[i] = res;
    }
}
#endif
@ -0,0 +1,947 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif

#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "hvx-utils.h"

#define htp_binary_ops_preamble \
    int step_of_4 = num_elems >> 7; \
    int step_of_2 = (num_elems - step_of_4 * VLEN_FP32 * 4) >> 6; \
    int step_of_1 = (num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2) >> 5; \
    int remaining = num_elems - step_of_4 * VLEN_FP32 * 4 - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32; \
    \
    const uint8_t * restrict src0_curr = src0; \
    const uint8_t * restrict src1_curr = src1; \
    uint8_t * restrict dst_curr = dst;
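
// Worked example for the preamble: num_elems = 300 with VLEN_FP32 = 32 gives
//   step_of_4 = 300 >> 7 = 2   -> 256 elements in 4-vector iterations
//   step_of_2 =  44 >> 6 = 0
//   step_of_1 =  44 >> 5 = 1   ->  32 elements in a 1-vector iteration
//   remaining = 12             ->  finished with one masked store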

void hvx_mul_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_mul_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_mul_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
            HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;

        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in1, in2);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_mul_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }

    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

void hvx_mul_mul_f32_opt(const uint8_t * restrict src0,
                         const uint8_t * restrict src1,
                         const uint8_t * restrict src2,
                         uint8_t * restrict dst,
                         const int num_elems) {
    const uint8_t * restrict src0_curr = src0;
    const uint8_t * restrict src1_curr = src1;
    const uint8_t * restrict src2_curr = src2;
    uint8_t * restrict       dst_curr  = dst;

    int step_of_2 = num_elems >> 6;
    int step_of_1 = (num_elems - step_of_2 * VLEN_FP32 * 2) >> 5;
    int remaining = num_elems - step_of_2 * VLEN_FP32 * 2 - step_of_1 * VLEN_FP32;

    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v1c = *(HVX_Vector *) src2_curr;

        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);

        HVX_Vector v1_ = Q6_Vqf32_vmpy_VsfVsf(v1a, v1b);
        HVX_Vector v1  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1_), v1c);

        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v2c = *(HVX_Vector *) (src2_curr + VLEN);

        src0_curr += 2 * VLEN;

        HVX_Vector v2_ = Q6_Vqf32_vmpy_VsfVsf(v2a, v2b);
        HVX_Vector v2  = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2_), v2c);

        src1_curr += 2 * VLEN;
        src2_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector vc = *(HVX_Vector *) src2_curr;
        src2_curr += VLEN;

        HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(va, vb);
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), vc);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v2);
        dst_curr += VLEN;
    }
    if (remaining > 0) {
        HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v1), *(HVX_Vector *) src2_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v2));
    }
}
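
// The _opt variants use aligned HVX_Vector accesses throughout, so they assume
// all pointers are 128-byte aligned; a caller sketch (editor's assumption
// about how the dispatch looks):
//
//   if (htp_is_aligned((void *) src0, VLEN) && htp_is_aligned((void *) src1, VLEN) &&
//       htp_is_aligned((void *) dst, VLEN)) {
//       hvx_mul_f32_opt(src0, src1, dst, num_elems);
//   } else {
//       hvx_mul_f32(src0, src1, dst, num_elems);  // handles unaligned pointers itself
//   }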

void hvx_add_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_add_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_add_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
            HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;

        HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in1, in2);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_add_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vadd_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vadd_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }
    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vadd_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vadd_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }
    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_add_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_add_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vadd_VsfVsf(*vec_in1++, val_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_Vector out = Q6_Vqf32_vadd_VsfVsf(in, val_vec);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_mul_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_mul_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, val_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, val_vec);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_sub_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src0, VLEN)) || (0 == htp_is_aligned((void *) src1, VLEN)) ||
        (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_sub_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_sub_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src0;
        HVX_Vector * restrict vec_in2 = (HVX_Vector *) src1;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, *vec_in2++);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in1 = *(HVX_UVector *) (src0 + i * SIZEOF_FP32);
            HVX_Vector in2 = *(HVX_UVector *) (src1 + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * src0f = (const float *) src0 + num_elems_whole;
        const float * src1f = (const float *) src1 + num_elems_whole;
        float *       dstf  = (float *) dst + num_elems_whole;

        HVX_Vector in1 = *(HVX_UVector *) src0f;
        HVX_Vector in2 = *(HVX_UVector *) src1f;

        HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in1, in2);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

void hvx_sub_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems) {
    htp_binary_ops_preamble;

    for (int i = 0; i < step_of_4; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);
        HVX_Vector v3a = *(HVX_Vector *) (src0_curr + 2 * VLEN);
        HVX_Vector v2  = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        HVX_Vector v3b = *(HVX_Vector *) (src1_curr + 2 * VLEN);
        HVX_Vector v4a = *(HVX_Vector *) (src0_curr + 3 * VLEN);

        src0_curr += 4 * VLEN;

        HVX_Vector v3 = Q6_Vqf32_vsub_VsfVsf(v3a, v3b);

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        HVX_Vector v4b = *(HVX_Vector *) (src1_curr + 3 * VLEN);

        *(HVX_Vector *) (dst_curr + 2 * VLEN) = Q6_Vsf_equals_Vqf32(v3);

        HVX_Vector v4 = Q6_Vqf32_vsub_VsfVsf(v4a, v4b);

        src1_curr += 4 * VLEN;

        *(HVX_Vector *) (dst_curr + 3 * VLEN) = Q6_Vsf_equals_Vqf32(v4);

        dst_curr += 4 * VLEN;
    }
    for (int i = 0; i < step_of_2; i++) {
        HVX_Vector v1a = *(HVX_Vector *) src0_curr;
        HVX_Vector v1b = *(HVX_Vector *) src1_curr;
        HVX_Vector v2a = *(HVX_Vector *) (src0_curr + VLEN);
        HVX_Vector v1  = Q6_Vqf32_vsub_VsfVsf(v1a, v1b);
        HVX_Vector v2b = *(HVX_Vector *) (src1_curr + VLEN);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v1);

        src0_curr += 2 * VLEN;

        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v2a, v2b);

        src1_curr += 2 * VLEN;

        *(HVX_Vector *) (dst_curr + VLEN) = Q6_Vsf_equals_Vqf32(v2);

        dst_curr += 2 * VLEN;
    }
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector va = *(HVX_Vector *) src0_curr;
        src0_curr += VLEN;

        HVX_Vector vb = *(HVX_Vector *) src1_curr;
        src1_curr += VLEN;

        HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(va, vb);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v);
        dst_curr += VLEN;
    }
    if (remaining > 0) {
        HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*(HVX_Vector *) src0_curr, *(HVX_Vector *) src1_curr);
        hvx_vec_store_u((void *) dst_curr, remaining * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(v));
    }
}

void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_sub_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_sub_scalar_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;
        HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vsub_VsfVsf(*vec_in1++, val_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_Vector out = Q6_Vqf32_vsub_VsfVsf(in, val_vec);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_sum_of_squares_f32: unaligned address in hvx op, possibly slower execution\n");
    }

    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));

    HVX_Vector * restrict vec_in1 = (HVX_Vector *) src;

    HVX_Vector sum_vec_acc = Q6_V_vsplat_R(0x00000000);
    HVX_Vector zero_vec    = Q6_V_vsplat_R(0x00000000);

#pragma unroll(4)
    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
        HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1, *vec_in1);
        sum_vec_acc  = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, v);
        vec_in1++;
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector vec_left = *(HVX_UVector *) srcf;

        HVX_Vector vec_left_sq = Q6_Vqf32_vmpy_VsfVsf(vec_left, vec_left);
        HVX_Vector vec_tmp     = Q6_V_valign_VVR(vec_left_sq, zero_vec, left_over * SIZEOF_FP32);

        sum_vec_acc = Q6_Vqf32_vadd_Vqf32Vqf32(sum_vec_acc, vec_tmp);
    }

    HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec_acc);
    return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}

float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_self_sum_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_self_sum_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector sum_vec  = Q6_V_vsplat_R(0x00000000);
    HVX_Vector zero_vec = Q6_V_vsplat_R(0x00000000);

    if (0 == unaligned_loop) {
        HVX_Vector * vec_in = (HVX_Vector *) src;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, *vec_in++);
            sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), *vec_in++);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), in);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector vec_left = *(HVX_UVector *) srcf;
        HVX_Vector vec_tmp  = Q6_V_valign_VVR(vec_left, zero_vec, left_over * SIZEOF_FP32);
        // sum_vec = Q6_Vqf32_vadd_Vqf32Vsf(sum_vec, vec_tmp);
        sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), vec_tmp);
    }

    HVX_Vector v = hvx_vec_qf32_reduce_sum(sum_vec);
    return hvx_vec_get_fp32(Q6_Vsf_equals_Vqf32(v));
}
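
// Scalar reference for the two reductions above (editor's sketch for clarity):
//
//   float sum = 0.0f, sumsq = 0.0f;
//   for (int i = 0; i < num_elems; i++) {
//       float x = ((const float *) src)[i];
//       sum   += x;
//       sumsq += x * x;
//   }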

void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_scale_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_scale_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);

    if (0 == unaligned_loop) {
        HVX_Vector * vec_in1 = (HVX_Vector *) src;
        HVX_Vector * vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(*vec_in1++, scale_vec);
            *vec_out++   = Q6_Vsf_equals_Vqf32(v);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);

            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = Q6_Vsf_equals_Vqf32(out);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_Vector out = Q6_Vqf32_vmpy_VsfVsf(in, scale_vec);
        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_equals_Vqf32(out));
    }
}

float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems) {
    int left_over       = num_elems & (VLEN_FP32 - 1);
    int num_elems_whole = num_elems - left_over;

    int unaligned_addr = 0;
    int unaligned_loop = 0;
    if (0 == htp_is_aligned((void *) src, VLEN)) {
        FARF(HIGH, "hvx_self_max_f32: unaligned address in hvx op, possibly slower execution\n");
        unaligned_addr = 1;
    }

    if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
        unaligned_loop = 1;
        FARF(HIGH, "hvx_self_max_f32: unaligned loop in hvx op, possibly slower execution\n");
    }

    HVX_Vector vec_max   = hvx_vec_splat_fp32(((const float *) src)[0]);
    HVX_Vector vec_first = hvx_vec_splat_fp32(((const float *) src)[0]);

    if (0 == unaligned_loop) {
        HVX_Vector * restrict vec_in = (HVX_Vector *) src;

#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, *vec_in++);
        }
    } else {
#pragma unroll(4)
        for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
            HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);

            vec_max = Q6_Vsf_vmax_VsfVsf(vec_max, in);
        }
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_Vector temp = Q6_V_valign_VVR(in, vec_first, left_over * SIZEOF_FP32);
        vec_max         = Q6_Vsf_vmax_VsfVsf(vec_max, temp);
    }

    HVX_Vector v = hvx_vec_reduce_max_fp32(vec_max);
    return hvx_vec_get_fp32(v);
}
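
// Note the leftover handling above: Q6_V_valign_VVR fills the lanes past
// left_over from vec_first (a splat of src[0]), so the padding can never win
// the max; the sum reductions use the same trick with a zero vector instead.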

void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_min_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
    }

    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));

    // splat the float bit pattern (a plain Q6_V_vsplat_R on a float would convert the value to int)
    HVX_Vector val_vec = hvx_vec_splat_fp32(val);

    HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
    HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

#pragma unroll(4)
    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
        // per-element min against the scalar; vmax/vmin already produce IEEE fp32
        *vec_out++ = Q6_Vsf_vmin_VsfVsf(val_vec, *vec_in++);
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, Q6_Vsf_vmin_VsfVsf(val_vec, in));
    }
}

void hvx_clamp_scalar_f32(const uint8_t * restrict src,
                          const float limit_left,
                          const float limit_right,
                          uint8_t * restrict dst,
                          const int num_elems) {
    size_t left_over       = num_elems & (VLEN_FP32 - 1);
    size_t num_elems_whole = num_elems - left_over;

    if ((0 == htp_is_aligned((void *) src, VLEN)) || (0 == htp_is_aligned((void *) dst, VLEN))) {
        FARF(HIGH, "hvx_clamp_scalar_f32: unaligned address in hvx op, possibly slower execution\n");
    }

    assert((1 == htp_is_aligned((void *) src, VLEN)) || (0 == num_elems_whole));

    HVX_Vector * restrict vec_in  = (HVX_Vector *) src;
    HVX_Vector * restrict vec_out = (HVX_Vector *) dst;

    HVX_Vector range_left  = hvx_vec_splat_fp32(limit_left);
    HVX_Vector range_right = hvx_vec_splat_fp32(limit_right);

#pragma unroll(4)
    for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
        HVX_Vector in_vec = *vec_in++;

        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, range_right);
        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in_vec);

        // apply the two clamps in sequence so the second doesn't discard the first
        in_vec = Q6_V_vmux_QVV(pred_cap_right, range_right, in_vec);
        in_vec = Q6_V_vmux_QVV(pred_cap_left, range_left, in_vec);

        *vec_out++ = in_vec;  // already in IEEE fp32 format
    }

    if (left_over > 0) {
        const float * srcf = (const float *) src + num_elems_whole;
        float *       dstf = (float *) dst + num_elems_whole;

        HVX_Vector in = *(HVX_UVector *) srcf;

        HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in, range_right);
        HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(range_left, in);

        in = Q6_V_vmux_QVV(pred_cap_right, range_right, in);
        in = Q6_V_vmux_QVV(pred_cap_left, range_left, in);

        hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, in);
    }
}
@ -0,0 +1,998 @@
#ifndef HVX_UTILS_H
#define HVX_UTILS_H

#include "ops-utils.h"

#include <stdbool.h>
#include <stdint.h>

#define SIZEOF_FP32 (4)
#define SIZEOF_FP16 (2)
#define VLEN        (128)
#define VLEN_FP32   (VLEN / SIZEOF_FP32)
#define VLEN_FP16   (VLEN / SIZEOF_FP16)

static inline HVX_Vector hvx_vec_splat_fp32(float i) {
    union {
        float   f;
        int32_t i;
    } fp32 = { .f = i };

    return Q6_V_vsplat_R(fp32.i);
}

static inline void hvx_vec_store_u(void * addr, uint32_t n, HVX_Vector v) {
    // Rotate as needed.
    v = Q6_V_vlalign_VVR(v, v, (size_t) addr);

    uint32_t left_off  = (size_t) addr & 127;
    uint32_t right_off = left_off + n;

    HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) addr);
    HVX_VectorPred qr     = Q6_Q_vsetq2_R(right_off);

    if (right_off > 128) {
        Q6_vmem_QRIV(qr, (HVX_Vector *) addr + 1, v);
        // all 1's
        qr = Q6_Q_vcmp_eq_VbVb(v, v);
    }

    ql_not = Q6_Q_or_QQn(ql_not, qr);
    Q6_vmem_QnRIV(ql_not, (HVX_Vector *) addr, v);
}
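
// Example: hvx_vec_store_u with n = 12 at an address whose offset within its
// 128-byte line is 120 gives right_off = 132 > 128, so 4 bytes spill into the
// next line via the first masked store, and the final store covers bytes
// 120..127 of the original line.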
|
||||||
|
static inline void hvx_vec_store_a(void * ptr, size_t n, HVX_Vector v) {
|
||||||
|
assert((unsigned long) ptr % 128 == 0);
|
||||||
|
|
||||||
|
HVX_VectorPred ql_not = Q6_Q_vsetq_R((size_t) ptr);
|
||||||
|
HVX_VectorPred qr = Q6_Q_vsetq2_R(n);
|
||||||
|
ql_not = Q6_Q_or_QQn(ql_not, qr);
|
||||||
|
Q6_vmem_QnRIV(ql_not, (HVX_Vector *) ptr, v);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline HVX_Vector hvx_vec_repl4(HVX_Vector v) {
|
||||||
|
// vdelta control to replicate first 4 bytes across all elements
|
||||||
|
static const uint8_t __attribute__((aligned(128))) repl[128] = {
|
||||||
|
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04,
|
||||||
|
};
|
||||||
|
|
||||||
|
HVX_Vector ctrl = *(HVX_Vector *) repl;
|
||||||
|
return Q6_V_vdelta_VV(v, ctrl);
|
||||||
|
}
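
// Scalar sketch of what the vdelta control above computes (illustrative
// only, not used by the backend): every byte of the 128-byte result is a
// copy of one of the first four bytes of the input, i.e. dst[i] = src[i % 4].
static inline void hvx_vec_repl4_ref(uint8_t * dst, const uint8_t * src) {
    for (int i = 0; i < 128; i++) {
        dst[i] = src[i % 4];
    }
}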

// copy n fp16 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_fp16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_Vector * restrict vdst = (HVX_Vector *) dst;
    HVX_Vector * restrict vsrc = (HVX_Vector *) src;

    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);

    uint32_t nvec = n / 64;
    uint32_t nloe = n % 64;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
    }
}

// copy n fp16 elements : source is aligned, destination is potentially unaligned
static inline void hvx_copy_fp16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_UVector * restrict vdst = (HVX_UVector *) dst;
    HVX_Vector * restrict  vsrc = (HVX_Vector *) src;

    assert((unsigned long) src % 128 == 0);

    uint32_t nvec = n / 64;
    uint32_t nloe = n % 64;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
    }
}

// copy n fp16 elements : source is potentially unaligned, destination is aligned
static inline void hvx_copy_fp16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_Vector * restrict  vdst = (HVX_Vector *) dst;
    HVX_UVector * restrict vsrc = (HVX_UVector *) src;

    assert((unsigned long) dst % 128 == 0);

    uint32_t nvec = n / 64;
    uint32_t nloe = n % 64;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(__fp16), v);
    }
}

// copy n fp32 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_fp32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_Vector * restrict vdst = (HVX_Vector *) dst;
    HVX_Vector * restrict vsrc = (HVX_Vector *) src;

    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);

    uint32_t nvec = n / 32;
    uint32_t nloe = n % 32;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
    }
}

// copy n fp32 elements : source is aligned, destination is unaligned
static inline void hvx_copy_fp32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_UVector * restrict vdst = (HVX_UVector *) dst;
    HVX_Vector * restrict  vsrc = (HVX_Vector *) src;

    assert((unsigned long) src % 128 == 0);

    uint32_t nvec = n / 32;
    uint32_t nloe = n % 32;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
    }
}

// copy n fp32 elements : source is unaligned, destination is aligned
static inline void hvx_copy_fp32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    HVX_Vector * restrict  vdst = (HVX_Vector *) dst;
    HVX_UVector * restrict vsrc = (HVX_UVector *) src;

    assert((unsigned long) dst % 128 == 0);

    uint32_t nvec = n / 32;
    uint32_t nloe = n % 32;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        HVX_Vector v = vsrc[i];
        vdst[i] = v;
    }

    if (nloe) {
        HVX_Vector v = vsrc[i];
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), v);
    }
}

// bcast 1 fp32 element from source to n fp32 elements in destination : destination is aligned
static inline void hvx_bcast_fp32_a(uint8_t * restrict dst, float elem, uint32_t n) {
    HVX_Vector * restrict vdst = (HVX_Vector *) dst;

    HVX_Vector velem = hvx_vec_splat_fp32(elem);

    assert((unsigned long) dst % 128 == 0);

    uint32_t nvec = n / 32;
    uint32_t nloe = n % 32;

    uint32_t i = 0;

#pragma unroll(4)
    for (; i < nvec; i++) {
        vdst[i] = velem;
    }

    if (nloe) {
        hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), velem);
    }
}

static __attribute__((always_inline)) int32_t is_in_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
    uint32_t right_off = left_off + n;
    return right_off <= chunk_size;
}

static void hvx_vec_dump_fp16_n(char * pref, HVX_Vector v, uint32_t n) {
    union {
        HVX_Vector v;
        __fp16     d[64];
    } u = { .v = v };

    const uint32_t n0 = n / 16;
    const uint32_t n1 = n % 16;
    int i = 0;
    for (; i < n0; i++) {
        htp_dump_fp16_line(pref, u.d + (16 * i), 16);
    }
    if (n1) {
        htp_dump_fp16_line(pref, u.d + (16 * i), n1);
    }
}

static void hvx_vec_dump_fp16(char * pref, HVX_Vector v) {
    hvx_vec_dump_fp16_n(pref, v, 64);
}

static void hvx_vec_dump_fp32_n(char * pref, HVX_Vector v, uint32_t n) {
    union {
        HVX_Vector v;
        float      d[32];
    } u = { .v = v };

    const uint32_t n0 = n / 16;
    const uint32_t n1 = n % 16;
    int i = 0;
    for (; i < n0; i++) {
        htp_dump_fp32_line(pref, u.d + (16 * i), 16);
    }
    if (n1) {
        htp_dump_fp32_line(pref, u.d + (16 * i), n1);
    }
}

static void hvx_vec_dump_fp32_hmt(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        float      d[32];
    } u = { .v = v };

    FARF(HIGH, "%s: %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f ... %.6f %.6f %.6f %.6f\n", pref, u.d[0], u.d[1],
         u.d[2], u.d[3], u.d[12], u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
}

static void hvx_vec_dump_fp32(char * pref, HVX_Vector v) {
    hvx_vec_dump_fp32_n(pref, v, 32);
}

static void hvx_vec_dump_int32(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        int32_t    d[32];
    } u = { .v = v };

    for (int i = 0; i < 32 / 16; i++) {
        htp_dump_int32_line(pref, u.d + (16 * i), 16);
    }
}

static void hvx_vec_dump_int32_hmt(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        int32_t    d[32];
    } u = { .v = v };

    FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[12],
         u.d[13], u.d[14], u.d[15], u.d[28], u.d[29], u.d[30], u.d[31]);
}

static void hvx_vec_dump_int8_hmt(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        int8_t     d[128];
    } u = { .v = v };

    FARF(HIGH, "%s: %d %d %d %d ... %d %d %d %d ... %d %d %d %d\n", pref, u.d[0], u.d[1], u.d[2], u.d[3], u.d[60],
         u.d[61], u.d[62], u.d[63], u.d[124], u.d[125], u.d[126], u.d[127]);
}

static void hvx_vec_dump_int8(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        int8_t     d[128];
    } u = { .v = v };

    for (int i = 0; i < 128 / 16; i++) {
        htp_dump_int8_line(pref, u.d + (16 * i), 16);
    }
}

static void hvx_vec_dump_uint8(char * pref, HVX_Vector v) {
    union {
        HVX_Vector v;
        uint8_t    d[128];
    } u = { .v = v };

    for (int i = 0; i < 128 / 16; i++) {
        htp_dump_uint8_line(pref, u.d + (16 * i), 16);
    }
}

static bool hvx_vec_eq(HVX_Vector v0, HVX_Vector v1, size_t n) {
    typedef union {
        HVX_Vector v;
        int8_t     d[128];
    } U;

    U u0 = { .v = v0 };
    U u1 = { .v = v1 };

    for (int i = 0; i < n; i++) {
        if (u0.d[i] != u1.d[i]) {
            return false;
        }
    }

    return true;
}

static inline float hvx_vec_get_fp32(HVX_Vector v) {
    float __attribute__((aligned(128))) x;
    hvx_vec_store_a(&x, 4, v);
    return x;
}

static inline HVX_Vector hvx_vec_int32_reduce_sum_n(HVX_Vector in, unsigned int n) {
    unsigned int total = n * 4;  // total vec nbytes
    unsigned int width = 4;      // int32

    HVX_Vector sum = in, sum_t;
    while (width < total) {
        sum_t = Q6_V_vror_VR(sum, width);     // rotate right
        sum   = Q6_Vw_vadd_VwVw(sum_t, sum);  // elementwise sum
        width = width << 1;
    }
    return sum;
}

static inline HVX_Vector hvx_vec_int32_reduce_sum(HVX_Vector in) {
    return hvx_vec_int32_reduce_sum_n(in, 32);
}

static inline HVX_Vector hvx_vec_qf32_reduce_sum_n(HVX_Vector in, unsigned int n) {
    unsigned int total = n * 4;  // total vec nbytes
    unsigned int width = 4;      // fp32 nbytes

    HVX_Vector sum = in, sum_t;
    while (width < total) {
        sum_t = Q6_V_vror_VR(Q6_Vsf_equals_Vqf32(sum), width);  // rotate right
        sum   = Q6_Vqf32_vadd_Vqf32Vsf(sum, sum_t);             // elementwise sum
        width = width << 1;
    }
    return sum;
}

static inline HVX_Vector hvx_vec_qf32_reduce_sum(HVX_Vector in) {
    return hvx_vec_qf32_reduce_sum_n(in, 32);
}

static inline HVX_Vector hvx_vec_fp32_reduce_sum_n(HVX_Vector in, unsigned int n) {
    unsigned int total = n * 4;  // total vec nbytes
    unsigned int width = 4;      // fp32 nbytes

    HVX_Vector sum = in, sum_t;
    while (width < total) {
        sum_t = Q6_V_vror_VR(sum, width);                               // rotate right
        sum   = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(sum, sum_t));  // elementwise sum
        width = width << 1;
    }
    return sum;
}

static inline HVX_Vector hvx_vec_fp32_reduce_sum(HVX_Vector in) {
    return hvx_vec_fp32_reduce_sum_n(in, 32);
}
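
// The rotate-and-add reductions above implement a log2(n) tree sum: after
// step k every lane holds the sum of 2^(k+1) neighboring lanes, so any lane
// (in particular lane 0) ends up with the total. A scalar sketch of the same
// idea (illustrative only; n must be a power of two, n <= 32):
static inline float hvx_reduce_sum_ref(const float * x, unsigned n) {
    float cur[32], nxt[32];
    for (unsigned i = 0; i < n; i++) {
        cur[i] = x[i];
    }
    for (unsigned w = 1; w < n; w <<= 1) {
        for (unsigned i = 0; i < n; i++) {
            nxt[i] = cur[i] + cur[(i + w) % n];  // "rotate right by w lanes" + add
        }
        for (unsigned i = 0; i < n; i++) {
            cur[i] = nxt[i];
        }
    }
    return cur[0];
}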

static inline HVX_Vector hvx_vec_reduce_max_fp16(HVX_Vector in) {
    unsigned total = 128;  // total vec nbytes
    unsigned width = 2;    // fp16 nbytes

    HVX_Vector _max = in, _max_t;
    while (width < total) {
        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
        _max   = Q6_Vhf_vmax_VhfVhf(_max_t, _max);  // elementwise max
        width  = width << 1;
    }

    return _max;
}

static inline HVX_Vector hvx_vec_reduce_max2_fp16(HVX_Vector in, HVX_Vector _max) {
    unsigned total = 128;  // total vec nbytes
    unsigned width = 2;    // fp16 nbytes

    HVX_Vector _max_t;

    _max = Q6_Vhf_vmax_VhfVhf(in, _max);
    while (width < total) {
        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
        _max   = Q6_Vhf_vmax_VhfVhf(_max_t, _max);  // elementwise max
        width  = width << 1;
    }

    return _max;
}

static inline HVX_Vector hvx_vec_reduce_max_fp32(HVX_Vector in) {
    unsigned total = 128;  // total vec nbytes
    unsigned width = 4;    // fp32 nbytes

    HVX_Vector _max = in, _max_t;
    while (width < total) {
        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
        _max   = Q6_Vsf_vmax_VsfVsf(_max_t, _max);  // elementwise max
        width  = width << 1;
    }

    return _max;
}

static inline HVX_Vector hvx_vec_reduce_max2_fp32(HVX_Vector in, HVX_Vector _max) {
    unsigned total = 128;  // total vec nbytes
    unsigned width = 4;    // fp32 nbytes

    HVX_Vector _max_t;

    _max = Q6_Vsf_vmax_VsfVsf(in, _max);
    while (width < total) {
        _max_t = Q6_V_vror_VR(_max, width);         // rotate right
        _max   = Q6_Vsf_vmax_VsfVsf(_max_t, _max);  // elementwise max
        width  = width << 1;
    }

    return _max;
}

static inline HVX_Vector hvx_vec_abs_fp16(HVX_Vector v) {
    // abs by clearing the fp16 sign bit
    HVX_Vector mask = Q6_Vh_vsplat_R(0x7fff);
    return Q6_V_vand_VV(v, mask);
}

static inline HVX_Vector hvx_vec_neg_fp16(HVX_Vector v) {
    // neg by setting the fp16 sign bit
    HVX_Vector mask = Q6_Vh_vsplat_R(0x8000);
    return Q6_V_vor_VV(v, mask);
}

static inline HVX_Vector hvx_vec_abs_fp32(HVX_Vector v) {
    // abs by clearing the fp32 sign bit
    HVX_Vector mask = Q6_V_vsplat_R(0x7fffffff);
    return Q6_V_vand_VV(v, mask);
}

static inline HVX_Vector hvx_vec_neg_fp32(HVX_Vector v) {
#if __HTP_ARCH__ > 75
    return Q6_Vsf_vfneg_Vsf(v);
#else
    // neg by setting the fp32 sign bit
    HVX_Vector mask = Q6_V_vsplat_R(0x80000000);
    return Q6_V_vor_VV(v, mask);
#endif  // __HTP_ARCH__ > 75
}

// ====================================================
// FUNCTION: 1/(x+1)  y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order:3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04  Rms Error: 2.8410e-05  Mean Error: 1.1370e-05
//    32769  -32706   31252  -10589
//    32590  -30635   22793   -4493
//    32066  -27505   16481   -2348
//    31205  -24054   11849   -1306

static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
    // input is 0..0xffff representing 0.0 .. 1.0
    HVX_Vector p;
    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
    return p;  // signed result, 14 fractional bits
}

// Find reciprocal of fp16.
// (1) first, convert to fp32, multiplying by 1.0; this is done to
//     handle denormals. Ignoring sign and zero, result should be at
//     least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
//     (exponent in range [103,143])
// (2) extract the mantissa into 16-bit unsigned; find reciprocal using a fitted poly
// (3) put this, along with '253-exp' (exp from (1)) together to make an qf32
// (4) convert that to fp16
// (5) put sign back in. Also, if the original value (w/o sign) was <0x81, replace
//     the result with the max value.
static inline HVX_Vector hvx_vec_inverse_fp16(HVX_Vector vals) {
    HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
    HVX_Vector avals   = Q6_V_vand_VV(vals, em_mask);
    HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
    // too small to take 1/x of? for 'standard' fp16, this would be 0x101
    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);

    HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00));  // *1.0
    HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
    HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));

    // bits 22..13 contain the mantissa now (w/o hidden bit); move to bit 14..5 of a 16-bit vector
    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
    // likewise extract the upper 16 from each, containing the exponents in range 103..142
    HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
    // Get exponent in IEEE 32-bit representation
    exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);

    // so, mant_u16 contains an unbiased mantissa in upper 10 bits of each u16 lane
    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0)
    // Use poly to transform to 1/x, with 14 fractional bits
    //
    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);

    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  // count leading zeros

    // Get mantissa for 16-bit representation
    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));

    // Compute reciprocal exponent
    HVX_Vector exp_recip =
        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
    // Convert it for 16-bit representation
    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);

    // Merge exponent and mantissa for reciprocal
    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
    // map 'small' inputs to standard largest value 0x7bff
    recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
    // add sign back
    recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
    return recip;
}
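
// Scalar sketch of the reciprocal scheme above (illustrative only, assuming
// frexpf/ldexpf from <math.h>): decompose |x| = m * 2^e with m in [0.5, 1),
// approximate 1/m (the fitted polynomial stands in for the division below),
// and rebuild the result with the negated exponent, since 1/x = (1/m) * 2^-e.
static inline float hvx_recip_ref(float x) {
    int   e;
    float m = frexpf(x, &e);  // x = m * 2^e
    float r = 1.0f / m;       // placeholder for the polynomial approximation
    return ldexpf(r, -e);
}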

#define IEEE_VSF_EXPLEN   (8)
#define IEEE_VSF_EXPBIAS  (127)
#define IEEE_VSF_EXPMASK  (0xFF)
#define IEEE_VSF_MANTLEN  (23)
#define IEEE_VSF_MANTMASK (0x7FFFFF)
#define IEEE_VSF_MIMPMASK (0x800000)

static inline HVX_Vector hvx_vec_truncate_fp32(HVX_Vector in_vec) {
    HVX_Vector mask_mant_v  = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
    HVX_Vector mask_impl_v  = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
    HVX_Vector const_zero_v = Q6_V_vzero();

    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);

    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
    expval_v &= IEEE_VSF_EXPMASK;
    expval_v -= IEEE_VSF_EXPBIAS;

    // negative exp == fractional value
    HVX_VectorPred q_negexp = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);

    HVX_Vector rshift_v = IEEE_VSF_MANTLEN - expval_v;  // fractional bits - exp shift

    HVX_Vector mant_v = in_vec & mask_mant_v;                  // obtain mantissa
    HVX_Vector vout   = Q6_Vw_vadd_VwVw(mant_v, mask_impl_v);  // add implicit 1.0

    vout = Q6_Vw_vasr_VwVw(vout, rshift_v);              // shift to obtain truncated integer
    vout = Q6_V_vmux_QVV(q_negexp, const_zero_v, vout);  // expval<0 -> 0

    HVX_Vector neg_vout = -vout;

    vout = Q6_V_vmux_QVV(q_negative, neg_vout, vout);  // handle negatives

    return (vout);
}
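
// Scalar reference for the bit-level truncation above (illustrative only,
// mirroring the same assumptions as the vector code, i.e. exponent <= 23):
static inline int32_t hvx_trunc_ref(float f) {
    union { float f; uint32_t u; } v = { .f = f };
    int32_t exp = (int32_t) ((v.u >> 23) & 0xFF) - 127;
    if (exp < 0) {
        return 0;  // |f| < 1.0 -> 0
    }
    uint32_t mant = (v.u & 0x7FFFFF) | 0x800000;     // add the implicit 1.0
    int32_t  r    = (int32_t) (mant >> (23 - exp));  // shift out the fraction
    return (v.u & 0x80000000u) ? -r : r;             // restore the sign
}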

static inline HVX_Vector hvx_vec_floor_fp32(HVX_Vector in_vec) {
    HVX_Vector mask_mant_v    = Q6_V_vsplat_R(IEEE_VSF_MANTMASK);
    HVX_Vector mask_impl_v    = Q6_V_vsplat_R(IEEE_VSF_MIMPMASK);
    HVX_Vector const_mnlen_v  = Q6_V_vsplat_R(IEEE_VSF_MANTLEN);
    HVX_Vector const_zero_v   = Q6_V_vzero();
    HVX_Vector const_negone_v = Q6_V_vsplat_R(0xbf800000);  // -1 IEEE vsf

    HVX_VectorPred q_negative = Q6_Q_vcmp_gt_VwVw(const_zero_v, in_vec);

    HVX_Vector expval_v = in_vec >> IEEE_VSF_MANTLEN;
    expval_v &= IEEE_VSF_EXPMASK;
    expval_v -= IEEE_VSF_EXPBIAS;

    HVX_VectorPred q_negexp     = Q6_Q_vcmp_gt_VwVw(const_zero_v, expval_v);
    HVX_VectorPred q_expltmn    = Q6_Q_vcmp_gt_VwVw(const_mnlen_v, expval_v);
    HVX_VectorPred q_negexp_pos = Q6_Q_vcmp_gtand_QVwVw(q_negexp, in_vec, const_zero_v);
    HVX_VectorPred q_negexp_neg = Q6_Q_vcmp_gtand_QVwVw(q_negexp, const_zero_v, in_vec);

    // if expval < 0 (q_negexp)          // <0, floor is 0
    //     if vin > 0
    //         floor = 0
    //     if vin < 0
    //         floor = -1
    // if expval < mant_len (q_expltmn)  // >0, but fraction may exist
    //     get sign (q_negative)
    //     mask >> expval                // fraction bits to mask off
    //     vout = ~(mask)                // apply mask to remove fraction
    //     if (qneg)                     // negative floor is one less (more, sign bit for neg)
    //         vout += ((impl_mask) >> expval)
    //     if (mask && vin)
    //         vout = vin
    //     else                          // already an integer
    //         ;                         // no change

    // compute floor
    mask_mant_v >>= expval_v;
    HVX_Vector neg_addin_v    = mask_impl_v >> expval_v;
    HVX_Vector vout_neg_addin = Q6_Vw_vadd_VwVw(in_vec, neg_addin_v);
    HVX_Vector vout           = Q6_V_vmux_QVV(q_negative, vout_neg_addin, in_vec);

    HVX_Vector mask_chk_v = Q6_V_vand_VV(in_vec, mask_mant_v);  // chk if bits set
    HVX_VectorPred q_integral = Q6_Q_vcmp_eq_VwVw(const_zero_v, mask_chk_v);

    HVX_Vector not_mask_v = Q6_V_vnot_V(mask_mant_v);       // frac bits to clear
    HVX_Vector vfrfloor_v = Q6_V_vand_VV(vout, not_mask_v); // clear frac bits

    vout = in_vec;
    vout = Q6_V_vmux_QVV(q_expltmn, vfrfloor_v, vout);         // expval<mant
    vout = Q6_V_vmux_QVV(q_integral, in_vec, vout);            // integral values
    vout = Q6_V_vmux_QVV(q_negexp_pos, const_zero_v, vout);    // expval<0 x>0 -> 0
    vout = Q6_V_vmux_QVV(q_negexp_neg, const_negone_v, vout);  // expval<0 x<0 -> -1

    return vout;
}
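
// Scalar reference for the bit-level floor above (illustrative only):
// clear the fractional mantissa bits; for negative non-integral inputs,
// first step one integer down by adding the implicit-one bit shifted to
// the fraction boundary (the carry propagates into the exponent as needed).
static inline float hvx_floor_ref(float f) {
    union { float f; uint32_t u; } v = { .f = f };
    int32_t exp = (int32_t) ((v.u >> 23) & 0xFF) - 127;
    if (exp < 0) {
        return (f > 0.0f) ? 0.0f : ((f < 0.0f) ? -1.0f : f);
    }
    if (exp >= 23) {
        return f;  // no fractional bits left
    }
    uint32_t mask = 0x7FFFFFu >> exp;  // fractional bits for this exponent
    if ((v.u & mask) == 0) {
        return f;  // already an integer
    }
    if (f < 0.0f) {
        v.u += 0x800000u >> exp;  // negative floor is one integer lower
    }
    v.u &= ~mask;
    return v.f;
}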

static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) {
    // This looks complicated.
    // Ideally should just be Q6_Vh_equals_Vhf(vin)
    // but that instruction does not do proper rounding.

    // convert to qf32, multiplying by 1.0 in the process.
    HVX_VectorPair v32 = Q6_Wqf32_vmpy_VhfVhf(vin, Q6_Vh_vsplat_R(0x3C00));

    // 'in-range' values are +/-32752.
    // add 192K to it, convert to sf
    HVX_Vector v192K = Q6_V_vsplat_R(0x48400000);
    HVX_Vector vsf_0 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_lo_W(v32), v192K));
    HVX_Vector vsf_1 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_hi_W(v32), v192K));

    // for in-range cases, result is {163856 ... 229360} so the exponent is always 144.
    // if we extract bits 21..0 as a signed quantity, and round 6 bits off, that will be the answer.
    // Start by <<10 to get the final 'sign' bit in bit 15...
    vsf_0 = Q6_Vw_vasl_VwR(vsf_0, 10);
    vsf_1 = Q6_Vw_vasl_VwR(vsf_1, 10);

    // now round down to 16
    return Q6_Vh_vround_VwVw_sat(vsf_1, vsf_0);
}
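
// Scalar sketch of the +192K rounding trick above (illustrative only;
// saturation is omitted and |x| <= 32752 is assumed): adding 1.5 * 2^17
// pins the float's exponent to 17, so after the add the value carries
// exactly 6 fractional mantissa bits; extracting bits 21..0 as a signed
// quantity and rounding those 6 bits off yields the rounded integer.
static inline int16_t hvx_round_i16_ref(float x) {
    union { float f; uint32_t u; } v = { .f = x + 196608.0f };  // + 192K
    int32_t q = (int32_t) (v.u << 10) >> 10;  // sign-extend bits 21..0
    return (int16_t) ((q + 32) >> 6);         // round off 6 fractional bits
}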

static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
    HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
    HVX_Vector two_sf       = hvx_vec_splat_fp32(2.0);

    // First approximation
    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);

    HVX_Vector r_qf;

    // Refine
    r_qf = Q6_Vqf32_vmpy_VsfVsf(
        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));

    return Q6_Vsf_equals_Vqf32(r_qf);
}
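
// Scalar sketch of the Newton-Raphson reciprocal above (illustrative only):
// the magic constant 0x7EEEEBB3 turns an integer subtraction into a rough
// initial estimate of 1/x, then each step r' = r * (2 - x*r) roughly doubles
// the number of correct bits; three steps are used, as in the vector code.
static inline float hvx_inverse_ref(float x) {
    union { float f; uint32_t u; } v = { .f = x };
    v.u     = 0x7EEEEBB3u - v.u;  // initial approximation
    float r = v.f;
    r = r * (2.0f - x * r);       // refinement step 1
    r = r * (2.0f - x * r);       // refinement step 2
    r = r * (2.0f - x * r);       // refinement step 3
    return r;
}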

#define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
#define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
#define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267
#define FAST_SIGMOID_C3    (0x3f000000)  // 0.5

static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) {
    v = Q6_Vqf32_vmpy_VsfVsf(v, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));
    v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v), Q6_V_vsplat_R(FAST_SIGMOID_C3));

    HVX_Vector in_int = hvx_vec_truncate_fp32(Q6_Vsf_equals_Vqf32(v));
    HVX_Vector x      = Q6_Vqf32_vsub_Vqf32Vsf(v, Q6_Vsf_equals_Vw(in_int));
    HVX_Vector xx     = Q6_Vqf32_vmpy_Vqf32Vqf32(x, x);

    HVX_Vector v1 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(xx), Q6_V_vsplat_R(FAST_SIGMOID_C2));
    v1            = Q6_Vqf32_vadd_Vqf32Vsf(v1, Q6_V_vsplat_R(FAST_SIGMOID_LOG2F));

    HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(x), Q6_V_vsplat_R(FAST_SIGMOID_C1));
    v2            = Q6_Vqf32_vmpy_Vqf32Vqf32(v2, xx);
    v2            = Q6_Vqf32_vadd_Vqf32Vqf32(v2, x);

    HVX_Vector v3          = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vqf32(v2, v1));
    HVX_Vector v3_exponent = Q6_Vw_vasl_VwR(v3, 1);
    v3_exponent            = Q6_Vuw_vlsr_VuwR(v3_exponent, 24);
    v3_exponent            = Q6_Vw_vadd_VwVw(in_int, v3_exponent);
    v3                     = Q6_Vw_vaslacc_VwVwR(v3, in_int, 24);

    HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
    HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));

    HVX_Vector res = hvx_vec_inverse_fp32(v5);
    res            = Q6_Vqf32_vmpy_VsfVsf(v3, res);

    return Q6_Vsf_equals_Vqf32(res);
}
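
// Scalar sketch of the fast sigmoid above (illustrative only, assuming
// exp2f from <math.h>): it relies on the identity
// sigmoid(x) = 2^t / (2^t + 2^-t) with t = x * log2(e) / 2, so a single
// base-2 exponential and one reciprocal are enough.
static inline float hvx_sigmoid_ref(float x) {
    float t = x * 1.4426950408f * 0.5f;  // x * log2(e) / 2
    float p = exp2f(t);
    return p / (p + 1.0f / p);
}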

#define EXP_COEFF_5 (0x39506967)  // 0.000198757 = 1/(7!)
#define EXP_COEFF_4 (0x3AB743CE)  // 0.0013982 = 1/(6!)
#define EXP_COEFF_3 (0x3C088908)  // 0.00833345 = 1/(5!)
#define EXP_COEFF_2 (0x3D2AA9C1)  // 0.0416658 = 1/(4!)
#define EXP_COEFF_1 (0x3E2AAAAA)  // 0.16666667 = 1/(3!)
#define EXP_COEFF_0 (0x3F000000)  // 0.5 = 1/(2!)
#define EXP_LOGN2   (0x3F317218)  // ln(2) = 0.6931471805
#define EXP_LOG2E   (0x3FB8AA3B)  // log2(e) = 1/ln(2) = 1.4426950408
#define EXP_ONE     (0x3f800000)  // 1.0
#define EXP_RANGE_R (0x41a00000)  // 20.0
#define EXP_RANGE_L (0xc1a00000)  // -20.0

static inline HVX_Vector hvx_vec_exp_fp32(HVX_Vector in_vec) {
    HVX_Vector z_qf32_v;
    HVX_Vector x_v;
    HVX_Vector x_qf32_v;
    HVX_Vector y_v;
    HVX_Vector k_v;
    HVX_Vector f_v;
    HVX_Vector epsilon_v;
    HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E);
    HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2);
    HVX_Vector E_const;
    HVX_Vector zero_v = Q6_V_vzero();

    // exp(x) is approximated as follows:
    // f = floor(x/ln(2)) = floor(x*log2(e))
    // epsilon = x - f*ln(2)
    // exp(x) = exp(epsilon+f*ln(2))
    //        = exp(epsilon)*exp(f*ln(2))
    //        = exp(epsilon)*2^f
    //
    // Since epsilon is close to zero, it can be approximated with its Taylor series:
    // exp(x) ~= 1+x+x^2/2!+x^3/3!+...+x^n/n!+...
    // Preserving the first eight terms, we get:
    // exp(x) ~= 1+x+e0*x^2+e1*x^3+e2*x^4+e3*x^5+e4*x^6+e5*x^7
    //         = 1+x+(E0+(E1+(E2+(E3+(E4+E5*x)*x)*x)*x)*x)*x^2

    HVX_Vector temp_v = in_vec;

    // Clamp inputs to (-20.0, 20.0)
    HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
    HVX_VectorPred pred_cap_left  = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);

    // chain the two selects so the right-clamp is not discarded by the second vmux
    in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
    in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), in_vec);

    epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
    epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);

    // f_v is the floating point result and k_v is the integer result
    f_v = hvx_vec_floor_fp32(epsilon_v);
    k_v = hvx_vec_truncate_fp32(f_v);

    x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v);

    // x = x - f_v * logn2;
    epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2);
    x_qf32_v  = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v);
    // normalize before every QFloat's vmpy
    x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);

    // z = x * x;
    z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
    z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);

    x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);

    // y = E4 + E5 * x;
    E_const = Q6_V_vsplat_R(EXP_COEFF_5);
    y_v     = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
    E_const = Q6_V_vsplat_R(EXP_COEFF_4);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = E3 + y * x;
    E_const = Q6_V_vsplat_R(EXP_COEFF_3);
    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = E2 + y * x;
    E_const = Q6_V_vsplat_R(EXP_COEFF_2);
    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = E1 + y * x;
    E_const = Q6_V_vsplat_R(EXP_COEFF_1);
    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = E0 + y * x;
    E_const = Q6_V_vsplat_R(EXP_COEFF_0);
    y_v     = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
    y_v     = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = x + y * z;
    y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v);
    y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v);
    y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);

    // y = y + 1.0;
    y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE));

    // insert exponents
    // y = ldexpf(y, k);
    // y_v += k_v;  // qf32
    // modify exponent

    y_v = Q6_Vsf_equals_Vqf32(y_v);

    // add k_v to the exponent of y_v
    HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1);

    y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1);
    y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent);

    // exponent cannot be negative; if overflow is detected, result is set to zero
    HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent);

    y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN);

    y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v);

    return y_v;
}
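
// Scalar reference for the exp() scheme above (illustrative only, assuming
// floorf/ldexpf from <math.h>; inputs are expected to be pre-clamped to the
// same (-20.0, 20.0) range):
static inline float hvx_exp_ref(float x) {
    float f   = floorf(x * 1.4426950408f);  // floor(x * log2(e))
    float eps = x - f * 0.6931471805f;      // x - f*ln(2), in [0, ln(2))
    // Taylor series around 0 with the same coefficients as the vector code
    float y = 1.0f + eps +
              eps * eps * (0.5f + eps * (1.0f / 6 + eps * (1.0f / 24 + eps * (1.0f / 120 +
                           eps * (1.0f / 720 + eps * (1.0f / 5040))))));
    return ldexpf(y, (int) f);              // y * 2^f
}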

#define RSQRT_CONST        0x5f3759df  // Constant for fast inverse square root calculation
#define RSQRT_ONE_HALF     0x3f000000  // 0.5
#define RSQRT_THREE_HALVES 0x3fc00000  // 1.5

static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
    // Algorithm:
    //   x2 = input*0.5
    //   y  = * (long *) &input
    //   y  = 0x5f3759df - (y>>1)
    //   y  = y*(threehalfs - x2*y*y)

    HVX_Vector rsqrtconst = Q6_V_vsplat_R(RSQRT_CONST);
    HVX_Vector onehalf    = Q6_V_vsplat_R(RSQRT_ONE_HALF);
    HVX_Vector threehalfs = Q6_V_vsplat_R(RSQRT_THREE_HALVES);

    HVX_Vector x2, y, ypower2, temp;

    x2 = Q6_Vqf32_vmpy_VsfVsf(in_vec, onehalf);
    x2 = Q6_Vqf32_vadd_Vqf32Vsf(x2, Q6_V_vzero());

    y = Q6_Vw_vasr_VwR(in_vec, 1);
    y = Q6_Vw_vsub_VwVw(rsqrtconst, y);

    // 1st iteration
    ypower2 = Q6_Vqf32_vmpy_VsfVsf(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp    = Q6_Vqf32_vmpy_VsfVsf(y, Q6_Vsf_equals_Vqf32(temp));

    // 2nd iteration
    y       = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);

    // 3rd iteration
    y       = Q6_Vqf32_vadd_Vqf32Vsf(temp, Q6_V_vzero());
    ypower2 = Q6_Vqf32_vmpy_Vqf32Vqf32(y, y);
    ypower2 = Q6_Vqf32_vadd_Vqf32Vsf(ypower2, Q6_V_vzero());
    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(x2, ypower2);
    temp    = Q6_Vqf32_vsub_VsfVsf(threehalfs, Q6_Vsf_equals_Vqf32(temp));
    temp    = Q6_Vqf32_vmpy_Vqf32Vqf32(y, temp);

    return Q6_Vsf_equals_Vqf32(temp);
}
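
// Scalar reference for the fast inverse square root above (illustrative
// only): the classic bit-hack initial guess followed by the same three
// Newton-Raphson steps y' = y * (1.5 - 0.5*x*y*y) as the vector code.
static inline float hvx_rsqrt_ref(float x) {
    union { float f; uint32_t u; } v = { .f = x };
    v.u      = 0x5f3759df - (v.u >> 1);  // initial guess
    float y  = v.f;
    float x2 = 0.5f * x;
    y = y * (1.5f - x2 * y * y);  // iteration 1
    y = y * (1.5f - x2 * y * y);  // iteration 2
    y = y * (1.5f - x2 * y * y);  // iteration 3
    return y;
}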

static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
    int step_of_1 = num_elems >> 5;
    int remaining = num_elems - step_of_1 * VLEN_FP32;

    assert(remaining == 0);

    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
    HVX_Vector * restrict       v_dst = (HVX_Vector *) dst;

#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        v_dst[i] = hvx_vec_fast_sigmoid_fp32(v_src[i]);
    }
}

float hvx_sum_of_squares_f32(const uint8_t * restrict src, const int num_elems);
void hvx_mul_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems);
void hvx_mul_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems);
void hvx_mul_mul_f32_opt(const uint8_t * restrict src0,
                         const uint8_t * restrict src1,
                         const uint8_t * restrict src2,
                         uint8_t * restrict dst,
                         const int num_elems);
void hvx_mul_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_add_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems);
void hvx_add_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems);
void hvx_add_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_sub_f32(const uint8_t * restrict src0,
                 const uint8_t * restrict src1,
                 uint8_t * restrict dst,
                 const int num_elems);
void hvx_sub_f32_opt(const uint8_t * restrict src0,
                     const uint8_t * restrict src1,
                     uint8_t * restrict dst,
                     const int num_elems);
void hvx_sub_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_scale_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, const float scale);
void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
void hvx_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems);
void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems, bool negate);
float hvx_self_max_f32(const uint8_t * restrict src, const int num_elems);
float hvx_self_sum_f32(const uint8_t * restrict src, const int num_elems);
void hvx_min_scalar_f32(const uint8_t * restrict src, const float val, uint8_t * restrict dst, const int num_elems);
void hvx_clamp_scalar_f32(const uint8_t * restrict src,
                          const float limit_left,
                          const float limit_right,
                          uint8_t * restrict dst,
                          const int num_elems);

#endif /* HVX_UTILS_H */
@@ -0,0 +1,945 @@
#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments"
#pragma clang diagnostic ignored "-Wunused-function"

#define FARF_ERROR  1
#define FARF_HIGH   1
#define FARF_MEDIUM 0
#define FARF_LOW    0
#include <AEEStdErr.h>
#include <dspqueue.h>
#include <HAP_compute_res.h>
#include <HAP_etm_config.h>
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_power.h>
#include <HAP_ps.h>
#include <qurt.h>
#include <qurt_thread.h>
#include <remote.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "ops-utils.h"
#include "worker-pool.h"

AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
    struct htp_context * ctx;
    int err = 0;

    ctx = calloc(1, sizeof(*ctx));
    if (ctx == NULL) {
        return AEE_ENOMEMORY;
    }

    // Use the context structure as a handle
    *handle = (remote_handle64) ctx;

    // Enable FARF logs
    HAP_setFARFRuntimeLoggingParams(0xffff, NULL, 0);

    // Set client class
    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(HAP_power_request_t));
        request.type    = HAP_power_set_apptype;
        request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS;

        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            return err;
        }
    }

    {
        HAP_power_request_t request;
        memset(&request, 0, sizeof(request));

        request.type                              = HAP_power_set_DCVS_v3;
        request.dcvs_v3.set_dcvs_enable           = TRUE;
        request.dcvs_v3.dcvs_enable               = TRUE;
        request.dcvs_v3.dcvs_option               = HAP_DCVS_V2_PERFORMANCE_MODE;
        request.dcvs_v3.set_bus_params            = TRUE;
        request.dcvs_v3.bus_params.min_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.max_corner     = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.bus_params.target_corner  = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_core_params           = TRUE;
        request.dcvs_v3.core_params.min_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.max_corner    = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.core_params.target_corner = HAP_DCVS_VCORNER_MAX;
        request.dcvs_v3.set_sleep_disable         = TRUE;
        request.dcvs_v3.sleep_disable             = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            return err;
        }

        memset(&request, 0, sizeof(request));
        request.type         = HAP_power_set_HVX;
        request.hvx.power_up = TRUE;
        if ((err = HAP_power_set((void *) ctx, &request)) != 0) {
            return err;
        }
    }

    {
        // Power on HMX
        HAP_power_request_t request;
        memset(&request, 0, sizeof(HAP_power_request_t));
        request.type         = HAP_power_set_HMX;
        request.hmx.power_up = TRUE;
        FARF(ALWAYS, "Powering HMX on\n");
        err = HAP_power_set((void *) ctx, &request);  // pass the handle itself, as above
        if (err != AEE_SUCCESS) {
            FARF(ERROR, "Error powering on HMX.");
            return err;
        }
    }

    return AEE_SUCCESS;
}

AEEResult htp_iface_close(remote_handle64 handle) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (ctx->queue) {
        FARF(ERROR, "Closing handle with queue still open");
        return AEE_EITEMBUSY;
    }

    free(ctx);
    return AEE_SUCCESS;
}

AEEResult htp_iface_enable_etm(remote_handle64 handle) {
    int err = HAP_user_etm_enable();
    if (err) {
        if (err == AEE_EVERSIONNOTSUPPORT) {
            FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
        } else {
            FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
        }
    }
    return err;
}

AEEResult htp_iface_disable_etm(remote_handle64 handle) {
    int err = HAP_user_etm_disable();
    if (err) {
        if (err == AEE_EVERSIONNOTSUPPORT) {
            FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
        } else {
            FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
        }
    }
    return err;
}

static int vtcm_acquire(struct htp_context * ctx) {
    if (!ctx->vtcm_valid) {
        // Temporarily bump thread priority to make sure it's higher than other sessions.
        // This way the resource manager will notify the other thread to release VTCM.
        // Note that we need to reacquire VTCM at normal priority for this to work next time.
        qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio - 10);
        HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
        HAP_compute_res_release_cached(ctx->vtcm_rctx);
        qurt_thread_set_priority(qurt_thread_get_id(), ctx->thread_prio);

        HAP_compute_res_acquire_cached(ctx->vtcm_rctx, 1000000);
        ctx->vtcm_valid = true;
    }

    ctx->vtcm_inuse = true;
    return 0;
}

static int vtcm_release(struct htp_context * ctx) {
    ctx->vtcm_inuse = false;

    if (ctx->vtcm_valid && ctx->vtcm_needs_release) {
        ctx->vtcm_valid         = false;
        ctx->vtcm_needs_release = false;
        HAP_compute_res_release_cached(ctx->vtcm_rctx);
    }

    return 0;
}

static int vtcm_release_callback(unsigned int rctx, void * state) {
    struct htp_context * ctx = (struct htp_context *) state;

    if (!ctx || ctx->vtcm_rctx != rctx) {
        return AEE_EBADPARM;
    }

    // If VTCM is not in use (not processing Ops) release it right here,
    // otherwise we'll release it once we're done with the current Op.

    if (ctx->vtcm_inuse) {
        ctx->vtcm_needs_release = true;  // defer the release to vtcm_release()
        return 0;
    }

    ctx->vtcm_valid = false;
    HAP_compute_res_release_cached(ctx->vtcm_rctx);

    return 0;
}

static int vtcm_alloc(struct htp_context * ctx) {
    unsigned int vtcm_size = 8 * 1024 * 1024;  // 8MB default
    HAP_compute_res_query_VTCM(0, &vtcm_size, NULL, NULL, NULL);

    compute_res_attr_t attr;
    HAP_compute_res_attr_init(&attr);
    HAP_compute_res_attr_set_serialize(&attr, 0);
    HAP_compute_res_attr_set_cache_mode(&attr, 1);
    HAP_compute_res_attr_set_vtcm_param_v2(&attr, vtcm_size, vtcm_size, vtcm_size);
    HAP_compute_res_attr_set_release_callback(&attr, vtcm_release_callback, (void *) ctx);
    HAP_compute_res_attr_set_hmx_param(&attr, 1);

    // Allocate VTCM for scratch pads
    uint32_t rctx = HAP_compute_res_acquire(&attr, 1000000 /* timeout */);
    if (!rctx) {
        FARF(ERROR, "failed to allocate %u bytes VTCM\n", vtcm_size);
        return AEE_ENOMEMORY;
    }

    void * vtcm_ptr;
    if (HAP_compute_res_attr_get_vtcm_ptr_v2(&attr, &vtcm_ptr, &vtcm_size) != 0) {
        HAP_compute_res_release(rctx);
        FARF(ERROR, "failed to allocate %u bytes VTCM (new)\n", vtcm_size);
        return AEE_ENOMEMORY;
    }

    ctx->vtcm_base          = (uint8_t *) vtcm_ptr;
    ctx->vtcm_size          = vtcm_size;
    ctx->vtcm_rctx          = rctx;
    ctx->vtcm_valid         = false;
    ctx->vtcm_inuse         = false;
    ctx->vtcm_needs_release = false;

    return 0;
}

static void vtcm_free(struct htp_context * ctx) {
    if (ctx->vtcm_rctx) {
        HAP_compute_res_release(ctx->vtcm_rctx);
        ctx->vtcm_base = 0;
        ctx->vtcm_rctx = 0;
    }
}

static void htp_packet_callback(dspqueue_t queue, int error, void * context);
static void htp_error_callback(dspqueue_t queue, int error, void * context);

AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx) {
    struct htp_context * ctx = (struct htp_context *) handle;

    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (ctx->queue) {
        FARF(ERROR, "Queue already open");
        return AEE_EITEMBUSY;
    }

    // Import queue created on the CPU
    int err = dspqueue_import(dsp_queue_id,         // Queue ID from dspqueue_export
                              htp_packet_callback,  // Packet callback
                              htp_error_callback,   // Error callback; no errors expected on the DSP
                              (void *) ctx,         // Callback context
                              &ctx->queue);

    if (err) {
        FARF(ERROR, "Queue import failed with 0x%08x", (unsigned) err);
        return err;
    }

    ctx->thread_id   = qurt_thread_get_id();
    ctx->thread_prio = qurt_thread_get_priority(ctx->thread_id);

    // allocate VTCM
    err = vtcm_alloc(ctx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to allocate VTCM");
        return AEE_ENOMEMORY;
    }

    qurt_sysenv_max_hthreads_t hw_threads;
    qurt_sysenv_get_max_hw_threads(&hw_threads);
    uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF;

    if (n_hvx == 0) {
        n_hvx = hw_nhvx;
    }
    if (n_hvx > hw_threads.max_hthreads) {
        n_hvx = hw_threads.max_hthreads;
    }
    if (n_hvx > HTP_MAX_NTHREADS) {
        n_hvx = HTP_MAX_NTHREADS;
    }

    ctx->n_threads = n_hvx;
    for (int i = 0; i < ctx->n_threads; i++) {
        ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
    }

    // init worker pool
    err = worker_pool_init(&ctx->worker_pool, n_hvx);
    if (err != AEE_SUCCESS) {
        FARF(ERROR, "Unable to create worker pool");
        return err;
    }

    FARF(HIGH, "session %u started: n-hvx %u vtcm-size %zu vtcm-rctx %u n-threads %u thread-id %d thread-prio %d \n",
         sess_id, hw_nhvx, ctx->vtcm_size, ctx->vtcm_rctx, ctx->n_threads, ctx->thread_id, ctx->thread_prio);

    return AEE_SUCCESS;
}

AEEResult htp_iface_stop(remote_handle64 handle) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (!ctx->queue) {
        FARF(ERROR, "Queue not open");
        return AEE_EBADSTATE;
    }

    // Close queue. dspqueue_close() will also wait for callbacks to finish.
    int err    = dspqueue_close(ctx->queue);
    ctx->queue = NULL;
    if (err != 0) {
        FARF(ERROR, "Queue close failed with 0x%08x", (unsigned) err);
        return err;
    }

    if (ctx->worker_pool) {
        // Release worker pool
        worker_pool_release(&ctx->worker_pool);
    }

    for (int i = 0; i < ctx->n_threads; i++) {
        dma_queue_delete(ctx->dma[i]);
    }

    vtcm_free(ctx);

    return AEE_SUCCESS;
}

static void htp_error_callback(dspqueue_t queue, int error, void * context) {
    // No errors expected on the DSP.
    FARF(ERROR, "Error callback: 0x%08x", (unsigned) error);
}

struct profile_data {
    uint64_t usecs;
    uint64_t cycles;
    uint64_t pkts;
};

static inline void profile_start(struct profile_data * d) {
    d->usecs  = HAP_perf_get_qtimer_count();
    d->cycles = htp_get_cycles();
    d->pkts   = htp_get_pktcnt();
}

static inline void profile_stop(struct profile_data * d) {
    d->usecs  = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
    d->cycles = htp_get_cycles() - d->cycles;
    d->pkts   = htp_get_pktcnt() - d->pkts;
}

static int send_htp_rsp(struct htp_context *     c,
                        uint32_t                 op,
                        uint32_t                 status,
                        struct dspqueue_buffer * bufs,
                        size_t                   n_bufs,
                        struct profile_data *    prof) {
    // Prep response struct
    struct htp_general_rsp rsp;
    rsp.op          = op;
    rsp.status      = status;
    rsp.prof_usecs  = prof->usecs;
    rsp.prof_cycles = prof->cycles;
    rsp.prof_pkts   = prof->pkts;

    int err = dspqueue_write(c->queue,
                             0,  // Flags
                             n_bufs,
                             bufs,  // Buffer references
                             sizeof(rsp),
                             (const uint8_t *) &rsp,  // Message
                             DSPQUEUE_TIMEOUT_NONE);

    if (err != 0) {
        FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
    }

    return err;
}
static void proc_matmul_req(struct htp_context * ctx,
                            struct htp_general_req * req,
                            struct dspqueue_buffer * bufs,
                            size_t n_bufs) {
    // Prep response buffer structs (needed for error responses, etc)
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));
    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx   = ctx;
    octx.src0  = req->src0;
    octx.src1  = req->src1;
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.src1.data = (uint32_t) bufs[1].ptr;
    octx.dst.data  = (uint32_t) bufs[2].ptr;
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_matmul(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}

static void proc_matmul_id_req(struct htp_context * ctx,
                               struct htp_general_req * req,
                               struct dspqueue_buffer * bufs,
                               size_t n_bufs) {
    // Prep response buffer structs (needed for error responses, etc)
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));
    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[3].fd     = bufs[3].fd;
    rsp_bufs[3].ptr    = bufs[3].ptr;
    rsp_bufs[3].size   = bufs[3].size;
    rsp_bufs[3].offset = bufs[3].offset;
    rsp_bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx   = ctx;
    octx.src0  = req->src0;
    octx.src1  = req->src1;
    octx.src2  = req->src2;
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.src1.data = (uint32_t) bufs[1].ptr;
    octx.src2.data = (uint32_t) bufs[2].ptr;
    octx.dst.data  = (uint32_t) bufs[3].ptr;
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_matmul_id(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}

static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx   = ctx;
    octx.src0  = req->src0;
    octx.src1  = req->src1;
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.src1.data = (uint32_t) bufs[1].ptr;
    octx.dst.data  = (uint32_t) bufs[2].ptr;
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_binary(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}

static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[3].fd     = bufs[3].fd;
    rsp_bufs[3].ptr    = bufs[3].ptr;
    rsp_bufs[3].offset = bufs[3].offset;
    rsp_bufs[3].size   = bufs[3].size;
    rsp_bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx   = ctx;
    octx.src0  = req->src0;
    octx.src1  = req->src1;
    octx.src2  = req->src2;
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.src1.data = (uint32_t) bufs[1].ptr;
    octx.src2.data = (uint32_t) bufs[2].ptr;
    octx.dst.data  = (uint32_t) bufs[3].ptr;
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_binary(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}

static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx   = ctx;
    octx.src0  = req->src0;
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.dst.data  = (uint32_t) bufs[1].ptr;
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_unary(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 2, &prof);
}

static void proc_activations_req(struct htp_context * ctx,
                                 struct htp_general_req * req,
                                 struct dspqueue_buffer * bufs,
                                 uint32_t n_bufs) {
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    int write_idx = 1;
    if (3 == n_bufs) {
        rsp_bufs[1].fd     = bufs[1].fd;
        rsp_bufs[1].ptr    = bufs[1].ptr;
        rsp_bufs[1].offset = bufs[1].offset;
        rsp_bufs[1].size   = bufs[1].size;
        rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

        write_idx = 2;
    }

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[write_idx].fd     = bufs[write_idx].fd;
    rsp_bufs[write_idx].ptr    = bufs[write_idx].ptr;
    rsp_bufs[write_idx].offset = bufs[write_idx].offset;
    rsp_bufs[write_idx].size   = bufs[write_idx].size;
    rsp_bufs[write_idx].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                                  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                                  DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx  = ctx;
    octx.src0 = req->src0;
    if (3 == n_bufs) {
        octx.src1 = req->src1;
    }
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    if (3 == n_bufs) {
        octx.src1.data = (uint32_t) bufs[1].ptr;
        octx.dst.data  = (uint32_t) bufs[2].ptr;
    } else {
        octx.dst.data = (uint32_t) bufs[1].ptr;
    }
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        if (octx.op == HTP_OP_SOFTMAX) {
            rsp_status = op_softmax(&octx);
        } else {
            rsp_status = op_activations(&octx);
        }
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
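For reference, the buffer layouts this handler accepts:

    // 2 bufs: [0] = src0 (input),              [1] = dst (output, flushed)
    // 3 bufs: [0] = src0, [1] = src1 (inputs), [2] = dst (output, flushed)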
static void proc_rope_req(struct htp_context * ctx,
                          struct htp_general_req * req,
                          struct dspqueue_buffer * bufs,
                          uint32_t n_bufs) {
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

    int write_idx = 2;
    if (4 == n_bufs) {
        rsp_bufs[write_idx].fd     = bufs[write_idx].fd;
        rsp_bufs[write_idx].ptr    = bufs[write_idx].ptr;
        rsp_bufs[write_idx].offset = bufs[write_idx].offset;
        rsp_bufs[write_idx].size   = bufs[write_idx].size;
        rsp_bufs[write_idx].flags  = DSPQUEUE_BUFFER_FLAG_DEREF;  // Release reference

        write_idx++;
    }

    // We wrote to the output buffer, so it also needs to be flushed
    rsp_bufs[write_idx].fd     = bufs[write_idx].fd;
    rsp_bufs[write_idx].ptr    = bufs[write_idx].ptr;
    rsp_bufs[write_idx].offset = bufs[write_idx].offset;
    rsp_bufs[write_idx].size   = bufs[write_idx].size;
    rsp_bufs[write_idx].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                 // Release reference
                                  DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush NSP
                                  DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate CPU

    // Setup Op context
    struct htp_ops_context octx = { 0 };
    octx.ctx  = ctx;
    octx.src0 = req->src0;
    octx.src1 = req->src1;
    if (4 == n_bufs) {
        octx.src2 = req->src2;
    }
    octx.dst   = req->dst;
    octx.flags = req->flags;
    octx.op    = req->op;

    memcpy(octx.op_params, req->op_params, sizeof(octx.op_params));

    // Update data pointers
    octx.src0.data = (uint32_t) bufs[0].ptr;
    octx.src1.data = (uint32_t) bufs[1].ptr;
    if (4 == n_bufs) {
        octx.src2.data = (uint32_t) bufs[2].ptr;
        octx.dst.data  = (uint32_t) bufs[3].ptr;
    } else {
        octx.dst.data = (uint32_t) bufs[2].ptr;
    }
    octx.n_threads = ctx->n_threads;

    struct profile_data prof;
    profile_start(&prof);

    uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
    if (vtcm_acquire(ctx) == AEE_SUCCESS) {
        rsp_status = op_rope(&octx);
        vtcm_release(ctx);
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}
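The corresponding layouts (src1 carries the positions, src2 the optional frequency factors):

    // 3 bufs: [0] = src0, [1] = pos (src1),                            [2] = dst (flushed)
    // 4 bufs: [0] = src0, [1] = pos (src1), [2] = freq_factors (src2), [3] = dst (flushed)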
static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
    struct htp_context * ctx = (struct htp_context *) context;

    // Repeatedly read packets from the queue until it's empty. We don't
    // necessarily get a separate callback for each packet, and new packets
    // may arrive while we're processing the previous one. This ensures we
    // keep the DSP busy as much as possible and avoid waiting for the CPU.

    while (1) {
        struct htp_general_req req;
        uint32_t req_size;

        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
        uint32_t n_bufs;
        uint32_t flags;

        // Read packet from queue
        int err = dspqueue_read_noblock(queue, &flags,
                                        HTP_MAX_PACKET_BUFFERS,  // Maximum number of buffer references
                                        &n_bufs,                 // Number of buffer references
                                        bufs,                    // Buffer references
                                        sizeof(req),             // Max message length
                                        &req_size,               // Message length
                                        (uint8_t *) &req);       // Message

        if (err == AEE_EWOULDBLOCK) {
            // Consumed all packets available for now
            return;
        }

        if (err != 0) {
            FARF(ERROR, "dspqueue_read_noblock failed: 0x%08x", (unsigned) err);
            return;
        }

        if (req_size != sizeof(req)) {
            FARF(ERROR, "Invalid request size");
            continue;
        }

        if (req.flags & HTP_OPFLAGS_EARLY_WAKEUP) {
            // Host wants early notification
            dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
        }

        // Process packet based on its message type
        switch (req.op) {
            case HTP_OP_MUL_MAT:
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad matmul-req buffer list");
                    continue;
                }
                proc_matmul_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_MUL_MAT_ID:
                if (n_bufs != 4) {
                    FARF(ERROR, "Bad matmul-id-req buffer list");
                    continue;
                }
                proc_matmul_id_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_MUL:
            case HTP_OP_ADD:
            case HTP_OP_SUB:
                if (n_bufs != 3) {
                    FARF(ERROR, "Bad binary-req buffer list");
                    continue;
                }
                proc_binary_req(ctx, &req, bufs);
                break;

            case HTP_OP_RMS_NORM:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad unary-req buffer list");
                    continue;
                }
                proc_unary_req(ctx, &req, bufs);
                break;

            case HTP_OP_UNARY_SILU:
                if (n_bufs != 2) {
                    FARF(ERROR, "Bad act-req buffer list");
                    continue;
                }
                proc_activations_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_GLU_SWIGLU:
            case HTP_OP_SOFTMAX:
                if ((n_bufs != 2) && (n_bufs != 3)) {
                    FARF(ERROR, "Bad act-req buffer list");
                    continue;
                }
                proc_activations_req(ctx, &req, bufs, n_bufs);
                break;

            case HTP_OP_ADD_ID:
                if (n_bufs != 4) {
                    FARF(ERROR, "Bad add-id-req buffer list");
                    continue;
                }
                proc_add_id_req(ctx, &req, bufs);
                break;

            case HTP_OP_ROPE:
                if ((n_bufs != 3) && (n_bufs != 4)) {
                    FARF(ERROR, "Bad rope-req buffer list");
                    continue;
                }
                proc_rope_req(ctx, &req, bufs, n_bufs);
                break;

            default:
                FARF(ERROR, "Unknown Op %u", req.op);
                break;
        }
    }
}
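The CPU side of this protocol lives in the suppressed host diff below; for orientation only, a sketch of a matching request write, assuming the same dspqueue_write signature used in send_htp_rsp above (the descriptor-filling steps are placeholders, not actual host code):

    struct htp_general_req req = { 0 };
    req.op = HTP_OP_MUL_MAT;
    // ... fill req.src0 / req.src1 / req.dst tensor descriptors ...

    struct dspqueue_buffer req_bufs[3] = { 0 };
    // ... fill fd/ptr/size/offset for src0, src1 and dst ...

    int err = dspqueue_write(queue, 0, 3, req_bufs,
                             sizeof(req), (const uint8_t *) &req,
                             DSPQUEUE_TIMEOUT_NONE);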
File diff suppressed because it is too large

@@ -0,0 +1,116 @@
#ifndef OPS_UTILS_H
#define OPS_UTILS_H

#include "htp-msg.h"

#ifndef MAX
# define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

#ifndef MIN
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

static inline uint64_t htp_get_cycles() {
    // Read the 64-bit UPCYCLE cycle-count register pair (c15:14)
    uint64_t cycles = 0;
    asm volatile(" %0 = c15:14\n" : "=r"(cycles));
    return cycles;
}

static inline uint64_t htp_get_pktcnt() {
    // Read the 64-bit PKTCOUNT executed-packet register pair (c19:18)
    uint64_t pktcnt;
    asm volatile(" %0 = c19:18\n" : "=r"(pktcnt));
    return pktcnt;
}

static inline int32_t htp_is_aligned(void * addr, uint32_t align) {
    return ((size_t) addr & (align - 1)) == 0;
}

static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
    return m * ((n + m - 1) / m);
}

static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
    const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
    asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
}

static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) {
    uint32_t left_off  = (size_t) addr & (chunk_size - 1);
    uint32_t right_off = left_off + n;
    return right_off <= chunk_size;
}
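A few hand-worked examples of these helpers (with the 128-byte HVX vector length, VLEN, used throughout the backend):

    // htp_round_up(200, 128)       -> 256  (a 50-element f32 row padded to two HVX vectors)
    // htp_is_aligned(p, 128)       -> non-zero iff p is HVX-vector aligned
    // htp_is_one_chunk(p, n, 8192) -> non-zero iff [p, p+n) does not cross an 8 KB boundary
    //                                 (chunk_size must be a power of two for the mask to work)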
static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
    char str[1024], *p = str;
    p += sprintf(p, "%s: ", pref);
    for (int i = 0; i < n; i++) {
        p += sprintf(p, "%d, ", x[i]);
    }
    FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
    char str[1024], *p = str;
    p += sprintf(p, "%s: ", pref);
    for (int i = 0; i < n; i++) {
        p += sprintf(p, "%d, ", x[i]);
    }
    FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
    char str[1024], *p = str;
    p += sprintf(p, "%s: ", pref);
    for (int i = 0; i < n; i++) {
        p += sprintf(p, "%d, ", (int) x[i]);
    }
    FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
    char str[1024], *p = str;
    p += sprintf(p, "%s: ", pref);
    for (int i = 0; i < n; i++) {
        p += sprintf(p, "%.6f, ", (float) x[i]);
    }
    FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
    char str[1024], *p = str;
    p += sprintf(p, "%s: ", pref);
    for (int i = 0; i < n; i++) {
        p += sprintf(p, "%.6f, ", x[i]);
    }
    FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_f32(char * pref, const float * x, uint32_t n) {
    uint32_t n0 = n / 16;
    uint32_t n1 = n % 16;

    uint32_t i = 0;
    for (; i < n0; i++) {
        htp_dump_fp32_line(pref, x + (16 * i), 16);
    }
    if (n1) {
        htp_dump_fp32_line(pref, x + (16 * i), n1);
    }
}

static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) {
    uint32_t n0 = n / 16;
    uint32_t n1 = n % 16;

    uint32_t i = 0;
    for (; i < n0; i++) {
        htp_dump_fp16_line(pref, x + (16 * i), 16);
    }
    if (n1) {
        htp_dump_fp16_line(pref, x + (16 * i), n1);
    }
}

#endif /* OPS_UTILS_H */
@@ -0,0 +1,418 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#define htp_rope_preamble                \
    const uint32_t ne00 = src0->ne[0];   \
    const uint32_t ne01 = src0->ne[1];   \
    const uint32_t ne02 = src0->ne[2];   \
    const uint32_t ne03 = src0->ne[3];   \
                                         \
    const uint32_t ne0 = dst->ne[0];     \
    const uint32_t ne1 = dst->ne[1];     \
    const uint32_t ne2 = dst->ne[2];     \
    const uint32_t ne3 = dst->ne[3];     \
                                         \
    const uint32_t nb00 = src0->nb[0];   \
    const uint32_t nb01 = src0->nb[1];   \
    const uint32_t nb02 = src0->nb[2];   \
    const uint32_t nb03 = src0->nb[3];   \
                                         \
    const uint32_t nb0 = dst->nb[0];     \
    const uint32_t nb1 = dst->nb[1];     \
    const uint32_t nb2 = dst->nb[2];     \
    const uint32_t nb3 = dst->nb[3];

struct rope_th_ctx {
    int32_t n_dims;
    int32_t mode;
    int32_t n_ctx_orig;
    int32_t sections[4];

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    float theta_scale;
    float corr_dims[2];

    struct htp_ops_context * octx;
};

static float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);

    return (1 - MIN(1, MAX(0, y)));
}

static void rope_cache_init(const float theta_base,
                            float freq_scale,
                            const float * freq_factors,
                            float * corr_dims,
                            uint32_t ne0,
                            float ext_factor,
                            float mscale,
                            float * cache,
                            float theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
    float theta = theta_base;

    for (uint32_t i0 = 0; i0 < ne0; i0 += 2) {
        const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f;

        float theta_extrap = theta / ff;

        // Get n-d rotational scaling corrected for extrapolation
        float theta_interp = freq_scale * theta_extrap;
        float theta2       = theta_interp;

        if (ext_factor != 0.0f) {
            float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
            theta2         = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

            // Get n-d magnitude scaling corrected for interpolation
            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
        }

        cache[i0 + 0] = cosf(theta2) * mscale;
        cache[i0 + 1] = sinf(theta2) * mscale;

        theta *= theta_scale;
    }
}
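As a hand-worked check of the cache layout (illustrative numbers, not from the patch): with ne0 = 4, theta_base = 7 (the token position), theta_scale = 0.1, freq_scale = 1, ext_factor = 0, mscale = 1 and no freq_factors:

    // cache[0] = cos(7.0), cache[1] = sin(7.0)   // pair 0: theta = 7.0
    // cache[2] = cos(0.7), cache[3] = sin(0.7)   // pair 1: theta = 7.0 * 0.1
    // i.e. even entries hold cos(theta) * mscale, odd entries sin(theta) * mscale,
    // one (cos, sin) pair per rotated element pair.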
#ifndef M_PI
# define M_PI 3.1415926535897932384626433
#endif

static void rope_corr_dims(int n_dims,
                           int n_ctx_orig,
                           float freq_base,
                           float beta_fast,
                           float beta_slow,
                           float * dims) {
    float start = floorf(n_dims * logf(n_ctx_orig / (beta_fast * 2 * (float) M_PI)) / (2 * logf(freq_base)));
    float end   = ceilf(n_dims * logf(n_ctx_orig / (beta_slow * 2 * (float) M_PI)) / (2 * logf(freq_base)));
    dims[0] = MAX(0, start);
    dims[1] = MIN(n_dims - 1, end);
}
static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) {
    memset(rope_ctx, 0, sizeof(struct rope_th_ctx));

    const int32_t * op_params = &octx->op_params[0];

    rope_ctx->n_dims     = ((const int32_t *) op_params)[1];
    rope_ctx->mode       = ((const int32_t *) op_params)[2];
    rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4];

    memcpy(&rope_ctx->freq_base,   (int32_t *) op_params + 5,  sizeof(float));
    memcpy(&rope_ctx->freq_scale,  (int32_t *) op_params + 6,  sizeof(float));
    memcpy(&rope_ctx->ext_factor,  (int32_t *) op_params + 7,  sizeof(float));
    memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8,  sizeof(float));
    memcpy(&rope_ctx->beta_fast,   (int32_t *) op_params + 9,  sizeof(float));
    memcpy(&rope_ctx->beta_slow,   (int32_t *) op_params + 10, sizeof(float));
    memcpy(&rope_ctx->sections,    (int32_t *) op_params + 11, sizeof(int) * 4);

    rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims);

    rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast,
                   rope_ctx->beta_slow, rope_ctx->corr_dims);

    rope_ctx->octx = octx;
    FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims,
         rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor);
}
static void hvx_calc_rope_f32(const float * restrict src0,
                              float * restrict dst,
                              const int num_elems,
                              const float * restrict theta_cache) {
    // Scalar reference:
    //   for (int i = 0; i < num_elems; i += 2) {
    //       const float cos_theta = theta_cache[i + 0];
    //       const float sin_theta = theta_cache[i + 1];
    //       const float x0 = src[0];
    //       const float x1 = src[1];
    //       dst[0] = x0*cos_theta - x1*sin_theta;
    //       dst[1] = x0*sin_theta + x1*cos_theta;
    //       src += 2; dst += 2;
    //   }

    const uint8_t * restrict src0_curr  = (const uint8_t *) src0;
    const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache;
    uint8_t * restrict dst_curr         = (uint8_t *) dst;

    int step_of_1 = num_elems >> 6; // 6 because we process two vectors (64 fp32 elements) at once

    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v0 = *(HVX_Vector *) src0_curr;
        HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN);

        HVX_Vector v2 = *(HVX_Vector *) theta_curr;
        HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN);

        HVX_VectorPair vx0_x1   = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1
        HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta

        HVX_Vector vx0_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_lo_W(vcos_sin));
        HVX_Vector vx0_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_lo_W(vx0_x1), Q6_V_hi_W(vcos_sin));
        HVX_Vector vx1_c = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_lo_W(vcos_sin));
        HVX_Vector vx1_s = Q6_Vqf32_vmpy_VsfVsf(Q6_V_hi_W(vx0_x1), Q6_V_hi_W(vcos_sin));

        HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

        HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4);

        *(HVX_Vector *) dst_curr          = Q6_V_lo_W(vstore);
        *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore);

        src0_curr  += 2 * VLEN;
        theta_curr += 2 * VLEN;
        dst_curr   += 2 * VLEN;
    }
}
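The deal/shuffle pair is what keeps the rotation fully vectorized; schematically, per loop iteration:

    // load   : v0,v1 = 64 interleaved (x0, x1) pairs from src0
    // vdeal  : lo(vx0_x1) = all x0 lanes, hi(vx0_x1) = all x1 lanes
    //          (same de-interleave for the (cos, sin) pairs in theta_cache)
    // compute: y0 = x0*cos - x1*sin,  y1 = x0*sin + x1*cos   (in qf32)
    // vshuff : re-interleave (y0, y1) back to the original pair layout for the store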
static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                         const uint32_t ir0,
                         const uint32_t ir1,
                         int nth,
                         int ith,
                         int opt_path) {
    struct htp_ops_context * octx = rope_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    const struct htp_tensor * src2 = &octx->src2;
    struct htp_tensor * dst        = &octx->dst;

    htp_rope_preamble;

    const int32_t * pos = (const int32_t *) src1->data;

    float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01));

    const float * freq_factors = NULL;
    if (src2->data) {
        freq_factors = (const float *) src2->data;
    }

    int ir = 0;

    for (uint32_t i3 = 0; i3 < ne3; i3++) {     // batch
        for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
            const int32_t p = pos[i2];

            rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                            rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

            for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                if (ir++ < ir0) {
                    continue;
                }
                if (ir > ir1) {
                    break;
                }

                const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                float * dst_data  = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);

                const float * src_loc = src;
                float * dst_data_loc  = dst_data;

                if (1 == opt_path) {
                    hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                } else {
                    for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                        const float cos_theta = wp0[i0 + 0];
                        const float sin_theta = wp0[i0 + 1];

                        const float x0 = src_loc[0];
                        const float x1 = src_loc[1];

                        dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
                        dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta;

                        src_loc += 2;
                        dst_data_loc += 2;
                    }
                }

                // Pass through the non-rotated tail of the row
                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
                    dst_data_loc[0] = src_loc[0];
                    dst_data_loc[1] = src_loc[1];

                    src_loc += 2;
                    dst_data_loc += 2;
                }
            }
        }
    }
}
static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) {
    struct htp_ops_context * octx = rope_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor * dst        = &octx->dst;

    htp_rope_preamble;

    const uint32_t src0_nrows            = ne01 * ne02 * ne03; // src0 rows
    const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if ((0 == htp_is_aligned((void *) src0->data, VLEN)) || (0 == htp_is_aligned((void *) src1->data, VLEN)) ||
        (0 == htp_is_aligned((void *) dst->data, VLEN))) {
        FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n");
        is_aligned = 0;
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
    struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data;

    rope_job_f32_per_thread(rope_ctx, n, i);
}
static int execute_op_rope_f32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    const struct htp_tensor * src2 = &octx->src2;
    struct htp_tensor * dst        = &octx->dst;

    worker_callback_t op_func;
    const char * op_type = NULL;

    struct rope_th_ctx rope_ctx;

    switch (octx->op) {
        case HTP_OP_ROPE:
            op_func = rope_job_dispatcher_f32;
            op_type = "rope-f32";

            init_rope_ctx(&rope_ctx, octx);
            break;

        default:
            FARF(ERROR, "Unsupported Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const uint32_t n_threads = octx->n_threads;

    const size_t src0_row_size = src0->nb[1];
    const size_t src1_row_size = src0_row_size; // src1 spad rows are padded like src0 rows
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    // N rows per thread, padded to HVX vector size
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

    if (src2->ne[0]) {
        FARF(HIGH,
             "%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u "
             "dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2],
             dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
    } else {
        FARF(HIGH,
             "%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
             octx->dst_spad.size);
    }

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
             spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;

    uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs = MIN(n_threads, src0_nrows);
        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
        worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs);
    }

    return err;
}

int op_rope(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_rope_f32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}
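A worked scratchpad-budget example (illustrative numbers, not from the patch): for f32 rows of ne0 = 1000 elements, the row size is 4000 bytes, so

    // htp_round_up(4000, 128) = 4096 bytes per thread and per spad;
    // with n_threads = 4:  3 spads * 4 threads * 4096 B = 48 KiB,
    // which must fit inside the session's VTCM reservation (vtcm_size),
    // otherwise the op fails with HTP_STATUS_VTCM_TOO_SMALL.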
@@ -0,0 +1,402 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif
#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#define htp_softmax_preamble3                              \
    const uint32_t ne00 = src0->ne[0];                     \
    const uint32_t ne01 = src0->ne[1];                     \
    const uint32_t ne02 = src0->ne[2];                     \
    const uint32_t ne03 = src0->ne[3];                     \
                                                           \
    const uint32_t nb00 = src0->nb[0];                     \
    const uint32_t nb01 = src0->nb[1];                     \
    const uint32_t nb02 = src0->nb[2];                     \
    const uint32_t nb03 = src0->nb[3];                     \
                                                           \
    const uint32_t ne10 = (src1->ne[0]) ? src1->ne[0] : 1; \
    const uint32_t ne11 = (src1->ne[0]) ? src1->ne[1] : 1; \
    const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; \
    const uint32_t ne13 = (src1->ne[0]) ? src1->ne[3] : 1; \
                                                           \
    const uint32_t nb10 = (src1->ne[0]) ? src1->nb[0] : 1; \
    const uint32_t nb11 = (src1->ne[0]) ? src1->nb[1] : 1; \
    const uint32_t nb12 = (src1->ne[0]) ? src1->nb[2] : 1; \
    const uint32_t nb13 = (src1->ne[0]) ? src1->nb[3] : 1; \
                                                           \
    const uint32_t ne0 = dst->ne[0];                       \
    const uint32_t ne1 = dst->ne[1];                       \
    const uint32_t ne2 = dst->ne[2];                       \
    const uint32_t ne3 = dst->ne[3];                       \
                                                           \
    const uint32_t nb0 = dst->nb[0];                       \
    const uint32_t nb1 = dst->nb[1];                       \
    const uint32_t nb2 = dst->nb[2];                       \
    const uint32_t nb3 = dst->nb[3];

struct softmax_th_ctx {
    bool use_f16;
    bool use_src1;
    uint32_t n_head;
    uint32_t n_head_log2;

    float scale;
    float max_bias;
    float m0;
    float m1;

    struct htp_ops_context * octx;
};

static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) {
    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;

    memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx));

    memcpy(&softmax_ctx->scale,    (float *) octx->op_params,     sizeof(float));
    memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float));

    softmax_ctx->n_head      = src0->ne[2];
    softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head));

    softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2);
    softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2);

    softmax_ctx->use_src1 = (src1->ne[0] != 0);
    softmax_ctx->use_f16  = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16);

    softmax_ctx->octx = octx;
}
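A quick numeric check of the ALiBi parameters (hand-worked example, not from the patch): with max_bias = 8 and n_head = 4, n_head_log2 = 4, so

    // m0 = 2^(-8/4) = 0.25, m1 = 2^(-4/4) = 0.5;
    // heads h = 0..3 are all < n_head_log2, so softmax_htp_f32 below uses
    // slope = m0^(h+1) = 0.25, 0.0625, ...; with max_bias = 0 the slope is 1.0f.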
static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src,
                                      uint8_t * restrict dst,
                                      const int num_elems,
                                      float scale,
                                      const uint8_t * restrict mask,
                                      float slope) {
    const uint8_t * restrict src_curr  = src;
    uint8_t * restrict dst_curr        = dst;
    const uint8_t * restrict mask_curr = mask;

    HVX_Vector scale_vec = hvx_vec_splat_fp32(scale);
    HVX_Vector slope_vec = hvx_vec_splat_fp32(slope);

    int step_of_1 = num_elems >> 5; // 32 fp32 elements per HVX vector

#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = *(HVX_Vector *) src_curr;

        HVX_Vector v3 = *(HVX_Vector *) mask_curr;

        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);

        HVX_Vector v4 = Q6_Vqf32_vmpy_VsfVsf(v3, slope_vec);

        HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(v2, v4);

        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v5);

        src_curr  += VLEN;
        dst_curr  += VLEN;
        mask_curr += VLEN;
    }
}

static void hvx_fast_softmax_f32(const uint8_t * restrict src,
                                 uint8_t * restrict dst,
                                 uint8_t * restrict pad,
                                 const int num_elems) {
    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
    HVX_Vector * restrict v_pad       = (HVX_Vector *) pad;
    HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;

    HVX_Vector sum_vec = Q6_V_vsplat_R(0x00000000);
    HVX_Vector max_vec = hvx_vec_splat_fp32(((const float *) src)[0]);
    HVX_Vector zero_v  = Q6_V_vzero();
    HVX_Vector one_v   = hvx_vec_splat_fp32(1.0);

    int step_of_1 = num_elems >> 5; // 32 fp32 elements per HVX vector

    // pass 1: row max
#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        max_vec       = Q6_Vsf_vmax_VsfVsf(max_vec, v1);
    }

    HVX_Vector v = hvx_vec_reduce_max_fp32(max_vec);
    max_vec      = hvx_vec_repl4(v);

    // pass 2: exp(x - max) into the scratch pad, accumulating the sum
#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, max_vec);

        HVX_Vector v3 = hvx_vec_exp_fp32(Q6_Vsf_equals_Vqf32(v2));

        sum_vec = Q6_Vqf32_vadd_VsfVsf(Q6_Vsf_equals_Vqf32(sum_vec), v3);

        v_pad[i] = v3;
    }

    v       = hvx_vec_qf32_reduce_sum(sum_vec);
    sum_vec = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(v));

    // guard against a non-positive sum: fall back to scale = 1
    HVX_VectorPred pos_sum = Q6_Q_vcmp_gt_VwVw(sum_vec, zero_v);
    HVX_Vector v4          = hvx_vec_inverse_fp32(sum_vec);
    HVX_Vector scale_vec   = Q6_V_vmux_QVV(pos_sum, v4, one_v);

    // pass 3: normalize
#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_pad[i];
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_vec);
        v_dst[i]      = Q6_Vsf_equals_Vqf32(v2);
    }
}
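For reference, a scalar sketch of the same three passes (numerically stable softmax with the same non-positive-sum guard; this mirrors, not replaces, the HVX code above):

    static void softmax_ref_f32(const float * src, float * dst, int n) {
        float maxv = src[0];
        for (int i = 1; i < n; i++) {          // pass 1: row max
            maxv = fmaxf(maxv, src[i]);
        }
        float sum = 0.0f;
        for (int i = 0; i < n; i++) {          // pass 2: exp(x - max), accumulate
            dst[i] = expf(src[i] - maxv);
            sum   += dst[i];
        }
        const float scale = (sum > 0.0f) ? 1.0f / sum : 1.0f;
        for (int i = 0; i < n; i++) {          // pass 3: normalize
            dst[i] *= scale;
        }
    }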
static float hvx_softmax_f32(const uint8_t * restrict src,
                             uint8_t * restrict dst,
                             uint8_t * restrict spad,
                             const int num_elems,
                             const float max) {
    hvx_sub_scalar_f32(src, max, spad, num_elems);

    hvx_exp_f32(spad, dst, num_elems, false);

    float sum = hvx_self_sum_f32(dst, num_elems);

    return sum;
}

static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) {
    struct htp_ops_context * octx = softmax_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    const struct htp_tensor * dst  = &octx->dst;

    htp_softmax_preamble3;

    uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01);
    uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01);
    uint8_t * dst_spad_data  = octx->dst_spad.data + (ith * nb1);

    float * wp0 = (float *) src0_spad_data;
    float * wp1 = (float *) src1_spad_data;
    float * wp2 = (float *) dst_spad_data;

    for (uint32_t i03 = 0; i03 < ne03; i03++) {
        for (uint32_t i02 = 0; i02 < ne02; i02++) {
            for (uint32_t i01 = ith; i01 < ne01; i01 += nth) {
                const uint32_t i11 = i01;
                const uint32_t i12 = i02 % ne12;
                const uint32_t i13 = i03 % ne13;

                // ALiBi
                const uint32_t h = i02; // head

                const float slope = (softmax_ctx->max_bias > 0.0f) ?
                                        h < softmax_ctx->n_head_log2 ?
                                        powf(softmax_ctx->m0, h + 1) :
                                        powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) :
                                        1.0f;

                float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03);
                float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3);

                // broadcast the mask across rows
                __fp16 * mp_f16 = (softmax_ctx->use_src1) ?
                                      (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
                                      NULL;
                float * mp_f32 = (softmax_ctx->use_src1) ?
                                     (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) :
                                     NULL;

                if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) {
                    hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale,
                                              (const uint8_t *) mp_f32, slope);
                } else {
                    hvx_scale_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale);
                    if (mp_f32) {
                        if (softmax_ctx->use_f16) {
                            for (int i = 0; i < ne00; ++i) {
                                wp0[i] += slope * (float) mp_f16[i];
                            }
                        } else {
                            for (int i = 0; i < ne00; ++i) {
                                wp0[i] += slope * mp_f32[i];
                            }
                        }
                    }
                }

                if (1 == opt_path) {
                    hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00);
                } else {
                    float max = hvx_self_max_f32((const uint8_t *) wp0, ne00);
                    float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max);
                    sum       = sum > 0.0 ? (1.0 / sum) : 1;
                    hvx_scale_f32((const uint8_t *) wp2, (uint8_t *) dp, ne00, sum);
                }
            }
        }
    }
}

static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) {
    struct htp_ops_context * octx = softmax_ctx->octx;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor * dst        = &octx->dst;

    htp_softmax_preamble3;

    const uint32_t src0_nrows            = ne01 * ne02 * ne03; // src0 rows
    const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread;

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
        is_aligned = 0;
        FARF(HIGH, "softmax-f32: unaligned addresses in softmax op, possibly slower execution\n");
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    softmax_htp_f32(nth, ith, softmax_ctx, opt_path);

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth,
         softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13,
         ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) {
    struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data;
    softmax_job_f32_per_thread(p_softmax_ctx, n, i);
}

static int execute_op_softmax_f32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    const struct htp_tensor * src1 = &octx->src1;
    struct htp_tensor * dst        = &octx->dst;

    worker_callback_t op_func;
    const char * op_type = NULL;

    struct softmax_th_ctx softmax_ctx;

    switch (octx->op) {
        case HTP_OP_SOFTMAX:
            op_func = softmax_job_dispatcher_f32;
            op_type = "softmax-f32";

            init_softmax_ctx(&softmax_ctx, octx);
            break;

        default:
            FARF(ERROR, "Unsupported Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const uint32_t n_threads = octx->n_threads;

    const size_t src0_row_size = src0->nb[1];
    const size_t src1_row_size = src0_row_size;
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    // N rows per thread, padded to HVX vector size
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;
    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

    if (src1->ne[0]) {
        FARF(HIGH,
             "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n",
             op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2],
             src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size,
             octx->dst_spad.size);
    } else {
        FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
             src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
             octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
    }

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
             spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;

    uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs = MIN(n_threads, src0_nrows);
        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;
        worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs);
    }

    return err;
}

int op_softmax(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_softmax_f32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}
|
||||||
|
|
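For reference, the op dispatchers above all split work by rows using the same ceiling division, `src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs`. A minimal sketch of the per-thread range that falls out of that split (the helper name is hypothetical, not part of the backend):

// sketch only: how a job index maps to a row range under the ceiling division
// used above; thread_row_range is an illustrative helper, not backend code
static void thread_row_range(uint32_t nrows, uint32_t nrows_per_thread, uint32_t ith,
                             uint32_t * start, uint32_t * end) {
    uint32_t r0 = nrows_per_thread * ith;
    uint32_t r1 = r0 + nrows_per_thread;
    *start = (r0 < nrows) ? r0 : nrows;  // threads past the end get an empty range
    *end   = (r1 < nrows) ? r1 : nrows;  // the last range is clamped to nrows
}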
@@ -0,0 +1,255 @@
#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wunused-function"
#pragma clang diagnostic ignored "-Wunused-but-set-variable"

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif

#include <HAP_farf.h>
#include <HAP_mem.h>
#include <HAP_perf.h>
#include <HAP_ps.h>
#include <hexagon_protos.h>
#include <hexagon_types.h>
#include <math.h>
#include <qurt_thread.h>
#include <string.h>

#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-dma.h"
#include "htp-msg.h"
#include "htp-ops.h"
#include "hvx-utils.h"
#include "ops-utils.h"

#define htp_unary_preamble            \
    const uint32_t ne00 = src->ne[0]; \
    const uint32_t ne01 = src->ne[1]; \
    const uint32_t ne02 = src->ne[2]; \
    const uint32_t ne03 = src->ne[3]; \
                                      \
    const uint32_t ne0 = dst->ne[0];  \
    const uint32_t ne1 = dst->ne[1];  \
    const uint32_t ne2 = dst->ne[2];  \
    const uint32_t ne3 = dst->ne[3];  \
                                      \
    const uint32_t nb00 = src->nb[0]; \
    const uint32_t nb01 = src->nb[1]; \
    const uint32_t nb02 = src->nb[2]; \
    const uint32_t nb03 = src->nb[3]; \
                                      \
    const uint32_t nb0 = dst->nb[0];  \
    const uint32_t nb1 = dst->nb[1];  \
    const uint32_t nb2 = dst->nb[2];  \
    const uint32_t nb3 = dst->nb[3];

static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
                                  uint8_t * restrict       dst,
                                  uint8_t * restrict       pad,
                                  const int                num_elems,
                                  float                    epsilon) {
    const HVX_Vector * restrict v_src = (HVX_Vector *) src;
    HVX_Vector * restrict       v_dst = (HVX_Vector *) dst;

    HVX_Vector sum_v     = Q6_V_vsplat_R(0x00000000);
    HVX_Vector epsilon_v = hvx_vec_splat_fp32(epsilon);

    int step_of_1 = num_elems >> 5;
#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
        sum_v         = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
    }

    HVX_Vector reduced_sum = hvx_vec_qf32_reduce_sum(sum_v);
    sum_v                  = hvx_vec_repl4(Q6_Vsf_equals_Vqf32(reduced_sum));

    HVX_Vector t_v            = hvx_vec_splat_fp32((float) num_elems);
    HVX_Vector denom_v        = hvx_vec_inverse_fp32(t_v);
    HVX_Vector mean_v         = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
    HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);

    HVX_Vector scale_v = hvx_vec_rsqrt_fp32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));

#pragma unroll(4)
    for (int i = 0; i < step_of_1; i++) {
        HVX_Vector v1 = v_src[i];
        HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
        v_dst[i]      = Q6_Vsf_equals_Vqf32(v2);
    }
}

static void rms_norm_htp_f32(const float * restrict src,
                             float * restrict       dst,
                             uint8_t * restrict     spad,
                             const uint32_t         num_rows,
                             const uint32_t         row_elems,
                             const size_t           row_size,
                             int32_t *              op_params,
                             int                    opt_path) {
    float epsilon = 0.f;
    memcpy(&epsilon, op_params, sizeof(float));

    for (uint32_t ir = 0; ir < num_rows; ir++) {
        const float * restrict src_local = src + (ir * row_elems);
        float * restrict       dst_local = dst + (ir * row_elems);

        if (ir + 1 < num_rows) {
            htp_l2fetch(src_local + row_elems, 1, row_size, row_size);
        }

        if (1 == opt_path) {
            hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon);
        } else {
            float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems);

            const float mean  = sum / row_elems;
            const float scale = 1.0f / sqrtf(mean + epsilon);

            hvx_scale_f32((const uint8_t *) src_local, (uint8_t *) dst_local, row_elems, scale);
        }
    }
}

static void unary_job_f32_per_thread(const struct htp_tensor * src,
                                     struct htp_tensor *       dst,
                                     uint8_t *                 spad,
                                     int                       htp_op,
                                     int32_t *                 op_params,
                                     uint32_t                  nth,
                                     uint32_t                  ith,
                                     uint32_t                  src0_nrows_per_thread) {
    htp_unary_preamble;

    const size_t src0_row_size = nb01;
    const size_t dst_row_size  = nb1;

    const uint32_t src0_nrows = ne01 * ne02 * ne03;  // src0 rows

    const uint32_t src0_start_row = src0_nrows_per_thread * ith;
    const uint32_t src0_end_row   = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows);

    // no work for this thread
    if (src0_start_row >= src0_end_row) {
        return;
    }

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    int is_aligned = 1;
    int opt_path   = 0;
    if ((0 == htp_is_aligned((void *) src->data, VLEN)) || (0 == htp_is_aligned((void *) dst->data, VLEN))) {
        is_aligned = 0;
        FARF(HIGH, "unary-f32: unaligned addresses in unary op, possibly slower execution\n");
    }
    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
        opt_path = 1;
    }

    const uint8_t * restrict data_src = (const uint8_t *) src->data;
    uint8_t * restrict       data_dst = (uint8_t *) dst->data;

    const float * restrict src_th  = (float *) (data_src + (src0_start_row * src0_row_size));
    float * restrict       dst_th  = (float *) (data_dst + (src0_start_row * dst_row_size));
    uint8_t * restrict     spad_th = (uint8_t *) spad + (ith * nb01);

    switch (htp_op) {
        case HTP_OP_RMS_NORM:
            rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path);
            break;

        default:
            break;
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0],
         src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2],
         dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}

static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) {
    struct htp_ops_context * octx = (struct htp_ops_context *) data;

    unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i,
                             octx->src0_nrows_per_thread);
}

static int execute_op_unary_f32(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    const struct htp_tensor * src0 = &octx->src0;
    struct htp_tensor *       dst  = &octx->dst;

    worker_callback_t unary_op_func;
    const char *      op_type = NULL;

    switch (octx->op) {
        case HTP_OP_RMS_NORM:
            unary_op_func = unary_job_dispatcher_f32;
            op_type       = "rmsnorm-f32";
            break;

        default:
            FARF(ERROR, "Unsupported unary Op %u\n", octx->op);
            return HTP_STATUS_NO_SUPPORT;
    }

    const int      n_threads  = octx->n_threads;
    const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

    const size_t src0_row_size = src0->nb[1];
    const size_t dst_row_size  = dst->nb[1];

    // VTCM scratchpads for all tensors
    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * n_threads;
    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * n_threads;

    size_t spad_size = octx->src0_spad.size + octx->dst_spad.size;

    FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type,
         src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
         octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);

    // Make sure the reserved vtcm size is sufficient
    if (octx->ctx->vtcm_size < spad_size) {
        FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
             spad_size);
        return HTP_STATUS_VTCM_TOO_SMALL;
    }

    octx->src0_spad.data = octx->ctx->vtcm_base;
    octx->dst_spad.data  = octx->src0_spad.data + octx->src0_spad.size;

    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        uint32_t n_jobs = MIN(n_threads, src0_nrows);

        octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs;

        worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs);
    }

    return err;
}

int op_unary(struct htp_ops_context * octx) {
    int err = HTP_STATUS_OK;

    switch (octx->src0.type) {
        case HTP_TYPE_F32:
            err = execute_op_unary_f32(octx);
            break;

        default:
            err = HTP_STATUS_NO_SUPPORT;
            break;
    }

    return err;
}
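As a cross-check for the HVX path above: both branches of `rms_norm_htp_f32` compute the same thing, and the slow path maps directly onto a scalar reference. A minimal sketch (the helper name is hypothetical, not part of the file):

#include <math.h>
#include <stdint.h>

// scalar reference for RMS norm over one row of n floats; mirrors the
// non-optimized branch above (sum of squares -> mean -> 1/sqrt(mean+eps) scale)
static void rms_norm_ref_f32(const float * src, float * dst, uint32_t n, float epsilon) {
    float sum = 0.0f;
    for (uint32_t i = 0; i < n; i++) {
        sum += src[i] * src[i];             // sum of squares
    }
    const float mean  = sum / n;
    const float scale = 1.0f / sqrtf(mean + epsilon);
    for (uint32_t i = 0; i < n; i++) {
        dst[i] = src[i] * scale;            // same scaling the HVX path applies per vector
    }
}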
@@ -0,0 +1,297 @@
#include "worker-pool.h"

#include <qurt.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef HTP_DEBUG
# define FARF_HIGH 1
#endif

#include "HAP_farf.h"

#define WORKER_THREAD_STACK_SZ  (2 * 16384)
#define LOWEST_USABLE_QURT_PRIO (254)

struct worker_pool_s;

// internal per-worker context, passed to each worker thread
typedef struct {
    struct worker_pool_s * pool;
    unsigned int           id;
} worker_context_t;

// internal state of a worker pool instance
typedef struct worker_pool_s {
    worker_pool_job_t job[MAX_NUM_WORKERS];      // list of job descriptors
    qurt_thread_t     thread[MAX_NUM_WORKERS];   // thread IDs of the workers
    worker_context_t  context[MAX_NUM_WORKERS];  // worker contexts
    void *            stack[MAX_NUM_WORKERS];    // thread stack pointers
    unsigned int      n_threads;                 // number of workers in this pool

    atomic_uint seqn;       // seqno used to detect new jobs
    atomic_uint next_job;   // next job index
    atomic_uint n_pending;  // number of pending jobs
    atomic_uint n_jobs;     // number of current jobs
    atomic_bool killed;     // threads need to exit
} worker_pool_t;

static void worker_pool_main(void * context) {
    worker_context_t * me   = (worker_context_t *) context;
    worker_pool_t *    pool = me->pool;

    FARF(HIGH, "worker-pool: thread %u started", me->id);

    unsigned int prev_seqn = 0;
    while (!atomic_load(&pool->killed)) {
        unsigned int seqn = atomic_load(&pool->seqn);
        if (seqn == prev_seqn) {
            // Nothing to do
            qurt_futex_wait(&pool->seqn, prev_seqn);
            continue;
        }

        // New job
        prev_seqn = seqn;

        unsigned int n = atomic_load(&pool->n_jobs);
        unsigned int i = atomic_fetch_add(&pool->next_job, 1);
        if (i >= n) {
            // Spurious wakeup
            continue;
        }

        pool->job[i].func(n, i, pool->job[i].data);

        atomic_fetch_sub(&pool->n_pending, 1);
    }

    FARF(HIGH, "worker-pool: thread %u stopped", me->id);
}

AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context, uint32_t n_threads, uint32_t stack_size) {
    int err = 0;

    if (NULL == context) {
        FARF(ERROR, "NULL context passed to worker_pool_init().");
        return AEE_EBADPARM;
    }

    // Allocations
    int size = (stack_size * n_threads) + (sizeof(worker_pool_t));

    unsigned char * mem_blob = (unsigned char *) malloc(size);
    if (!mem_blob) {
        FARF(ERROR, "Could not allocate memory for worker pool!!");
        return AEE_ENOMEMORY;
    }

    worker_pool_t * me = (worker_pool_t *) (mem_blob + stack_size * n_threads);

    // name for the first worker, useful in debugging threads
    char name[19];
    snprintf(name, 12, "0x%8x:", (int) me);
    strcat(name, "worker0");
    me->n_threads = n_threads;

    // initializations
    for (unsigned int i = 0; i < me->n_threads; i++) {
        me->stack[i]  = NULL;
        me->thread[i] = 0;

        me->context[i].id   = i;
        me->context[i].pool = me;
    }

    // initialize job queue
    me->n_pending = 0;
    me->n_jobs    = 0;
    me->next_job  = 0;
    me->seqn      = 0;
    me->killed    = 0;

    // launch the workers
    qurt_thread_attr_t attr;
    qurt_thread_attr_init(&attr);

    for (unsigned int i = 0; i < me->n_threads; i++) {
        // set up stack
        me->stack[i] = mem_blob;
        mem_blob += stack_size;
        qurt_thread_attr_set_stack_addr(&attr, me->stack[i]);
        qurt_thread_attr_set_stack_size(&attr, stack_size);

        // set up name
        qurt_thread_attr_set_name(&attr, name);
        name[17] = (name[17] + 1);
        // name threads context:worker0, context:worker1, .. (recycle at 9, but num threads should be less than that anyway)
        if (name[17] > '9') {
            name[17] = '0';
        }

        // set up priority - by default, match the creating thread's prio
        int prio = qurt_thread_get_priority(qurt_thread_get_id());

        if (prio < 1) {
            prio = 1;
        }
        if (prio > LOWEST_USABLE_QURT_PRIO) {
            prio = LOWEST_USABLE_QURT_PRIO;
        }

        qurt_thread_attr_set_priority(&attr, prio);

        // launch
        err = qurt_thread_create(&me->thread[i], &attr, worker_pool_main, (void *) &me->context[i]);
        if (err) {
            FARF(ERROR, "Could not launch worker threads!");
            worker_pool_release((worker_pool_context_t *) &me);
            return AEE_EQURTTHREADCREATE;
        }
    }
    *context = (worker_pool_context_t *) me;
    return AEE_SUCCESS;
}

AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads) {
    return worker_pool_init_with_stack_size(context, n_threads, WORKER_THREAD_STACK_SZ);
}

// clean up worker pool
void worker_pool_release(worker_pool_context_t * context) {
    worker_pool_t * me = (worker_pool_t *) *context;

    // nothing to do if no worker pool exists
    if (NULL == me) {
        return;
    }

    atomic_store(&me->killed, 1);
    atomic_fetch_add(&me->seqn, 1);
    qurt_futex_wake(&me->seqn, me->n_threads);

    // de-initializations
    for (unsigned int i = 0; i < me->n_threads; i++) {
        if (me->thread[i]) {
            int status;
            (void) qurt_thread_join(me->thread[i], &status);
        }
    }

    // free allocated memory (it was allocated as a single blob starting at stack[0])
    if (me->stack[0]) {
        free(me->stack[0]);
    }

    *context = NULL;
}

// run jobs
AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n) {
    worker_pool_t * me = (worker_pool_t *) context;
    if (NULL == me) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    if (n > me->n_threads) {
        FARF(ERROR, "worker-pool: invalid number of jobs %u for n-threads %u", n, me->n_threads);
        return AEE_EBADPARM;
    }

    memcpy(me->job, job, sizeof(worker_pool_job_t) * n);

    if (n > 1) {
        atomic_store(&me->next_job, 1);
        atomic_store(&me->n_jobs, n);
        atomic_store(&me->n_pending, n - 1);

        // wake up workers
        atomic_fetch_add(&me->seqn, 1);
        qurt_futex_wake(&me->seqn, n - 1);
    }

    // main thread runs job #0
    me->job[0].func(n, 0, me->job[0].data);

    if (n > 1) {
        while (atomic_load(&me->n_pending))
            ;
    }

    return 0;
}

// run func
AEEResult worker_pool_run_func(worker_pool_context_t context, worker_callback_t func, void * data, unsigned int n) {
    worker_pool_job_t job[n];

    for (unsigned int i = 0; i < n; i++) {
        job[i].func = func;
        job[i].data = data;
    }

    return worker_pool_run_jobs(context, job, n);
}

AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio) {
    worker_pool_t * me = (worker_pool_t *) context;

    // if no worker pool exists, return error.
    if (!me) {
        return AEE_ENOMORE;
    }

    int result = AEE_SUCCESS;
    if (prio < 1) {
        prio = 1;
    }
    if (prio > LOWEST_USABLE_QURT_PRIO) {
        prio = LOWEST_USABLE_QURT_PRIO;
    }

    for (unsigned int i = 0; i < me->n_threads; i++) {
        int res = qurt_thread_set_priority(me->thread[i], (unsigned short) prio);
        if (0 != res) {
            result = AEE_EBADPARM;
            FARF(ERROR, "QURT failed to set priority of thread %d, ERROR = %d", me->thread[i], res);
        }
    }

    return result;
}

AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids) {
    worker_pool_t * me = (worker_pool_t *) context;
    if (!me) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    for (unsigned int i = 0; i < me->n_threads; i++) {
        tids[i] = me->thread[i];
    }

    return AEE_SUCCESS;
}

AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio) {
    worker_pool_t * me = (worker_pool_t *) context;
    if (!me) {
        FARF(ERROR, "worker-pool: invalid context");
        return AEE_EBADPARM;
    }

    int priority = qurt_thread_get_priority(me->thread[0]);
    if (priority > 0) {
        *prio = priority;
        return 0;
    } else {
        *prio = 0;
        return AEE_EBADSTATE;
    }
}
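A note on the synchronization design above: new work is announced by bumping the `seqn` futex word and waking the workers, and each woken worker claims a job index with an atomic fetch-add on `next_job`, so no per-job locking is needed; an index at or past `n_jobs` is simply treated as a spurious wakeup. The caller always runs job #0 on its own thread and then spin-waits on `n_pending`, which trades a little CPU for low dispatch latency on the short-lived per-op jobs this pool is built for.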
@@ -0,0 +1,57 @@
#ifndef HTP_WORKER_POOL_H
#define HTP_WORKER_POOL_H

// MACRO enables function to be visible in shared-library case.
#define WORKERPOOL_API __attribute__((visibility("default")))

#include <AEEStdDef.h>
#include <AEEStdErr.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/// signature of callbacks to be invoked by worker threads
typedef void (*worker_callback_t)(unsigned int n, unsigned int i, void *);

/// Typedef of worker_pool context
typedef void * worker_pool_context_t;

/// descriptor for requested callback
typedef struct {
    worker_callback_t func;
    void *            data;
} worker_pool_job_t;

/// Maximum supported number of worker threads.
#define MAX_NUM_WORKERS 10

// Initialize worker pool.
WORKERPOOL_API AEEResult worker_pool_init(worker_pool_context_t * context, uint32_t n_threads);

// Initialize worker pool with custom stack size
WORKERPOOL_API AEEResult worker_pool_init_with_stack_size(worker_pool_context_t * context,
                                                          uint32_t                n_threads,
                                                          uint32_t                stack_size);

// Kill worker threads and release worker pool resources
WORKERPOOL_API void worker_pool_release(worker_pool_context_t * context);

// Run jobs with the worker pool.
WORKERPOOL_API AEEResult worker_pool_run_jobs(worker_pool_context_t context, worker_pool_job_t * job, unsigned int n);

WORKERPOOL_API AEEResult worker_pool_run_func(worker_pool_context_t context,
                                              worker_callback_t     func,
                                              void *                data,
                                              unsigned int          n);

WORKERPOOL_API AEEResult worker_pool_set_thread_priority(worker_pool_context_t context, unsigned int prio);
WORKERPOOL_API AEEResult worker_pool_get_thread_priority(worker_pool_context_t context, unsigned int * prio);
WORKERPOOL_API AEEResult worker_pool_retrieve_thread_id(worker_pool_context_t context, unsigned int * tids);

#ifdef __cplusplus
}
#endif

#endif  // #ifndef HTP_WORKER_POOL_H
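Putting the API together, a minimal usage sketch (the callback, data, and thread count here are illustrative, not taken from the backend):

#include "worker-pool.h"

// job callback: n is the total job count, i is this job's index
static void my_job(unsigned int n, unsigned int i, void * data) {
    float * x = (float *) data;
    x[i] *= 2.0f;  // each job touches its own slot
    (void) n;
}

static int example(void) {
    worker_pool_context_t pool = NULL;
    if (worker_pool_init(&pool, 4) != AEE_SUCCESS) {  // 4 worker threads
        return -1;
    }
    float data[4] = { 1, 2, 3, 4 };
    worker_pool_run_func(pool, my_job, data, 4);  // job 0 runs on the caller's thread
    worker_pool_release(&pool);                   // joins workers and frees the blob
    return 0;
}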
@@ -0,0 +1 @@
0xffff
@@ -0,0 +1,39 @@
#!/bin/sh
#

# Basedir on device
basedir=/data/local/tmp/llama.cpp

branch=.
[ "$B" != "" ] && branch=$B

adbserial=
[ "$S" != "" ] && adbserial="-s $S"

model="Llama-3.2-3B-Instruct-Q4_0.gguf"
[ "$M" != "" ] && model="$M"

device="HTP0"
[ "$D" != "" ] && device="$D"

verbose=""
[ "$V" != "" ] && verbose="$V"

opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"

nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"

set -x

adb $adbserial shell " \
   cd $basedir; \
   LD_LIBRARY_PATH=$basedir/$branch/lib \
   ADSP_LIBRARY_PATH=$basedir/$branch/lib \
   $ndev $nhvx $opmask ./$branch/bin/llama-bench --device $device --mmap 0 -m $basedir/../gguf/$model \
        -t 4 --batch-size 128 -ngl 99 $@ \
"
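These wrappers are driven entirely by environment variables: S selects the adb serial, B the build branch directory on the device, M the model file, D the device, and OPMASK/NHVX/NDEV map to the corresponding GGML_HEXAGON_* knobs. A typical invocation would look something like `D=HTP0 M=my-model.gguf ./run-bench.sh -p 128` (the model name here is illustrative); extra arguments are passed through to the tool.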
@@ -0,0 +1,52 @@
#!/bin/sh
#

# Basedir on device
basedir=/data/local/tmp/llama.cpp

cli_opts=

branch=.
[ "$B" != "" ] && branch=$B

adbserial=
[ "$S" != "" ] && adbserial="-s $S"

model="Llama-3.2-3B-Instruct-Q4_0.gguf"
[ "$M" != "" ] && model="$M"

device="HTP0"
[ "$D" != "" ] && device="$D"

verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"

experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"

sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"

profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"

opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"

nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"

set -x

adb $adbserial shell " \
   cd $basedir; ulimit -c unlimited; \
   LD_LIBRARY_PATH=$basedir/$branch/lib \
   ADSP_LIBRARY_PATH=$basedir/$branch/lib \
   $verbose $experimental $sched $opmask $profile $nhvx $ndev \
      ./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
      -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
      -ngl 99 --device $device $cli_opts $@ \
"
@@ -0,0 +1,51 @@
#!/bin/sh
#

# Basedir on device
basedir=/data/local/tmp/llama.cpp

cli_opts=

branch=.
[ "$B" != "" ] && branch=$B

adbserial=
[ "$S" != "" ] && adbserial="-s $S"

device="HTP0"
[ "$D" != "" ] && device="$D"

verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V"

experimental=
[ "$E" != "" ] && experimental="GGML_HEXAGON_EXPERIMENTAL=$E"

sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"

profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"

opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"

nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

ndev=
[ "$NDEV" != "" ] && ndev="GGML_HEXAGON_NDEV=$NDEV"

hb=
[ "$HB" != "" ] && hb="GGML_HEXAGON_HOSTBUF=$HB"

set -x

tool=$1; shift

adb $adbserial shell " \
   cd $basedir; ulimit -c unlimited; \
   LD_LIBRARY_PATH=$basedir/$branch/lib \
   ADSP_LIBRARY_PATH=$basedir/$branch/lib \
   $verbose $experimental $sched $opmask $profile $nhvx $ndev $hb ./$branch/bin/$tool $@ \
"
@@ -0,0 +1 @@
This directory includes pytest-based scripts for running CI jobs on Qualcomm Device Cloud (QDC).
@@ -0,0 +1,25 @@
Appium-Python-Client==5.2.4
attrs==25.4.0
certifi==2025.10.5
exceptiongroup==1.3.0
h11==0.16.0
idna==3.11
iniconfig==2.1.0
outcome==1.3.0.post0
packaging==25.0
pluggy==1.6.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.2
pytest-dependency==0.6.0
selenium==4.36.0
setuptools==80.9.0
sniffio==1.3.1
sortedcontainers==2.4.0
tomli==2.3.0
trio==0.31.0
trio-websocket==0.12.2
typing_extensions==4.15.0
urllib3==2.5.0
websocket-client==1.9.0
wsproto==1.2.0
@@ -0,0 +1,63 @@
import pytest
import subprocess
import sys

tmp_path='/data/local/tmp'
pkg_path=f'{tmp_path}/llama.cpp'
lib_path=f'{pkg_path}/lib'
bin_path=f'{pkg_path}/bin'

model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'


def run_cmd(cmd):
    p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
    sys.stdout.write(p.stdout)
    assert p.returncode == 0


@pytest.mark.dependency()
def test_install():
    run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
    run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])


## Basic cli tests
def run_llama_cli(dev, opts):
    prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
    opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
    run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_cpu():
    run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_gpu():
    run_llama_cli('GPUOpenCL', '-fa on')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_npu():
    run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')


## Basic bench tests
def run_llama_bench(dev):
    run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_cpu():
    run_llama_bench('none')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_gpu():
    run_llama_bench('GPUOpenCL')


@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_npu():
    run_llama_bench('HTP0')
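These tests assume a device reachable via adb with the `llama.cpp` package directory and the model already staged next to it; `pytest-dependency` gates the cli and bench tests on `test_install`, so nothing runs against a device where the push failed. Running plain `pytest -v` from this directory should be enough to invoke them locally.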
@@ -404,6 +404,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    // add the device extra buffer type (if any)
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");

    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    return buft_list;
}
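For context, the loop above walks a NULL-terminated array supplied by the backend through the "ggml_backend_dev_get_extra_bufts" proc address. A backend-side counterpart might look like the following sketch (`my_backend_get_extra_bufts` and `my_backend_extra_buffer_type` are hypothetical names, not the actual hexagon implementation):

// sketch: a backend exposes its extra buffer types as a NULL-terminated array
// and returns it from its "ggml_backend_dev_get_extra_bufts" proc address
static ggml_backend_buffer_type_t * my_backend_get_extra_bufts(ggml_backend_dev_t dev) {
    static ggml_backend_buffer_type_t bufts[2] = { NULL, NULL };
    bufts[0] = my_backend_extra_buffer_type(dev);  // hypothetical helper
    return bufts;  // bufts[1] stays NULL: the terminator the loader's loop relies on
}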