Merge branch 'ggml-org:master' into llama-quant-refactor

2026-03-31 10:37:10 -04:00 · 2026-03-31 10:37:10 -04:00 · 2cf3eaf094
parent 04c829966e 6307ec07d3
commit 2cf3eaf094
299 changed files with 12484 additions and 3764 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -4,7 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,11 +1,13 @@
-ARG UBUNTU_VERSION=22.04
+ARG UBUNTU_VERSION=24.04

 FROM ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libssl-dev
+    apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
+
+ENV CC=gcc-14 CXX=g++-14

 WORKDIR /app

@@ -34,7 +36,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -55,8 +57,9 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/cuda-new.Dockerfile
+++ b/.devops/cuda-new.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=24.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=13.1.0
+ARG CUDA_VERSION=13.1.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

@@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14

 WORKDIR /app

@@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,6 +1,6 @@
-ARG UBUNTU_VERSION=22.04
+ARG UBUNTU_VERSION=24.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.4.0
+ARG CUDA_VERSION=12.8.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

@@ -12,7 +12,9 @@ FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14

 WORKDIR /app

@@ -39,7 +41,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -60,7 +62,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    && pip install --upgrade pip setuptools wheel \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -33,8 +33,25 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

+ARG IGC_VERSION=v2.30.1
+ARG IGC_VERSION_FULL=2_2.30.1+20950
+ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
+ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
+ARG IGDGMM_VERSION=22.9.0
+RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
+  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
+  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
+  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
+  && dpkg --install *.deb
+
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,4 @@
-ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10
+ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10

 FROM ascendai/cann:$ASCEND_VERSION AS build

--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -46,7 +46,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -41,6 +41,7 @@
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
  precompileMetalShaders ? false,
+  useWebUi ? true,
 }:

 let
@@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
  cmakeFlags =
    [
      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
      (cmakeBool "GGML_NATIVE" false)
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -78,7 +78,7 @@ ARG http_proxy
 ARG https_proxy

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl\
+    && apt-get install -y libgomp1 libtbb12 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -58,7 +58,7 @@ RUN mkdir -p /app/full \
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl\
+    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -79,7 +79,7 @@ RUN apt-get update \
    git \
    python3-pip \
    python3 \
-    python3-wheel\
+    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -49,17 +49,20 @@ COPY --from=build /app/full /app

 WORKDIR /app

+ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
+
+# Flag for compatibility with pip
+ARG UV_INDEX_STRATEGY="unsafe-best-match"
 RUN apt-get update \
    && apt-get install -y \
    build-essential \
+    curl \
    git \
-    python3.13 \
-    python3.13-dev \
-    python3-pip \
-    python3-wheel \
-    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
+    ca-certificates \
+    && curl -LsSf https://astral.sh/uv/install.sh | sh \
+    && uv python install 3.13 \
+    && uv venv --python 3.13 /root/.venv \
+    && uv pip install --python /root/.venv/bin/python -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,14 +21,6 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

-[tools/server/public/*]
-indent_size = 2
-
-[tools/server/public/deps_*]
-trim_trailing_whitespace = unset
-indent_style = unset
-indent_size = unset
-
 [tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
@@ -61,6 +53,14 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

+[tools/server/public/**]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
+
 [benches/**]
 indent_style = unset
 indent_size = unset
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,4 @@
+# Treat the generated single-file WebUI build as binary for diff purposes.
+# Git's pack-file delta compression still works (byte-level), but this prevents
+# git diff from printing the entire minified file on every change.
+tools/server/public/index.html -diff
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -40,13 +40,9 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v6
-
-      # Disabled due to size (400MB) and always 0 cache hits
-      # - name: ccache
-      #   uses: ggml-org/ccache-action@v1.2.16
-      #   with:
-      #     key: android-build
-      #     evict-old-files: 1d
+        with:
+          fetch-depth: 0
+          lfs: false

      - name: Set up JDK
        uses: actions/setup-java@v5
@@ -55,7 +51,7 @@ jobs:
          distribution: zulu

      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
+        uses: android-actions/setup-android@9fc6c4e9069bf8d3d10b2204b1fb8f6ef7065407 # v3
        with:
          log-accepted-android-sdk-licenses: false

@@ -66,10 +62,11 @@ jobs:

  android-ndk:
    runs-on: ubuntu-latest
-
-    env:
-      OPENCL_VERSION: 2025.07.22
-
+    container:
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
+    defaults:
+      run:
+        shell: bash
    strategy:
      matrix:
        include:
@@ -82,59 +79,23 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          lfs: false

-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        if: ${{ matrix.build == 'arm64-snapdragon' }}
-        run: |
-          mkdir opencl
-          curl -L -o opencl/clhpp.tar.gz      https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
-          curl -L -o opencl/headers.tar.gz    https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
-          curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
-          tar -xaf opencl/headers.tar.gz    -C opencl
-          tar -xaf opencl/clhpp.tar.gz      -C opencl
-          tar -xaf opencl/icd-loader.tar.gz -C opencl
-          sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL         ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
-          sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
-          cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
-          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
-          cmake --build build
-          sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
-          rm -rf opencl
-
-      - name: Install Hexagon SDK
-        id: install_hexsdk
-        if: ${{ matrix.build == 'arm64-snapdragon' }}
-        env:
-          HEXSDK_VER: 6.4.0.2
-          HEXTLS_VER: 19.0.04
-        run: |
-          curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
-          mkdir hex-sdk
-          tar -xaf hex-sdk.tar.gz -C hex-sdk
-          ls -l hex-sdk
-          sudo mv hex-sdk /opt/hexagon
-          echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER"                                     >> "$GITHUB_ENV"
-          echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER"   >> "$GITHUB_ENV"
-          echo "DEFAULT_HLOS_ARCH=64"                                                          >> "$GITHUB_ENV"
-          echo "DEFAULT_TOOLS_VARIANT=toolv19"                                                 >> "$GITHUB_ENV"
-          echo "DEFAULT_NO_QURT_INC=0"                                                         >> "$GITHUB_ENV"
-          echo "DEFAULT_DSP_ARCH=v73"                                                          >> "$GITHUB_ENV"
-
-      - name: Update CMake presets
-        id: update_presets
-        if: ${{ matrix.build == 'arm64-snapdragon' }}
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-
-      - name: Build
-        id: ndk_build
+      - name: Build Llama.CPP for Hexagon Android
+        id: build_llama_cpp_hexagon_android
        run: |
+          if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
+            cp docs/backend/snapdragon/CMakeUserPresets.json .
+          fi
          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp

-      - name: Test
-        id: cmake_test
-        run: |
-          echo "FIXME: test on devices"
+      - name: Upload Llama.CPP Hexagon Android Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
+        uses: actions/upload-artifact@v6
+        with:
+          name: llama-cpp-android-${{ matrix.build }}
+          path: pkg-adb/llama.cpp
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Set container image
        id: cann-image
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
          echo "image=${image}" >> "${GITHUB_OUTPUT}"

      - name: Pull container image
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -43,7 +43,7 @@ jobs:
      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
+        uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
        with:
          update: true
          msystem: ${{matrix.sys}}
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -141,60 +141,61 @@ jobs:
  #         amd-smi static
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-mac-metal:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-mac-webgpu:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v2.0.0"
-          DAWN_OWNER="reeselevine"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
-          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-          curl -L -o artifact.zip \
-            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
-          mkdir dawn
-          unzip artifact.zip
-          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-mac-vulkan:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  # TODO: sandbox Mac runners
+  #  ggml-ci-mac-metal:
+  #    runs-on: [self-hosted, macOS, ARM64]
+  #
+  #    steps:
+  #      - name: Clone
+  #        id: checkout
+  #        uses: actions/checkout@v6
+  #
+  #      - name: Test
+  #        id: ggml-ci
+  #        run: |
+  #          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  #
+  #  ggml-ci-mac-webgpu:
+  #    runs-on: [self-hosted, macOS, ARM64]
+  #
+  #    steps:
+  #      - name: Clone
+  #        id: checkout
+  #        uses: actions/checkout@v6
+  #
+  #      - name: Dawn Dependency
+  #        id: dawn-depends
+  #        run: |
+  #          DAWN_VERSION="v2.0.0"
+  #          DAWN_OWNER="reeselevine"
+  #          DAWN_REPO="dawn"
+  #          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+  #          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+  #          curl -L -o artifact.zip \
+  #            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+  #          mkdir dawn
+  #          unzip artifact.zip
+  #          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+  #
+  #      - name: Test
+  #        id: ggml-ci
+  #        run: |
+  #          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+  #            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  #
+  #  ggml-ci-mac-vulkan:
+  #    runs-on: [self-hosted, macOS, ARM64]
+  #
+  #    steps:
+  #      - name: Clone
+  #        id: checkout
+  #        uses: actions/checkout@v6
+  #
+  #      - name: Test
+  #        id: ggml-ci
+  #        run: |
+  #          vulkaninfo --summary
+  #          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  ggml-ci-linux-intel-vulkan:
    runs-on: [self-hosted, Linux, Intel]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -87,7 +87,7 @@ jobs:
            -DGGML_METAL_EMBED_LIBRARY=OFF \
            -DGGML_METAL_SHADER_DEBUG=ON \
            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1

      - name: Test
@@ -124,7 +124,7 @@ jobs:
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -165,8 +165,8 @@ jobs:
        id: cmake_build
        run: |
          export CMAKE_PREFIX_PATH=dawn
-          cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
-          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -181,7 +181,7 @@ jobs:
          - build: 'x64'
            os: ubuntu-22.04
          - build: 'arm64'
-            os: ubuntu-22.04-arm
+            os: ubuntu-24.04-arm
          - build: 's390x'
            os: ubuntu-24.04-s390x
          - build: 'ppc64le'
@@ -207,14 +207,22 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev \
+            python3 python3-pip python3-dev python3-wheel \
            libjpeg-dev build-essential libssl-dev \
            git-lfs

+      - name: Toolchain workaround (GCC 14)
+        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
      - name: Python Dependencies
        id: python_depends
        run: |
-          python3 -m pip install --upgrade pip
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
          pip3 install ./gguf-py

      - name: Swap Endianness
@@ -231,7 +239,7 @@ jobs:
          cmake -B build \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -274,14 +282,16 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          sudo apt-get install build-essential libssl-dev ninja-build

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_RPC=ON
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -290,7 +300,15 @@ jobs:
          ctest -L main --verbose

  ubuntu-24-vulkan:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-24.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
@@ -300,12 +318,16 @@ jobs:
      - name: Dependencies
        id: depends
        run: |
-          sudo apt-get install -y glslc libvulkan-dev libssl-dev
+          sudo apt-get update
+          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"

      - name: Configure
        id: cmake_configure
        run: |
          cmake -B build \
+            -G "Ninja" \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
@@ -314,7 +336,7 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake --build build -j $(nproc)
+          time cmake --build build -j $(nproc)

  ubuntu-24-webgpu:
    runs-on: ubuntu-24.04
@@ -336,7 +358,8 @@ jobs:
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+          sudo apt-get install -y build-essential mesa-vulkan-drivers \
+            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev

      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
@@ -378,7 +401,7 @@ jobs:
          export Dawn_DIR=dawn/lib64/cmake/Dawn
          cmake -B build \
            -DGGML_WEBGPU=ON
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -415,11 +438,13 @@ jobs:
        run: |
          source emsdk/emsdk_env.sh
          emcmake cmake -B build-wasm \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_WEBGPU=ON \
            -DLLAMA_OPENSSL=OFF \
            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg

-          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)

  ubuntu-22-hip:
    runs-on: ubuntu-22.04
@@ -479,7 +504,7 @@ jobs:
        run: |
          cmake -B build -S . \
            -DGGML_MUSA=ON
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

  ubuntu-22-sycl:
    runs-on: ubuntu-22.04
@@ -528,7 +553,7 @@ jobs:
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

  ubuntu-22-sycl-fp16:
    runs-on: ubuntu-22.04
@@ -551,7 +576,7 @@ jobs:
        shell: bash
        run: |
          sudo apt update
-          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build

      - name: install oneAPI MKL library
        shell: bash
@@ -574,11 +599,13 @@ jobs:
        run: |
          source /opt/intel/oneapi/setvars.sh
          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_SYCL=ON \
            -DCMAKE_C_COMPILER=icx \
            -DCMAKE_CXX_COMPILER=icpx \
            -DGGML_SYCL_F16=ON
-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

  ubuntu-24-openvino:
      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
@@ -648,7 +675,7 @@ jobs:
            cmake -B build/ReleaseOV -G Ninja \
              -DCMAKE_BUILD_TYPE=Release \
              -DGGML_OPENVINO=ON
-            cmake --build build/ReleaseOV --config Release -j $(nproc)
+            time cmake --build build/ReleaseOV --config Release -j $(nproc)

        - name: Test
          id: cmake_test
@@ -1039,7 +1066,7 @@ jobs:
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14

-          cmake --build build --config Release -j $(nproc)
+          time cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -54,4 +54,3 @@ jobs:
          python3 -m venv .venv
          source .venv/bin/activate
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
-          pip install flake8 pyright pre-commit
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -25,186 +25,13 @@ permissions:
  packages: write

 jobs:
-  push_to_registry:
-    name: Push Docker image to Docker Hub
-
-    runs-on: ${{ matrix.config.runs_on }}
-    env:
-      COMMIT_SHA: ${{ github.sha }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config:
-          # Multi-stage build
-          # Note: the arm64 images are failing, which prevents the amd64 images from being built
-          # https://github.com/ggml-org/llama.cpp/issues/11888
-          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
-          - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
-          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
-          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
-          - { tag: "rocm",   dockerfile: ".devops/rocm.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
-          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0 # preserve git history, so we can determine the build number
-
-      - name: Set up QEMU
-        if: ${{ matrix.config.tag != 's390x' }}
-        uses: docker/setup-qemu-action@v3
-        with:
-          image: tonistiigi/binfmt:qemu-v7.0.0-28
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Determine source tag name
-        id: srctag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-      - name: Determine image tag name
-        id: tag
-        shell: bash
-        run: |
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-
-          # list all tags possible
-          tags="${{ matrix.config.tag }}"
-          for tag in $tags; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-              CACHETAGS="${PREFIX}buildcache${TYPE}"
-              FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
-              LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
-              SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
-          done
-          echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
-          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
-          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
-          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "cache_output_tags=$CACHETAGS"  # print out for debugging
-          echo "full_output_tags=$FULLTAGS"  # print out for debugging
-          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
-          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Build and push Full Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.full_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
-
-      - name: Build and push Light Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.light_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
-
-      - name: Build and push Server Docker image (tagged + versioned)
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          # tag list is generated from step above
-          tags: ${{ steps.tag.outputs.server_output_tags }}
-          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          build-args: |
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
-
  create_tag:
    name: Create and push git tag
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-slim
    permissions:
      contents: write
+    outputs:
+      source_tag: ${{ steps.srctag.outputs.name }}

    steps:
      - name: Clone
@ -225,3 +52,391 @@ jobs:
        run: |
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0
+
+  prepare_matrices:  # computes the JSON matrices that push_to_registry and merge_arch_tags iterate over
+    name: Prepare Docker matrices
+    runs-on: ubuntu-24.04
+    outputs:
+      build_matrix: ${{ steps.matrices.outputs.build_matrix }}  # one entry per (tag, platform) build
+      merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}  # one entry per tag (plus the s390x alias), arches joined by spaces
+
+    steps:
+      - name: Generate build and merge matrices
+        id: matrices
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          # Keep all build targets in one place and derive merge targets from it.
+          cat > build-matrix.json <<'JSON'
+          [
+            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
+            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
+            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "cuda13", "dockerfile": ".devops/cuda-new.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
+            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
+            { "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
+            { "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
+          ]
+          JSON
+
+          BUILD_MATRIX="$(jq -c . build-matrix.json)"  # -c: compact single-line JSON so it fits one name=value output line
+          MERGE_MATRIX="$(jq -c '
+            reduce .[] as $entry ({}; .[$entry.tag] |= (
+              . // {
+                tag: $entry.tag,
+                arches: [],
+                full: false,
+                light: false,
+                server: false
+              }
+              | .full = (.full or ($entry.full // false))  # an image type is merged if any platform entry builds it
+              | .light = (.light or ($entry.light // false))
+              | .server = (.server or ($entry.server // false))
+              | .arches += [($entry.platforms | sub("^linux/"; ""))]  # "linux/arm64" -> "arm64"
+            ))
+            # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
+            | if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
+                . + {
+                  s390x: {
+                    tag: "s390x",
+                    arches: ["s390x"],
+                    full: .cpu.full,
+                    light: .cpu.light,
+                    server: .cpu.server
+                  }
+                }
+              else
+                .
+              end
+            | [.[] | .arches = (.arches | unique | join(" "))]  # unique already returns a sorted array
+          ' build-matrix.json)"
+
+          echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
+          echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
+
+  push_to_registry:  # one job per build_matrix entry, on the runner named by that entry's runs_on field
+    name: Push Docker image to Docker Registry
+    needs: [prepare_matrices, create_tag]
+
+    runs-on: ${{ matrix.config.runs_on }}
+    strategy:
+      fail-fast: false  # keep building the remaining entries when one of them fails
+      matrix:
+        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ needs.create_tag.outputs.source_tag }}  # build from the tag reported by create_tag
+
+      - name: Set up QEMU
+        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}  # NOTE(review): QEMU is installed only for amd64 entries - confirm this is the intended condition
+        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
+        with:
+          image: tonistiigi/binfmt:qemu-v10.2.1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
+
+      - name: Log in to Docker Registry
+        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Determine image metadata
+        id: meta  # outputs: image_repo, arch_suffix, cache_output_tag, digest_artifact_suffix
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
+          PREFIX="${IMAGE_REPO}:"
+          PLATFORM="${{ matrix.config.platforms }}"
+          ARCH_SUFFIX="${PLATFORM#linux/}"  # e.g. "linux/arm64" -> "arm64"
+
+          # walk the space-separated tag list; the last tag decides the single cache tag
+          tags="${{ matrix.config.tag }}"
+          for tag in $tags; do
+              if [[ "$tag" == "cpu" ]]; then
+                  TYPE=""
+              else
+                  TYPE="-$tag"
+              fi
+              CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"  # overwritten on every iteration
+          done
+
+          SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"  # e.g. "cuda cuda12" -> "cuda_cuda12", used in the artifact name
+
+          echo "image_repo=$IMAGE_REPO" >> "$GITHUB_OUTPUT"
+          echo "arch_suffix=$ARCH_SUFFIX" >> "$GITHUB_OUTPUT"
+          echo "cache_output_tag=$CACHETAG" >> "$GITHUB_OUTPUT"
+          echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> "$GITHUB_OUTPUT"
+          echo "cache_output_tag=$CACHETAG"  # print out for debugging
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
+      - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.free_disk_space == true }}
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          # setting this to "true" frees about 6 GB,
+          # but might remove tools that are actually needed
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
+      - name: Build and push Full Docker image by digest
+        id: build_full  # its "digest" output is read by the "Export digest metadata" step
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        with:
+          context: .
+          platforms: ${{ matrix.config.platforms }}
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true  # pushed by digest, without a tag; merge_arch_tags attaches the tags later
+          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # alternative (disabled): GitHub experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # alternative (disabled): local cache, to fall back on if the GitHub cache has issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # in use: registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
+          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+
+      - name: Build and push Light Docker image by digest
+        id: build_light  # its "digest" output is read by the "Export digest metadata" step
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        with:
+          context: .
+          platforms: ${{ matrix.config.platforms }}
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true  # pushed by digest, without a tag; merge_arch_tags attaches the tags later
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # alternative (disabled): GitHub experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # alternative (disabled): local cache, to fall back on if the GitHub cache has issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # in use: registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
+          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+
+      - name: Build and push Server Docker image by digest
+        id: build_server  # its "digest" output is read by the "Export digest metadata" step
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
+        with:
+          context: .
+          platforms: ${{ matrix.config.platforms }}
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true  # pushed by digest, without a tag; merge_arch_tags attaches the tags later
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # alternative (disabled): GitHub experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # alternative (disabled): local cache, to fall back on if the GitHub cache has issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # in use: registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
+          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
+
+      - name: Export digest metadata  # writes a TSV of (tag, arch, image type, digest) rows for this matrix entry
+        shell: bash
+        run: |
+            set -euo pipefail
+
+            TAGS="${{ matrix.config.tag }}"  # may hold several space-separated tags, e.g. "cuda cuda12"
+            ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
+            DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
+            mkdir -p /tmp/digests
+
+            add_digest_rows() {  # $1 = image type, $2 = digest; appends one row per tag in TAGS
+                local image_type="$1"
+                local digest="$2"
+
+                if [[ -z "$digest" ]]; then  # NOTE(review): also hit when the build step was skipped by its if-condition
+                  echo "Missing digest for image_type=${image_type}" >&2
+                  exit 1
+                fi
+
+                for tag in $TAGS; do  # unquoted on purpose: split TAGS on whitespace
+                    printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
+                done
+            }
+
+            if [[ "${{ matrix.config.full }}" == "true" ]]; then
+                add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
+            fi
+
+            if [[ "${{ matrix.config.light }}" == "true" ]]; then
+                add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
+            fi
+
+            if [[ "${{ matrix.config.server }}" == "true" ]]; then
+                add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
+            fi
+
+      - name: Upload digest metadata
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
+        with:
+          name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}  # matches the digests-* pattern downloaded by merge_arch_tags
+          path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv  # the file written by "Export digest metadata"
+          if-no-files-found: error
+
+  merge_arch_tags:  # one job per merge_matrix entry; combines the per-arch digests into shared tags
+    name: Create shared tags from digests
+    needs: [prepare_matrices, push_to_registry, create_tag]  # NOTE(review): a failure in any push_to_registry matrix entry skips this whole job - confirm all-or-nothing publishing is intended
+    runs-on: ubuntu-24.04
+    strategy:
+      fail-fast: false
+      matrix:
+        config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0  # NOTE(review): no later step in this job appears to read repository files - confirm the checkout is needed
+
+      - name: Download digest metadata
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
+        with:
+          pattern: digests-*  # the artifacts uploaded by push_to_registry
+          path: /tmp/digests
+          merge-multiple: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
+
+      - name: Log in to Docker Registry
+        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Create tags from digests  # tags each image type of this merge entry, once plain and once with the source tag appended
+        shell: bash
+        run: |
+          set -euo pipefail
+
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
+          PREFIX="${IMAGE_REPO}:"
+          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
+          TAGS="${{ matrix.config.tag }}"  # may hold several space-separated tags
+          ARCHES="${{ matrix.config.arches }}"  # space-separated, as joined by prepare_matrices
+          DIGEST_GLOB="/tmp/digests/*.tsv"
+
+          if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then  # unquoted on purpose so the glob expands
+              echo "No digest metadata found in /tmp/digests" >&2
+              exit 1
+          fi
+
+          if [[ -z "$SRC_TAG" ]]; then
+              echo "Missing source tag from create_tag" >&2
+              exit 1
+          fi
+
+          find_digest() {  # $1 = tag, $2 = arch, $3 = image type; prints the first matching digest from the TSV files
+              local tag_name="$1"
+              local arch="$2"
+              local image_type="$3"
+              local digest
+
+              digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
+
+              # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
+              if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
+                digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
+              fi
+
+              if [[ -z "$digest" ]]; then
+                echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
+                exit 1  # callers use a command substitution, so this ends only that subshell; set -e then aborts at the assignment
+              fi
+
+              echo "$digest"
+          }
+
+          create_manifest_tags() {  # $1 = image type, $2 = tag to look digests up under, $3 = tag suffix ("" or "-<tag>")
+              local image_type="$1"
+              local tag_name="$2"
+              local suffix="$3"
+
+              local merged_tag="${PREFIX}${image_type}${suffix}"
+              local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
+
+              local refs=()
+
+              for arch in $ARCHES; do
+                  local digest  # declared separately so the assignment below keeps the exit status of find_digest
+                  digest="$(find_digest "$tag_name" "$arch" "$image_type")"
+                  refs+=("${IMAGE_REPO}@${digest}")
+              done
+
+              echo "Creating ${merged_tag} from ${refs[*]}"
+              docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
+
+              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
+              docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
+          }
+
+          for tag in $TAGS; do
+              if [[ "$tag" == "cpu" ]]; then  # cpu images carry no tag suffix
+                  TYPE=""
+              else
+                  TYPE="-$tag"
+              fi
+
+              if [[ "${{ matrix.config.full }}" == "true" ]]; then
+                  create_manifest_tags "full" "$tag" "$TYPE"
+              fi
+
+              if [[ "${{ matrix.config.light }}" == "true" ]]; then
+                  create_manifest_tags "light" "$tag" "$TYPE"
+              fi
+
+              if [[ "${{ matrix.config.server }}" == "true" ]]; then
+                  create_manifest_tags "server" "$tag" "$TYPE"
+              fi
+          done
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@ -23,7 +23,7 @@ jobs:
    runs-on: ubuntu-slim
    steps:
      - uses: actions/checkout@v6
-      - uses: editorconfig-checker/action-editorconfig-checker@v2
+      - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
        with:
          version: v3.0.3
      - run: editorconfig-checker
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@ -28,17 +28,17 @@ jobs:
    - name: Set up Python
      uses: actions/setup-python@v6
      with:
-        python-version: '3.9.x'
+        python-version: '3.11'
    - name: Install dependencies
      run: |
        cd gguf-py
-        python -m pip install poetry
+        python -m pip install poetry==2.3.2
        poetry install

    - name: Build package
      run: cd gguf-py && poetry build
    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
+      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
        packages-dir: gguf-py/dist
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@ -8,7 +8,8 @@ on:
    paths: [
      '.github/workflows/hip-quality-check.yml',
      '**/*.cu',
-      '**/*.cuh'
+      '**/*.cuh',
+      'scripts/hip/gcn-cdna-vgpr-check.py'
    ]

  pull_request:
@ -16,7 +17,8 @@ on:
    paths: [
      '.github/workflows/hip-quality-check.yml',
      '**/*.cu',
-      '**/*.cuh'
+      '**/*.cuh',
+      'scripts/hip/gcn-cdna-vgpr-check.py'
    ]

 concurrency:
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@ -31,6 +31,6 @@ jobs:
        with:
          python-version: "3.11"
      - name: flake8 Lint
-        uses: py-actions/flake8@v2
+        uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
        with:
            plugins: "flake8-no-print"
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.24
+          pip-install: -r requirements/requirements-all.txt ty==0.0.26
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -131,17 +131,16 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
          name: llama-bin-macos-x64.tar.gz

-  ubuntu-22-cpu:
+  ubuntu-cpu:
    strategy:
      matrix:
        include:
          - build: 'x64'
            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
          - build: 's390x'
            os: ubuntu-24.04-s390x
-          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
-          # - build: 'arm64'
-          #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

@ -165,6 +164,13 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential libssl-dev

+      - name: Toolchain workaround (GCC 14)
+        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
      - name: Build
        id: cmake_build
        run: |
@ -194,8 +200,16 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

-  ubuntu-22-vulkan:
-    runs-on: ubuntu-22.04
+  ubuntu-vulkan:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}

    steps:
      - name: Clone
@ -207,16 +221,23 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-22-vulkan
+          key: ubuntu-vulkan-${{ matrix.build }}
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
-          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
+          if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
+            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+            sudo apt-get update -y
+            sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
+          else
+            sudo apt-get update -y
+            sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev libssl-dev ninja-build
+            echo "CC=gcc-14" >> "$GITHUB_ENV"
+            echo "CXX=g++-14" >> "$GITHUB_ENV"
+          fi

      - name: Build
        id: cmake_build
@ -239,13 +260,13 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
-          name: llama-bin-ubuntu-vulkan-x64.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
+          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz

  ubuntu-24-openvino:
    runs-on: ubuntu-24.04
@ -907,7 +928,7 @@ jobs:
      - name: Set container image
        id: cann-image
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
          echo "image=${image}" >> "${GITHUB_OUTPUT}"

      - name: Pull container image
@ -977,8 +998,8 @@ jobs:
      - windows-sycl
      - windows-hip
      - ubuntu-22-rocm
-      - ubuntu-22-cpu
-      - ubuntu-22-vulkan
+      - ubuntu-cpu
+      - ubuntu-vulkan
      - ubuntu-24-openvino
      - macOS-arm64
      - macOS-x64
@ -1061,9 +1082,11 @@ jobs:

            **Linux:**
            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
-            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
-            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
+            - [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
+            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
+            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
+            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)

            **Windows:**
--- a/.gitignore
+++ b/.gitignore
@ -95,6 +95,8 @@
 # Server Web UI temporary files
 /tools/server/webui/node_modules
 /tools/server/webui/dist
+# we no longer use gz for index.html
+/tools/server/public/index.html.gz

 # Python

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
 option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
 option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)

--- a/ci/run.sh
+++ b/ci/run.sh
@ -57,6 +57,13 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
 CTEST_EXTRA=""

+# Default to the "Unix Makefiles" generator for compatibility; set GG_BUILD_NINJA to use Ninja instead
+CMAKE_GENERATOR="Unix Makefiles"
+
+if [ ! -z "${GG_BUILD_NINJA}" ]; then
+    CMAKE_GENERATOR="Ninja"
+fi
+
 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi
@ -242,13 +249,13 @@ function gg_run_ctest_debug {

    set -e

-    # Check cmake, make and ctest are installed
+    # Check cmake and ctest are installed
    gg_check_build_requirements

-    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@ -273,16 +280,16 @@ function gg_run_ctest_release {

    set -e

-    # Check cmake, make and ctest are installed
+    # Check cmake and ctest are installed
    gg_check_build_requirements

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
@ -340,7 +347,7 @@ function gg_run_ctest_with_model_debug {
    cd build-ci-debug
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
    cd ..
@ -353,7 +360,7 @@ function gg_run_ctest_with_model_release {
    cd build-ci-release
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    # test memory leaks
    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
@ -407,8 +414,8 @@ function gg_run_qwen3_0_6b {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf  --outtype f16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@ -556,8 +563,8 @@ function gg_run_embd_bge_small {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@ -601,8 +608,8 @@ function gg_run_rerank_tiny {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@ -652,10 +659,6 @@ function gg_check_build_requirements {
        gg_printf 'cmake not found, please install'
    fi

-    if ! command -v make &> /dev/null; then
-        gg_printf 'make not found, please install'
-    fi
-
    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -423,6 +423,9 @@ static bool parse_bool_value(const std::string & value) {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

+    // setup log directly from params.verbosity: see tools/cli/cli.cpp
+    common_log_set_verbosity_thold(params.verbosity);
+
    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
@ -631,8 +634,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        ));
    }

-    common_log_set_verbosity_thold(params.verbosity);
-
    return true;
 }

@ -1078,7 +1079,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.verbose_prompt = true;
        }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"--display-prompt"},
        {"--no-display-prompt"},
@ -2806,6 +2807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.port = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+    add_opt(common_arg(
+        {"--reuse-port"},
+        string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
+        [](common_params & params) {
+            params.reuse_port = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
    add_opt(common_arg(
        {"--path"}, "PATH",
        string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
@ -2842,6 +2850,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
+    add_opt(common_arg(
+        {"--tools"}, "TOOL1,TOOL2,...",
+        "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
+        "specify \"all\" to enable all tools\n"
+        "available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff",
+        [](common_params & params, const std::string & value) {
+            params.server_tools = parse_csv_row(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
    add_opt(common_arg(
        {"--webui"},
        {"--no-webui"},
@ -3244,6 +3261,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
        [](common_params & params) {
            params.verbosity = INT_MAX;
+            common_log_set_verbosity_thold(INT_MAX);
        }
    ));
    add_opt(common_arg(
@ -3264,6 +3282,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            "(default: %d)\n", params.verbosity),
        [](common_params & params, int value) {
            params.verbosity = value;
+            common_log_set_verbosity_thold(value);
        }
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -65,7 +65,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
-                auto         schema   = function.at("parameters");
+                auto         schema   = function.contains("parameters") ? function.at("parameters") : json::object();
                builder.resolve_refs(schema);
            });
            parser.build_grammar(builder, data.grammar_lazy);
@ -221,7 +221,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
-        const auto & schema = func.at("parameters");
+        const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();

        // Build call_id parser based on position (if supported)
        common_peg_parser call_id_section = p.eps();
@ -282,19 +282,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
    common_peg_parser tool_choice = p.choice();

    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto & func   = tool.at("function");
-        std::string  name   = func.at("name");
-        const auto & params = func.at("parameters");
-
-        if (!params.contains("properties") || !params.at("properties").is_object()) {
-            return;
-        }
-
-        const auto &          properties = params.at("properties");
+        const auto &          func       = tool.at("function");
+        std::string           name       = func.at("name");
+        const auto &          params     = func.contains("parameters") ? func.at("parameters") : json::object();
+        const auto &          properties = params.contains("properties") ? params.at("properties") : json::object();
        std::set<std::string> required;
-        if (params.contains("required") && params.at("required").is_array()) {
-            params.at("required").get_to(required);
-        }

        // Build parser for each argument, separating required and optional
        std::vector<common_peg_parser> required_parsers;
@ -311,17 +303,18 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
                }
            }

-            auto arg = p.tool_arg(
-                p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
-                                arguments.name_suffix) +
-                arguments.value_prefix +
-                (type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
-                                                                     "tool-" + name + "-arg-" + param_name + "-schema",
-                                                                     param_schema, true)) :
-                                    p.tool_arg_json_value(p.schema(
-                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                        p.space()) +
-                p.tool_arg_close(p.literal(arguments.value_suffix)));
+            auto arg =
+                p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
+                                           arguments.name_suffix) +
+                           arguments.value_prefix +
+                           (type == "string" ?
+                                p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
+                                                                 "tool-" + name + "-arg-" + param_name + "-schema",
+                                                                 param_schema, true)) :
+                                p.tool_arg_json_value(p.schema(
+                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
+                                    p.space()) +
+                           p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@ -287,7 +287,7 @@ void analyze_reasoning::compare_reasoning_presence() {
            return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
        });
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        // try the more aggressive parse first, if it fails, fall back to the delimiter one
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@ -297,7 +297,7 @@ void analyze_reasoning::compare_reasoning_presence() {
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
-                start = trim_whitespace(result.tags["pre"]);
+                start = trim_leading_whitespace(result.tags["pre"]);
                end   = trim_trailing_whitespace(result.tags["post"]);
            } else if (!result.tags["post"].empty()) {
                mode = reasoning_mode::TAG_BASED;
@ -333,7 +333,7 @@ void analyze_reasoning::compare_thinking_enabled() {
    if (left_trimmed.empty() && !diff.right.empty()) {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = right_trimmed;
+                start = trim_leading_whitespace(diff.right);
                mode  = reasoning_mode::TAG_BASED;
            }
        }
@ -344,7 +344,7 @@ void analyze_reasoning::compare_thinking_enabled() {
                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
                    start = seg[seg.size() - 2].value;
                }
-                end = left_trimmed;
+                end = trim_trailing_whitespace(diff.left);
                mode = reasoning_mode::TAG_BASED;
            }
        }
@ -363,15 +363,23 @@ void analyze_reasoning::compare_thinking_enabled() {
            size_t len = std::min(base.size(), anchor_len);
            std::string anchor = base.substr(base.size() - len);
            auto pos = extended.rfind(anchor);
-            if (pos == std::string::npos || pos + len >= extended.size()) continue;
+            if (pos == std::string::npos || pos + len >= extended.size()) {
+                continue;
+            }

            std::string extra = trim_whitespace(extended.substr(pos + len));
-            if (extra.empty()) continue;
+            if (extra.empty()) {
+                continue;
+            }

            auto seg = prune_whitespace_segments(segmentize_markers(extra));
            if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
-                if (start.empty()) start = seg[0].value;
-                if (end.empty())   end   = seg[1].value;
+                if (start.empty()) {
+                    start = seg[0].value;
+                }
+                if (end.empty()) {
+                    end   = seg[1].value;
+                }
                mode = reasoning_mode::TAG_BASED;
                break;
            }
@ -423,7 +431,7 @@ void analyze_reasoning::compare_reasoning_scope() {
        LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);

        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
+            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
        });
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
@ -516,7 +524,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
        // Take the more promising diff
        std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
        start = result.tags["pre"];
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -221,7 +221,7 @@ using chat_template_caps = jinja::caps;
 struct common_chat_templates {
    bool add_bos;
    bool add_eos;
-    bool has_explicit_template;  // Model had builtin template or template overridde was specified.
+    bool has_explicit_template;  // Model had a builtin template or a template override was specified.
    std::unique_ptr<common_chat_template> template_default;  // always set (defaults to chatml)
    std::unique_ptr<common_chat_template> template_tool_use;
 };
@ -971,6 +971,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
+    auto extract_reasoning   = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        auto start           = p.rule("start", p.literal("<|start|>assistant"));
@ -979,9 +980,19 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        auto channel         = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
        auto constrain_type  = p.chars("[A-Za-z0-9_-]", 1, -1);

-        auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
+        if (extract_reasoning) {
+            p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
+        } else {
+            p.rule("analysis", p.content(p.literal("<|channel|>analysis<|message|>") + content + end));
+        }
+
+        auto analysis = p.ref("analysis");
        auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
        auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
+
+        // Consume any unsolicited tool calls, e.g. builtin functions
+        auto unsolicited = p.rule("unsolicited", p.atomic(p.optional(channel) + p.literal(" to=") + content + end));
+
        auto any = p.rule("any", preamble | analysis);

        if (has_response_format) {
@ -1025,7 +1036,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
            return p.zero_or_more(start + any) + start + (tool_call | final_msg);
        }

-        return p.zero_or_more(start + any) + start + final_msg;
+        return p.zero_or_more(start + any) + start + (final_msg | unsolicited);
    });

    data.parser = parser.save();
--- a/common/common.cpp
+++ b/common/common.cpp
@ -359,6 +359,11 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 }

 void common_init() {
+#if defined(_WIN32)
+    SetConsoleOutputCP(CP_UTF8);
+    SetConsoleCP(CP_UTF8);
+#endif
+
    llama_log_set(common_log_default_callback, NULL);

 #ifdef NDEBUG
@ -367,7 +372,7 @@ void common_init() {
    const char * build_type = " (debug)";
 #endif

-    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+    LOG_DBG("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }

 std::string common_params_get_system_info(const common_params & params) {
@ -656,6 +661,97 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
    return true;
 }

+// match c against the class body [pattern, class_end) of a "[...]" glob expression
+// (class_end points at the closing ']'); a leading '!' negates the class
+static inline bool glob_class_match(const char c, const char * pattern, const char * class_end) {
+    const char * class_start = pattern;
+    bool negated = false;
+
+    if (*class_start == '!') {
+        negated = true;
+        class_start++;
+    }
+
+    // If first character after negation is ']' or '-', treat it as literal
+    if (class_start < class_end && (*class_start == ']' || *class_start == '-')) {
+        if (*class_start == c) {
+            return !negated;
+        }
+        class_start++;
+    }
+
+    const unsigned char uc = (unsigned char) c; // ranges are compared as unsigned bytes (char signedness varies)
+    bool matched = false;
+
+    while (class_start < class_end) {
+        if (class_start + 2 < class_end && class_start[1] == '-' && class_start[2] != ']') {
+            if (uc >= (unsigned char) class_start[0] && uc <= (unsigned char) class_start[2]) {
+                matched = true;
+                break;
+            }
+            class_start += 3;
+        } else {
+            if (*class_start == c) {
+                matched = true;
+                break;
+            }
+            class_start++;
+        }
+    }
+
+    return negated ? !matched : matched;
+}
+
+// simple glob: * matches non-/ chars, ** matches anything including /, [] matches character class
+static inline bool glob_match(const char * pattern, const char * str) {
+    if (*pattern == '\0') {
+        return *str == '\0';
+    }
+    if (pattern[0] == '*' && pattern[1] == '*') {
+        const char * p = pattern + 2;
+        if (glob_match(p, str)) return true;
+        if (*str != '\0') return glob_match(pattern, str + 1);
+        return false;
+    }
+    if (*pattern == '*') {
+        const char * p = pattern + 1;
+        for (; *str != '\0' && *str != '/'; str++) {
+            if (glob_match(p, str)) return true;
+        }
+        return glob_match(p, str);
+    }
+    if (*pattern == '?' && *str != '\0' && *str != '/') {
+        return glob_match(pattern + 1, str + 1);
+    }
+    if (*pattern == '[') {
+        const char * class_end = pattern + 1;
+        // skip '!' first so that "[!]...]" keeps ']' as a literal member (same as glob_class_match)
+        if (*class_end == '!') class_end++;
+        // If first character after '[' or "[!" is ']' or '-', treat it as literal
+        if (*class_end == ']' || *class_end == '-') {
+            class_end++;
+        }
+        while (*class_end != '\0' && *class_end != ']') {
+            class_end++;
+        }
+        if (*class_end == ']') {
+            if (*str == '\0') return false;
+            bool matched = glob_class_match(*str, pattern + 1, class_end);
+            return matched && glob_match(class_end + 1, str + 1);
+        }
+        // unterminated class: treat '[' as a literal character
+        return *str == '[' && glob_match(pattern + 1, str + 1);
+    }
+    if (*pattern == *str) {
+        return glob_match(pattern + 1, str + 1);
+    }
+    return false;
+}
+
+bool glob_match(const std::string & pattern, const std::string & str) {
+    return glob_match(pattern.c_str(), str.c_str());
+}
+
 //
 // Filesystem utils
 //
@ -1152,6 +1248,9 @@ llama_context * common_init_result::context() {
 }

 common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) { // seq_id is used as an index into pimpl->samplers
+        return nullptr; // out-of-range id: there is no sampler to return
+    }
    return pimpl->samplers[seq_id].get();
 }

--- a/common/common.h
+++ b/common/common.h
@ -573,6 +573,7 @@ struct common_params {

    // server params
    int32_t port                = 8080;          // server listens on this network port
+    bool    reuse_port          = false;         // allow multiple sockets to bind to the same port
    int32_t timeout_read        = 600;           // http read timeout in seconds
    int32_t timeout_write       = timeout_read;  // http write timeout in seconds
    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
@ -613,6 +614,9 @@ struct common_params {
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

+    // enable built-in tools
+    std::vector<std::string> server_tools;
+
    // router server configs
    std::string models_dir    = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
@ -790,6 +794,8 @@ std::string string_from(const std::vector<int> & values);
 std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
 std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

+bool glob_match(const std::string & pattern, const std::string & str);
+
 //
 // Filesystem utils
 //
--- a/common/download.cpp
+++ b/common/download.cpp
@ -119,6 +119,9 @@ class ProgressBar {
    static inline std::map<const ProgressBar *, int> lines;
    static inline int max_line = 0;

+    std::string filename;
+    size_t len = 0;
+
    static void cleanup(const ProgressBar * line) {
        lines.erase(line);
        if (lines.empty()) {
@ -135,7 +138,23 @@ class ProgressBar {
    }

 public:
-    ProgressBar() = default;
+    // Builds the label shown next to the progress bar from a URL or path:
+    // keeps what follows the last '/', drops anything from the first '?',
+    // and shortens long names with a trailing "…".
+    // `len` ends up as the number of characters in the label (at most 40),
+    // counted as UTF-8 code points rather than bytes.
+    ProgressBar(const std::string & url = "") : filename(url) {
+        if (auto pos = filename.rfind('/'); pos != std::string::npos) {
+            filename = filename.substr(pos + 1);
+        }
+        if (auto pos = filename.find('?'); pos != std::string::npos) {
+            filename = filename.substr(0, pos);
+        }
+        for (size_t i = 0; i < filename.size(); ++i) {
+            // skip UTF-8 continuation bytes (10xxxxxx) so multi-byte characters count once
+            if ((filename[i] & 0xC0) != 0x80) {
+                // 39 characters already counted: cut before this one and put "…" in its
+                // place; the post-increment accounts for the ellipsis itself
+                if (len++ == 39) {
+                    filename.resize(i);
+                    filename += "…";
+                    break;
+                }
+            }
+        }
+    }

    ~ProgressBar() {
        std::lock_guard<std::mutex> lock(mutex);
@ -143,11 +162,7 @@ public:
    }

    void update(size_t current, size_t total) {
-        if (!is_output_a_tty()) {
-            return;
-        }
-
-        if (!total) {
+        if (!total || !is_output_a_tty()) {
            return;
        }

@ -159,28 +174,27 @@ public:
        }
        int lines_up = max_line - lines[this];

-        size_t width = 50;
+        size_t bar = 55 - len;
        size_t pct = (100 * current) / total;
-        size_t pos = (width * current) / total;
-
-        std::cout << "\033[s";
+        size_t pos = (bar * current) / total;

        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A";
        }
-        std::cout << "\033[2K\r["
-            << std::string(pos, '=')
-            << (pos < width ? ">" : "")
-            << std::string(width - pos, ' ')
-            << "] " << std::setw(3) << pct << "%  ("
-            << current / (1024 * 1024) << " MB / "
-            << total / (1024 * 1024) << " MB) "
-            << "\033[u";
+        std::cout << '\r' << "Downloading " << filename << " ";

-        std::cout.flush();
+        for (size_t i = 0; i < bar; ++i) {
+            std::cout << (i < pos ? "—" : " ");
+        }
+        std::cout << std::setw(4) << pct << "%\033[K";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "B";
+        }
+        std::cout << '\r' << std::flush;

        if (current == total) {
-             cleanup(this);
+            cleanup(this);
        }
    }

@ -208,7 +222,7 @@ static bool common_pull_file(httplib::Client & cli,
    const char * func = __func__; // avoid __func__ inside a lambda
    size_t downloaded = existing_size;
    size_t progress_step = 0;
-    ProgressBar bar;
+    ProgressBar bar(resolve_path);

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
@ -286,7 +300,7 @@ static int common_download_file_single_online(const std::string        & url,
    const bool file_exists = std::filesystem::exists(path);

    if (file_exists && skip_etag) {
-        LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
        return 304; // 304 Not Modified - fake cached response
    }

@ -294,7 +308,7 @@ static int common_download_file_single_online(const std::string        & url,
    if (file_exists) {
        last_etag = read_etag(path);
    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+        LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
    }

    auto head = cli.Head(parts.path);
@ -328,11 +342,11 @@ static int common_download_file_single_online(const std::string        & url,

    if (file_exists) {
        if (etag.empty()) {
-            LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
+            LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (!last_etag.empty() && last_etag == etag) {
-            LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
+            LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (remove(path.c_str()) != 0) {
@ -368,7 +382,7 @@ static int common_download_file_single_online(const std::string        & url,
            }
        }

-        LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
+        LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
                __func__, common_http_show_masked_url(parts).c_str(),
                path_temporary.c_str(), etag.c_str());

@ -437,7 +451,7 @@ int common_download_file_single(const std::string & url,
        return -1;
    }

-    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+    LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
    return 304; // Not Modified - fake cached response
 }

@ -454,7 +468,9 @@ static gguf_split_info get_gguf_split_info(const std::string & path) {
    std::smatch m;

    std::string prefix = path;
-    string_remove_suffix(prefix, ".gguf");
+    if (!string_remove_suffix(prefix, ".gguf")) {
+        return {};
+    }

    int index = 1;
    int count = 1;
@ -546,6 +562,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
    return best;
 }

+// Returns true when `filepath` ends in ".gguf" and its last path component
+// contains neither "mmproj" nor "imatrix".
+static bool gguf_filename_is_model(const std::string & filepath) {
+    if (!string_ends_with(filepath, ".gguf")) {
+        return false;
+    }
+
+    // only the part after the last '/' is inspected, so directory names do not count
+    const auto slash = filepath.rfind('/');
+    const std::string basename = slash == std::string::npos ? filepath : filepath.substr(slash + 1);
+
+    const bool is_mmproj  = basename.find("mmproj")  != std::string::npos;
+    const bool is_imatrix = basename.find("imatrix") != std::string::npos;
+
+    return !is_mmproj && !is_imatrix;
+}
+
 static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
                                         const std::string        & tag) {
    std::vector<std::string> tags;
@ -559,8 +589,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
    for (const auto & t : tags) {
        std::regex pattern(t + "[.-]", std::regex::icase);
        for (const auto & f : files) {
-            if (string_ends_with(f.path, ".gguf") &&
-                f.path.find("mmproj") == std::string::npos &&
+            if (gguf_filename_is_model(f.path) &&
                std::regex_search(f.path, pattern)) {
                return f;
            }
@ -568,8 +597,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
    }

    for (const auto & f : files) {
-        if (string_ends_with(f.path, ".gguf") &&
-            f.path.find("mmproj") == std::string::npos) {
+        if (gguf_filename_is_model(f.path)) {
            return f;
        }
    }
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@ -26,6 +26,8 @@ namespace nl = nlohmann;
 #include <windows.h>
 #else
 #define HOME_DIR "HOME"
+#include <unistd.h>
+#include <pwd.h>
 #endif

 namespace hf_cache {
@ -38,6 +40,7 @@ static fs::path get_cache_directory() {
            const char * var;
            fs::path path;
        } entries[] = {
+            {"LLAMA_CACHE",           fs::path()},
            {"HF_HUB_CACHE",          fs::path()},
            {"HUGGINGFACE_HUB_CACHE", fs::path()},
            {"HF_HOME",               fs::path("hub")},
@ -50,6 +53,13 @@ static fs::path get_cache_directory() {
                return entry.path.empty() ? base : base / entry.path;
            }
        }
+#ifndef _WIN32
+        // Fallback when none of the entries above produced a directory: take the
+        // home directory from the password database.
+        // getpwuid() returns NULL when the uid has no passwd entry (e.g. a container
+        // started with an arbitrary uid) or on error, so check it before dereferencing.
+        const struct passwd * pw = getpwuid(getuid());
+
+        if (pw && pw->pw_dir && *pw->pw_dir) {
+            return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
+        }
+#endif
        throw std::runtime_error("Failed to determine HF cache directory");
    }();

@ -325,9 +335,15 @@ hf_files get_repo_files(const std::string & repo_id,
                if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
                    file.oid = item["lfs"]["oid"].get<std::string>();
                }
+                if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
+                    file.size = item["lfs"]["size"].get<size_t>();
+                }
            } else if (item.contains("oid") && item["oid"].is_string()) {
                file.oid = item["oid"].get<std::string>();
            }
+            if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
+                file.size = item["size"].get<size_t>();
+            }

            if (!file.oid.empty() && !is_valid_oid(file.oid)) {
                LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
@ -487,6 +503,34 @@ std::string finalize_file(const hf_file & file) {

 // delete everything after this line, one day

+// copied from download.cpp without the tag part
+// Describes where a .gguf path sits within a (possibly split) model:
+// the name without the ".gguf" suffix and shard numbering, plus the shard position.
+// A value-initialised instance (empty prefix, index 0, count 0) is returned for
+// paths that do not end in ".gguf".
+struct gguf_split_info {
+    std::string prefix; // tag included
+    int index;          // shard number parsed from the name; 1 for non-split files
+    int count;          // total shard count parsed from the name; 1 for non-split files
+};
+
+// Splits a path of the form "<prefix>-NNNNN-of-NNNNN.gguf" into its parts.
+// Paths without the ".gguf" suffix yield a value-initialised result; ".gguf"
+// paths without the shard pattern are reported as shard 1 of 1.
+static gguf_split_info get_gguf_split_info(const std::string & path) {
+    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
+
+    std::string stem = path;
+    if (!string_remove_suffix(stem, ".gguf")) {
+        return {};
+    }
+
+    std::smatch parts;
+    if (!std::regex_match(stem, parts, re_split)) {
+        // no shard numbering in the name: a single-file model
+        return {std::move(stem), 1, 1};
+    }
+
+    const int index = std::stoi(parts[2].str());
+    const int count = std::stoi(parts[3].str());
+
+    return {parts[1].str(), index, count};
+}
+
 static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
    static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
    std::smatch match;
@ -504,25 +548,30 @@ static std::string make_old_cache_filename(const std::string & owner,
    return result;
 }

-static bool migrate_single_file(const fs::path    & old_cache,
-                                const std::string & owner,
-                                const std::string & repo,
-                                const nl::json    & node,
-                                const hf_files    & files) {
+// One file that passed validation and is queued to be moved out of the old cache.
+// NOTE(review): a function named migrate_file is declared later in this file;
+// the shared name compiles but is easy to misread.
+struct migrate_file {
+    std::string path;     // path of the file inside the repo
+    std::string sha256;   // sha256 taken from the manifest; empty when the manifest has none for this file
+    size_t size;          // size copied from the matching hf_file (0 when the repo listing had none)
+    fs::path old_path;    // existing file to move — presumably inside the old cache directory
+    fs::path etag_path;   // old_path with ".etag" appended
+    const hf_file * file; // matching entry in the repo file list; not owned
+};

-    if (!node.contains("rfilename") ||
-        !node.contains("lfs")       ||
-        !node["lfs"].contains("sha256")) {
-        return false;
-    }
+using migrate_files = std::vector<migrate_file>;

-    std::string path = node["rfilename"];
-    std::string sha256 = node["lfs"]["sha256"];
+static bool collect_file(const fs::path    & old_cache,
+                         const std::string & owner,
+                         const std::string & repo,
+                         const std::string & path,
+                         const std::string & sha256,
+                         const hf_files    & files,
+                         migrate_files     & to_migrate) {
+
+    const hf_file * file = nullptr;

-    const hf_file * file_info = nullptr;
    for (const auto & f : files) {
        if (f.path == path) {
-            file_info = &f;
+            file = &f;
            break;
        }
    }
@ -532,50 +581,104 @@ static bool migrate_single_file(const fs::path    & old_cache,
    fs::path etag_path = old_path.string() + ".etag";

    if (!fs::exists(old_path)) {
-        if (fs::exists(etag_path)) {
-            LOG_WRN("%s: %s is orphan, deleting...\n", __func__, etag_path.string().c_str());
-            fs::remove(etag_path);
+        if (file && fs::exists(file->final_path)) {
+            return true;
        }
+        LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
        return false;
    }

-    bool delete_old_path = false;
-
-    if (!file_info) {
-        LOG_WRN("%s: %s not found in current repo, deleting...\n", __func__, old_filename.c_str());
-        delete_old_path = true;
-    } else if (!sha256.empty() && !file_info->oid.empty() && sha256 != file_info->oid) {
-        LOG_WRN("%s: %s is not up to date (sha256 mismatch), deleting...\n", __func__, old_filename.c_str());
-        delete_old_path = true;
+    if (!file) {
+        LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
+        return false;
    }

-    std::error_code ec;
+    if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
+        LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
+        return false;
+    }

-    if (delete_old_path) {
-        fs::remove(old_path, ec);
-        fs::remove(etag_path, ec);
+    if (file->size > 0) {
+        size_t size = fs::file_size(old_path);
+        if (size != file->size) {
+            LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
+            return false;
+        }
+    }
+
+    to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
+    return true;
+}
+
+static bool collect_files(const fs::path    & old_cache,
+                          const std::string & owner,
+                          const std::string & repo,
+                          const nl::json    & node,
+                          const hf_files    & files,
+                          migrate_files     & to_migrate) {
+
+    if (!node.contains("rfilename") ||
+        !node.contains("lfs")       ||
+        !node["lfs"].contains("sha256")) {
        return true;
    }

-    fs::path new_path(file_info->local_path);
+    std::string path = node["rfilename"];
+    std::string sha256 = node["lfs"]["sha256"];
+
+    auto split = get_gguf_split_info(path);
+
+    if (split.count <= 1) {
+        return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
+    }
+
+    std::vector<std::pair<std::string, std::string>> splits;
+
+    for (const auto & f : files) {
+        auto split_f = get_gguf_split_info(f.path);
+        if (split_f.count == split.count && split_f.prefix == split.prefix) {
+            // sadly the manifest only provides the sha256 of the first file (index == 1)
+            // the rest will be verified using the size...
+            std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
+            splits.emplace_back(f.path, f_sha256);
+        }
+    }
+
+    if ((int)splits.size() != split.count) {
+        LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
+        return false;
+    }
+
+    for (const auto & [f_path, f_sha256] : splits) {
+        if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool migrate_file(const migrate_file & file) {
+    std::error_code ec;
+
+    fs::path new_path(file.file->local_path);
    fs::create_directories(new_path.parent_path(), ec);

    if (!fs::exists(new_path, ec)) {
-        fs::rename(old_path, new_path, ec);
+        fs::rename(file.old_path, new_path, ec);
        if (ec) {
-            fs::copy_file(old_path, new_path, ec);
+            fs::copy_file(file.old_path, new_path, ec);
            if (ec) {
-                LOG_WRN("%s: failed to move/copy %s: %s\n", __func__, old_path.string().c_str(), ec.message().c_str());
+                LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
                return false;
            }
        }
-        fs::remove(old_path, ec);
+        fs::remove(file.old_path, ec);
    }
-    fs::remove(etag_path, ec);
-
-    std::string filename = finalize_file(*file_info);
-    LOG_INF("%s: migrated %s -> %s\n", __func__, old_filename.c_str(), filename.c_str());
+    fs::remove(file.etag_path, ec);

+    std::string filename = finalize_file(*file.file);
+    LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
    return true;
 }

@ -624,19 +727,43 @@ void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
            continue;
        }

+        migrate_files to_migrate;
+        bool ok = true;
+
        try {
            std::ifstream manifest(entry.path());
            auto json = nl::json::parse(manifest);
-
            for (const char * key : {"ggufFile", "mmprojFile"}) {
                if (json.contains(key)) {
-                    migrate_single_file(old_cache, owner, repo, json[key], files);
+                    if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
+                        ok = false;
+                        break;
+                    }
                }
            }
        } catch (const std::exception & e) {
            LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
            continue;
        }
+
+        if (!ok) {
+            LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
+            continue;
+        }
+
+        for (const auto & file : to_migrate) {
+            if (!migrate_file(file)) {
+                ok = false;
+                break;
+            }
+        }
+
+        if (!ok) {
+            LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
+            continue;
+        }
+
+        LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
        fs::remove(entry.path());
    }
 }
--- a/common/hf-cache.h
+++ b/common/hf-cache.h
@ -14,6 +14,7 @@ struct hf_file {
    std::string final_path;
    std::string oid;
    std::string repo_id;
+    size_t size = 0; // only for the migration
 };

 using hf_files = std::vector<hf_file>;
--- a/common/jinja/parser.cpp
+++ b/common/jinja/parser.cpp
@ -539,6 +539,9 @@ private:
            statement_ptr step = slices.size() > 2 ? std::move(slices[2]) : nullptr;
            return mk_stmt<slice_expression>(start_pos, std::move(start), std::move(stop), std::move(step));
        }
+        if (slices.empty()) {
+            return mk_stmt<blank_expression>(start_pos);
+        }
        return std::move(slices[0]);
    }

--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@ -667,8 +667,9 @@ value macro_statement::execute_impl(context & ctx) {
                if (is_stmt<identifier>(this->args[i])) {
                    // normal parameter
                    std::string param_name = cast_stmt<identifier>(this->args[i])->val;
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
-                    macro_ctx.set_val(param_name, args.get_pos(i));
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
                } else if (is_stmt<keyword_argument_expression>(this->args[i])) {
                    // default argument used as normal parameter
                    auto kwarg = cast_stmt<keyword_argument_expression>(this->args[i]);
@ -676,8 +677,9 @@ value macro_statement::execute_impl(context & ctx) {
                        throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'");
                    }
                    std::string param_name = cast_stmt<identifier>(kwarg->key)->val;
-                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), args.get_pos(i)->type().c_str());
-                    macro_ctx.set_val(param_name, args.get_pos(i));
+                    value param_value = args.get_kwarg_or_pos(param_name, i);
+                    JJ_DEBUG("  Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str());
+                    macro_ctx.set_val(param_name, param_value);
                } else {
                    throw std::runtime_error("Invalid parameter type in macro '" + name + "'");
                }
@ -769,10 +771,15 @@ value member_expression::execute_impl(context & ctx) {
    }

    JJ_DEBUG("Member expression on object type %s, property type %s", object->type().c_str(), property->type().c_str());
-    ensure_key_type_allowed(property);
-
    value val = mk_val<value_undefined>("object_property");

+    if (property->is_undefined()) {
+        JJ_DEBUG("%s", "Member expression property is undefined, returning undefined");
+        return val;
+    }
+
+    ensure_key_type_allowed(property);
+
    if (is_val<value_undefined>(object)) {
        JJ_DEBUG("%s", "Accessing property on undefined object, returning undefined");
        return val;
--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@ -263,6 +263,14 @@ struct comment_statement : public statement {

 // Expressions

+// Represents an omitted expression in a computed member, e.g. `a[]`.
+// It carries no operands; executing it always yields an undefined value.
+struct blank_expression : public expression {
+    std::string type() const override { return "BlankExpression"; }
+    value execute_impl(context &) override {
+        return mk_val<value_undefined>();
+    }
+};
+
 struct member_expression : public expression {
    statement_ptr object;
    statement_ptr property;
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -416,15 +416,30 @@ private:
                    i++;
                } else if (c == '(') {
                    i++;
-                    if (i < length) {
-                        if (sub_pattern[i] == '?') {
+                    if (i < length && sub_pattern[i] == '?') {
+                        if (i + 1 < length && sub_pattern[i + 1] == ':') {
+                            i += 2; // skip "?:" for non-capturing group, treat as regular group
+                        } else {
+                            // lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
                            _warnings.push_back("Unsupported pattern syntax");
+                            // skip to matching ')' to avoid UB on empty seq
+                            int depth = 1;
+                            while (i < length && depth > 0) {
+                                if (sub_pattern[i] == '\\' && i + 1 < length) {
+                                    i += 2; // skip escaped character
+                                } else {
+                                    if (sub_pattern[i] == '(') depth++;
+                                    else if (sub_pattern[i] == ')') depth--;
+                                    i++;
+                                }
+                            }
+                            continue;
                        }
                    }
                    seq.emplace_back("(" + to_rule(transform()) + ")", false);
                } else if (c == ')') {
                    i++;
-                    if (start > 0 && sub_pattern[start - 1] != '(') {
+                    if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
                        _errors.push_back("Unbalanced parentheses");
                    }
                    return join_seq();
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@ -51,7 +51,7 @@ struct common_ngram_map_value {
 // statistics of a n-gram
 struct common_ngram_map_key {
    size_t   key_idx;   // index of key n-gram in token-history
-    size_t   stat_idx;  // index of last token of stastistics computation (key_num, values)
+    size_t   stat_idx;  // index of last token of statistics computation (key_num, values)

    uint16_t key_num;   // number of occurrences of this key n-gram in token-history
    common_ngram_map_value values[COMMON_NGRAM_MAX_VALUES]; // some known values after the key
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
            break;
        }
        case REASONING_BUDGET_FORCING:
-            // force_pos is advanced in apply(), not here.
-            // This ensures the first forced token isn't skipped when the sampler
-            // is initialized directly in FORCING state (e.g. COUNTING + budget=0)
+            ctx->force_pos++;
+            if (ctx->force_pos >= ctx->forced_tokens.size()) {
+                ctx->state = REASONING_BUDGET_DONE;
+                LOG_INF("reasoning-budget: forced sequence complete, done\n");
+            }
            break;
        case REASONING_BUDGET_DONE:
            break;
@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
            cur_p->data[i].logit = -INFINITY;
        }
    }
-
-    // advance to next forced token (done here rather than in accept so that
-    // the first forced token isn't skipped when starting in FORCING state)
-    ctx->force_pos++;
-    if (ctx->force_pos >= ctx->forced_tokens.size()) {
-        ctx->state = REASONING_BUDGET_DONE;
-        LOG_INF("reasoning-budget: forced sequence complete, done\n");
-    }
 }

 static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
        common_reasoning_budget_state    initial_state) {
    return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
 }
+
+// Reports the current state of a reasoning budget sampler.
+// A null sampler is reported as REASONING_BUDGET_IDLE.
+common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
+    if (smpl == nullptr) {
+        return REASONING_BUDGET_IDLE;
+    }
+
+    const auto * budget_ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
+    return budget_ctx->state;
+}
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
        const std::vector<llama_token> & forced_tokens,
        int32_t                          budget,
        common_reasoning_budget_state    initial_state);
+
+common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -7,6 +7,7 @@

 #include <algorithm>
 #include <cctype>
+#include <climits>
 #include <cmath>
 #include <cstring>
 #include <unordered_map>
@ -109,6 +110,7 @@ struct common_sampler {
    common_params_sampling params;

    struct llama_sampler * grmr;
+    struct llama_sampler * rbudget;
    struct llama_sampler * chain;

    ring_buffer<llama_token> prev;
@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
    lparams.no_perf = params.no_perf;

    llama_sampler * grmr = nullptr;
+    llama_sampler * rbudget = nullptr;
    llama_sampler * chain = llama_sampler_chain_init(lparams);

    std::vector<llama_sampler *> samplers;
@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
            }
        }

-        if (grmr) {
+        if (grmr && !params.grammar_lazy) {
            try {
                for (const auto & token : prefill_tokens) {
                    llama_sampler_accept(grmr, token);
@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }

-    // reasoning budget sampler — added first so it can force tokens before other samplers
-    if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
-        samplers.push_back(common_reasoning_budget_init(
+    // reasoning budget sampler
+    if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
+        rbudget = common_reasoning_budget_init(
            vocab,
            params.reasoning_budget_start,
            params.reasoning_budget_end,
            params.reasoning_budget_forced,
-            params.reasoning_budget_tokens,
-            prefill_tokens));
+            params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
+            prefill_tokens);
    }

    if (params.has_logit_bias()) {
@ -380,9 +383,16 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        params.backend_sampling = false;
    }

+    if (rbudget && params.backend_sampling) {
+        LOG_WRN("%s: backend sampling is not compatible with reasoning budget, disabling\n", __func__);
+
+        params.backend_sampling = false;
+    }
+
    auto * result = new common_sampler {
        /* .params  = */ params,
        /* .grmr    = */ grmr,
+        /* .rbudget = */ rbudget,
        /* .chain   = */ chain,
        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur     = */ {},
@ -398,11 +408,27 @@ void common_sampler_free(struct common_sampler * gsmpl) {
    }

    llama_sampler_free(gsmpl->grmr);
+    llama_sampler_free(gsmpl->rbudget);
    llama_sampler_free(gsmpl->chain);

    delete gsmpl;
 }

+// Decides whether the grammar sampler takes part in the current step.
+// Without a grammar sampler the answer is always no. A lazy grammar combined
+// with a reasoning budget sampler is held back while the budget is active.
+static bool grammar_should_apply(struct common_sampler * gsmpl) {
+    if (gsmpl->grmr == nullptr) {
+        return false;
+    }
+
+    // no reasoning budget, or a non-lazy grammar: the grammar always applies
+    if (gsmpl->rbudget == nullptr || !gsmpl->params.grammar_lazy) {
+        return true;
+    }
+
+    // if grammar is lazy, only apply when reasoning budget is not active
+    switch (common_reasoning_budget_get_state(gsmpl->rbudget)) {
+        case REASONING_BUDGET_IDLE:
+        case REASONING_BUDGET_DONE:
+            return true;
+        default:
+            return false;
+    }
+}
+
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (!gsmpl) {
        return;
@ -410,6 +436,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo

    const auto tm = gsmpl->tm();

+    // grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
+    accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
+
+    llama_sampler_accept(gsmpl->rbudget, token);
+
    if (gsmpl->grmr && accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@ -431,6 +462,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
        /* .params  = */ gsmpl->params,
        /* .grmr    = */ llama_sampler_clone(gsmpl->grmr),
+        /* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
        /* .prev    = */ gsmpl->prev,
        /* .cur     = */ gsmpl->cur,
@ -500,6 +532,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    llama_token id = LLAMA_TOKEN_NULL;

    auto & grmr  = gsmpl->grmr;
+    auto & rbudget = gsmpl->rbudget;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

@ -511,7 +544,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
        if (id != LLAMA_TOKEN_NULL) {
            LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);

-            GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
+            GGML_ASSERT(!gsmpl->grmr    && "using grammar in combination with backend sampling is not supported");
+            GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");

            // TODO: simplify
            gsmpl->cur.resize(1);
@ -524,7 +558,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

    gsmpl->set_logits(ctx, idx);

-    if (grammar_first) {
+    // apply reasoning budget first
+    llama_sampler_apply(rbudget, &cur_p);
+
+    if (grammar_first && grammar_should_apply(gsmpl)) {
        llama_sampler_apply(grmr, &cur_p);
    }

@ -532,7 +569,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

    id = cur_p.data[cur_p.selected].id;

-    if (grammar_first) {
+    if (grammar_first || !grammar_should_apply(gsmpl)) {
        return id;
    }

@ -553,7 +590,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

-    llama_sampler_apply(grmr,  &cur_p);
+    llama_sampler_apply(rbudget,  &cur_p);
+
+    if (grammar_should_apply(gsmpl)) {
+        llama_sampler_apply(grmr,  &cur_p);
+    }
+
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -31,10 +31,10 @@ import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab

 try:
-    from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found]
-    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found]
-    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found]
-    from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found]
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion # type: ignore[import-not-found, ty:unresolved-import]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD # type: ignore[import-not-found, ty:unresolved-import]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer # type: ignore[import-not-found, ty:unresolved-import]
+    from mistral_common.tokens.tokenizers.sentencepiece import ( # type: ignore[import-not-found, ty:unresolved-import]
        SentencePieceTokenizer,
    )

@ -486,7 +486,7 @@ class ModelBase:
            elif quant_method == "modelopt":
                # Mixed-precision ModelOpt models: NVFP4 tensors are handled by
                # _generate_nvfp4_tensors; FP8 tensors have 1D weight_scale and
-                # are dequantized here. input_scale tensors are unused.
+                # are dequantized here. k/v scale tensors are unused.
                for name in self.model_tensors.keys():
                    if name.endswith(".weight_scale"):
                        weight_name = name.removesuffix("_scale")
@ -494,7 +494,7 @@ class ModelBase:
                        s = self.model_tensors[name]
                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
                        tensors_to_remove.append(name)
-                    if name.endswith((".input_scale", ".k_scale", ".v_scale")):
+                    if name.endswith((".k_scale", ".v_scale")):
                        tensors_to_remove.append(name)
            elif quant_method is not None:
                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@ -542,7 +542,6 @@ class ModelBase:
        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-
        new_name = self.map_tensor_name(name)

        # Handle gate/up expert tensor fusion if enabled
@ -607,7 +606,12 @@ class ModelBase:
    def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
        return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6

-    def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
+    def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
+        if "language_model." in name:
+            name = name.replace("language_model.", "")
+
+        new_name = self.map_tensor_name(name)
+
        raw, shape = self._nvfp4_pack(weight, scale)
        logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
        self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
@ -619,10 +623,18 @@ class ModelBase:
            logger.info(f"  + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
            self.gguf_writer.add_tensor(scale_name, scale2_f32)

+        # Emit per-tensor input_scale as a separate F32 tensor when non-trivial
+        if not self._nvfp4_scale2_is_trivial(input_scale):
+            input_scale_f32 = input_scale.float().numpy().flatten()
+            input_scale_name = new_name.replace(".weight", ".input_scale")
+            logger.info(f"  + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
+            self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
+
    def _generate_nvfp4_tensors(self):
        # Per-layer expert merging to avoid holding all experts in memory
        expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
        expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
+        expert_input_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
        expert_shapes: dict[tuple[int, str], list[int]] = {}
        n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
        consumed: list[str] = []
@ -632,6 +644,7 @@ class ModelBase:
                continue
            scale_name = name.replace(".weight", ".weight_scale")
            scale2_name = name.replace(".weight", ".weight_scale_2")
+            input_scale_name = name.replace(".weight", ".input_scale")
            if scale_name not in self.model_tensors:
                continue
            # Force eager materialization of lazy tensors
@ -643,11 +656,14 @@ class ModelBase:
                continue

            scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
+            input_scale = LazyTorchTensor.to_eager(self.model_tensors.get(input_scale_name, lambda: torch.tensor(1.0))())

            # Mark tensors for removal from model_tensors (already written to gguf)
            consumed.extend([name, scale_name])
            if scale2_name in self.model_tensors:
                consumed.append(scale2_name)
+            if input_scale_name in self.model_tensors:
+                consumed.append(input_scale_name)

            # Check if this is a per-expert tensor
            m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
@ -663,34 +679,37 @@ class ModelBase:
                if key not in expert_blocks:
                    expert_blocks[key] = []
                    expert_scales[key] = []
+                    expert_input_scales[key] = []
                    expert_shapes[key] = shape
                expert_blocks[key].append((expert_id, raw.copy()))
                # Collect per-expert scale2 (scalar per expert)
                expert_scales[key].append((expert_id, float(scale2.float().sum())))
+                # Collect per-expert input_scale (scalar per expert)
+                expert_input_scales[key].append((expert_id, float(input_scale.float().sum())))

                # Flush when all experts for this (layer, proj) are collected
                if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
-                    self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
+                    self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)
            else:
-                new_name = self.map_tensor_name(name)
-                self._repack_nvfp4(new_name, weight, scale, scale2)
+                self._repack_nvfp4(name, weight, scale, scale2, input_scale)

        # Flush any remaining experts (fallback if n_experts was unknown)
        for (bid, proj_type) in list(expert_blocks.keys()):
-            self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
+            self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type)

        # Remove consumed tensors so get_tensors/modify_tensors won't see them
        for name in consumed:
            self.model_tensors.pop(name, None)

-        # Remove unused auxiliary tensors (input_scale, k_scale, v_scale)
+        # Remove any remaining unused auxiliary tensors
        for name in list(self.model_tensors.keys()):
-            if name.endswith((".input_scale", ".k_scale", ".v_scale")):
+            if name.endswith((".k_scale", ".v_scale")):
                del self.model_tensors[name]

-    def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
+    def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_input_scales, expert_shapes, bid, proj_type):
        experts = expert_blocks.pop(key)
        scales = expert_scales.pop(key)
+        input_scales = expert_input_scales.pop(key)
        shape = expert_shapes.pop(key)

        experts.sort(key=lambda x: x[0])
@ -708,6 +727,14 @@ class ModelBase:
            logger.info(f"  + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
            self.gguf_writer.add_tensor(scale_name, scale_vals)

+        # Emit per-expert input_scale tensor if any expert has non-trivial input_scale
+        input_scales.sort(key=lambda x: x[0])
+        input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
+        if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
+            input_scale_name = new_name.replace(".weight", ".input_scale")
+            logger.info(f"  + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
+            self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
+
        del experts, merged

    def prepare_tensors(self):
@ -947,6 +974,9 @@ class ModelBase:
        if "thinker_config" in config:
            # rename for Qwen2.5-Omni
            config["text_config"] = config["thinker_config"]["text_config"]
+        if "language_config" in config:
+            # rename for DeepSeekOCR
+            config["text_config"] = config["language_config"]
        if "lfm" in config:
            # rename for LFM2-Audio
            config["text_config"] = config["lfm"]
@ -1308,6 +1338,9 @@ class TextModel(ModelBase):
        if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
            # ref: https://huggingface.co/aari1995/German_Semantic_V3
            res = "jina-v2-de"
+        if chkhsh == "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4":
+            # ref: https://huggingface.co/evilfreelancer/ruGPT3XL
+            res = "gpt-2"
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
@ -1503,6 +1536,9 @@ class TextModel(ModelBase):
        if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
            # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
            res = "kanana2"
+        if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
+            # ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
+            res = "f2llmv2"

        if res is None:
            logger.warning("\n")
@ -2071,7 +2107,7 @@ class MmprojModel(ModelBase):
    preprocessor_config: dict[str, Any]
    global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers", "vt_num_hidden_layers"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers", "encoder_layers", "vt_num_hidden_layers"]

    has_vision_encoder: bool = True # by default
    has_audio_encoder: bool = False
@ -4572,7 +4608,7 @@ class Qwen2MoeModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("Qwen3ForCausalLM")
+@ModelBase.register("Qwen3ForCausalLM", "Qwen3Model")
 class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

@ -5005,6 +5041,97 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
        perm[dim], perm[dim + 1] = perm[dim + 1], perm[dim]
        return tensor.permute(*perm).contiguous().reshape(*shape)

+    def _transform_nvfp4_weight(self, name: str, weight: Tensor, scale: Tensor) -> tuple[Tensor, Tensor]:  # reorder weight/scale rows or columns for linear-attention projections
+        if not name.endswith((
+            ".linear_attn.in_proj_qkv.weight",
+            ".linear_attn.in_proj_z.weight",
+            ".linear_attn.in_proj_a.weight",
+            ".linear_attn.in_proj_b.weight",
+            ".linear_attn.out_proj.weight",
+        )):
+            return weight, scale  # any other tensor is returned unchanged
+
+        num_k_heads = self.hparams["linear_num_key_heads"]
+        num_v_heads = self.hparams["linear_num_value_heads"]
+        head_k_dim = self.hparams["linear_key_head_dim"]
+        head_v_dim = self.hparams["linear_value_head_dim"]
+        num_v_per_k = num_v_heads // num_k_heads  # integer division: value heads per key head
+
+        def unpack_nibbles(qs: Tensor) -> Tensor:  # one packed element -> two 4-bit codes, low nibble first
+            lo = torch.bitwise_and(qs, 0x0F)
+            hi = torch.bitwise_right_shift(qs, 4)
+            return torch.stack((lo, hi), dim=-1).reshape(*qs.shape[:-1], qs.shape[-1] * 2)
+
+        def pack_nibbles(codes: Tensor) -> Tensor:  # reverse layout of unpack_nibbles: (low, high) code pairs -> one element
+            codes = codes.reshape(*codes.shape[:-1], codes.shape[-1] // 2, 2)
+            lo = torch.bitwise_and(codes[..., 0], 0x0F)
+            hi = torch.bitwise_left_shift(torch.bitwise_and(codes[..., 1], 0x0F), 4)
+            return torch.bitwise_or(lo, hi).contiguous()
+
+        def apply_col_perm(qs: Tensor, scales: Tensor, col_perm: Tensor) -> tuple[Tensor, Tensor]:  # permute code columns together with their per-group scale columns
+            assert qs.ndim >= 2
+            assert scales.ndim >= 2
+
+            k = qs.shape[-1] * 2  # number of 4-bit codes along the last dim (two per packed element)
+            assert col_perm.numel() == k
+            assert k % 16 == 0
+
+            group_cols = col_perm.reshape(-1, 16)
+            group_starts = group_cols[:, 0]
+            expected = group_starts.unsqueeze(1) + torch.arange(16, dtype=col_perm.dtype)
+            assert torch.equal(group_cols, expected)  # every group of 16 columns must stay contiguous and in order
+            assert torch.all(group_starts % 16 == 0)  # ...and must start on a 16-column boundary
+
+            group_perm = (group_starts // 16).to(dtype=torch.long)  # the same permutation expressed in units of 16-column groups
+            expected_groups = torch.arange(scales.shape[-1], dtype=torch.long)
+            assert group_perm.numel() == scales.shape[-1]
+            assert torch.equal(torch.sort(group_perm).values, expected_groups)  # group_perm must hit every scale column exactly once
+
+            codes = unpack_nibbles(qs)
+            codes = codes.index_select(-1, col_perm.to(device=qs.device, dtype=torch.long))
+            qs = pack_nibbles(codes)
+            scales = scales.index_select(-1, group_perm.to(device=scales.device))
+            return qs, scales
+
+        def reorder_rows(qs: Tensor, scales: Tensor, head_dim: int) -> tuple[Tensor, Tensor]:  # apply the _reorder_v_heads ordering along dim 0 of both tensors
+            row_perm = self._reorder_v_heads(  # derive the row permutation by reordering a column of row indices
+                torch.arange(num_v_heads * head_dim, dtype=torch.long).unsqueeze(-1),
+                0, num_k_heads, num_v_per_k, head_dim,
+            ).squeeze(-1)
+            return (
+                qs.index_select(0, row_perm.to(device=qs.device)),
+                scales.index_select(0, row_perm.to(device=scales.device)),
+            )
+
+        if name.endswith(".linear_attn.in_proj_qkv.weight"):
+            q_dim = head_k_dim * num_k_heads
+            k_dim = head_k_dim * num_k_heads
+            q = weight[:q_dim]
+            k = weight[q_dim:q_dim + k_dim]
+            v = weight[q_dim + k_dim:]
+            q_scale = scale[:q_dim]
+            k_scale = scale[q_dim:q_dim + k_dim]
+            v_scale = scale[q_dim + k_dim:]
+            v, v_scale = reorder_rows(v, v_scale, head_v_dim)  # only the V slice is reordered; Q and K rows are kept as-is
+            return torch.cat([q, k, v], dim=0), torch.cat([q_scale, k_scale, v_scale], dim=0)
+
+        if name.endswith(".linear_attn.in_proj_z.weight"):
+            weight, scale = reorder_rows(weight, scale, head_v_dim)
+        elif name.endswith((".linear_attn.in_proj_a.weight", ".linear_attn.in_proj_b.weight")):
+            weight, scale = reorder_rows(weight, scale, 1)  # head_dim of 1: the permutation covers num_v_heads rows
+        elif name.endswith(".linear_attn.out_proj.weight"):
+            col_perm = self._reorder_v_heads(  # out_proj is permuted along dim 1 (columns) instead of rows
+                torch.arange(num_v_heads * head_v_dim, dtype=torch.long).unsqueeze(0),
+                1, num_k_heads, num_v_per_k, head_v_dim,
+            ).squeeze(0)
+            weight, scale = apply_col_perm(weight, scale, col_perm)
+
+        return weight, scale
+
+    def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):  # override: transform weight/scale first, then defer to the parent implementation
+        weight, scale = self._transform_nvfp4_weight(name, weight, scale)  # scale2 and input_scale are passed through untouched
+        super()._repack_nvfp4(name, weight, scale, scale2, input_scale)
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_k_heads = self.hparams.get("linear_num_key_heads", 0)
        num_v_heads = self.hparams.get("linear_num_value_heads", 0)
@ -5094,6 +5221,47 @@ class GPT2Model(TextModel):
        yield from super().modify_tensors(data_torch, new_name, bid)


+@ModelBase.register("RuGPT3XLForCausalLM")
+class RuGPT3XLModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPT2  # converted using the GPT2 architecture
+
+    _qkv_parts: list[dict[str, Tensor]] | None = None  # per-block dicts of pending q/k/v parts; allocated on first use
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Fuse separate Q, K, V projections into a single QKV tensor
+        if ".self_attn.q_proj." in name or ".self_attn.k_proj." in name or ".self_attn.v_proj." in name:
+            suffix = "weight" if name.endswith(".weight") else "bias"  # anything not ending in ".weight" is treated as a bias
+            part = "q" if ".q_proj." in name else ("k" if ".k_proj." in name else "v")
+            key = f"{part}.{suffix}"
+
+            assert bid is not None
+            if self._qkv_parts is None:
+                self._qkv_parts = [{} for _ in range(self.block_count)]
+            self._qkv_parts[bid][key] = data_torch
+
+            q_key, k_key, v_key = f"q.{suffix}", f"k.{suffix}", f"v.{suffix}"
+            if all(k in self._qkv_parts[bid] for k in [q_key, k_key, v_key]):
+                q = self._qkv_parts[bid].pop(q_key)
+                k = self._qkv_parts[bid].pop(k_key)
+                v = self._qkv_parts[bid].pop(v_key)
+                data_torch = torch.cat([q, k, v], dim=0)  # concatenate in q, k, v order along dim 0
+                name = self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_QKV, bid, f".{suffix}")
+                logger.debug(f"Fused Q/K/V {suffix} for layer {bid} -> {name}")
+            else:
+                return  # still waiting for the remaining parts of this block: emit nothing for now
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._qkv_parts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]` of "(block)part.suffix" labels
+            parts = [f"({i}){k}" for i, d in enumerate(self._qkv_parts) for k in d.keys()]
+            if len(parts) > 0:
+                raise ValueError(f"Unprocessed Q/K/V parts: {parts}")  # some block never received all three of q, k, v
+
+
@ModelBase.register("PhiForCausalLM")
 class Phi2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI2
@ -6935,6 +7103,70 @@ class ConformerAudioModel(MmprojModel):
        yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("DeepseekOCRForCausalLM")
+class DeepseekOCRVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        # default values below are taken from HF transformers code
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_vision_use_gelu(True)
+        # calculate proj_scale_factor (used by tinygemma3 test model)
+        image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+        n_per_side = int(image_seq_length ** 0.5)
+        image_size = self.hparams["image_size"]
+        patch_size = self.hparams["patch_size"]
+        proj_scale_factor = (image_size // patch_size) // n_per_side  # patches per side divided by n_per_side
+        if proj_scale_factor > 0 and proj_scale_factor != 4:
+            # we only need to write this if it's not the default value
+            # in this case, we are converting a test model
+            self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)
+        # @bluebread: there's no window_size in config but just add it here anyway
+        self.gguf_writer.add_vision_window_size(self.hparams.get("window_size", 14))
+
+        # SAM configuration
+        sam_hparams = hparams['sam']
+        self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
+        self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
+        self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
+
+    def get_vision_config(self) -> dict[str, Any]:
+        vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
+
+        if not vision_config:
+            raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
+
+        vision_config['sam'] = vision_config['width']['sam_vit_b']  # at this point 'width' is a dict keyed by encoder name
+        vision_config.update(vision_config['width']['clip-l-14-224'])  # presumably supplies scalar 'width' and 'heads', replacing the nested 'width' dict — verify against the model config
+        vision_config['hidden_size'] = vision_config['width']
+        vision_config['num_heads'] = vision_config['heads']
+        vision_config['intermediate_size'] = vision_config['heads'] * 4  # NOTE(review): derived from 'heads', not 'width' (hidden_size) — confirm this is intended
+
+        return vision_config
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".embeddings." in name or 'pos_embed' in name:
+            return gguf.GGMLQuantizationType.F32
+        if ".rel_pos_h" in name or '.rel_pos_w' in name:
+            return gguf.GGMLQuantizationType.F32
+        if ".neck." in name or ".net_" in name:
+            return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)  # everything else follows the parent class policy
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Only process vision-related tensors, skip language model tensors
+        # Vision components: sam_model, vision_model, projector, image_newline, view_seperator
+        # Language model components to skip: lm_head, embed_tokens, layers, norm
+        if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
+            return  # yields nothing for language-model tensors
+
+        if name.endswith("pos_embed") or name.endswith("rel_pos_h") or name.endswith("rel_pos_w"):
+            name += ".weight"  # these names carry no ".weight" suffix; append one before handing off to the parent
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("Gemma3nForConditionalGeneration")
 class Gemma3nVisionAudioModel(ConformerAudioModel):
    has_audio_encoder = True
@ -8280,6 +8512,19 @@ class DeepseekV2Model(TextModel):

    merge_expert = True

+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams: dict = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)  # read the config in dir_model to find the original HF architecture
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]  # first 'architectures' entry, or None when the key is absent
+
+        # special handling for Deepseek OCR: switch the output architecture to DEEPSEEK2OCR
+        if self.origin_hf_arch == "DeepseekOCRForCausalLM":
+            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]  # keep the writer's arch name in sync with the overridden model_arch
+            self.gguf_writer.add_architecture()
+            # default jinja template: concatenates the content of every message
+            self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
+
    def set_vocab(self):
        try:
            self._set_vocab_gpt2()
@ -8335,9 +8580,15 @@ class DeepseekV2Model(TextModel):
            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")

    def set_gguf_parameters(self):
+        is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)

-        # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
-        self.hparams["num_key_value_heads"] = 1
+        if is_ocr:
+            self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
+        else:
+            # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
+            self.hparams["num_key_value_heads"] = 1
+
+        self.hparams['rms_norm_eps'] = self.hparams.get('rms_norm_eps', 1e-6)

        super().set_gguf_parameters()
        hparams = self.hparams
@ -8351,16 +8602,18 @@ class DeepseekV2Model(TextModel):
            # Default: if no MoE, all layers are dense; if MoE, none are dense
            first_k_dense_replace = hparams["num_hidden_layers"] if not has_moe else 0
        self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
+        kv_lora_rank = hparams.get("kv_lora_rank", 512)
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
-        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])

        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
-        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+        if not is_ocr:
+            self.gguf_writer.add_kv_lora_rank(kv_lora_rank)
+            self.gguf_writer.add_key_length(kv_lora_rank + hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_value_length(kv_lora_rank)
+            self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+            self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])

        # MoE parameters (required by C++ code for DEEPSEEK2 arch)
        # For non-MoE models like Youtu, use intermediate_size as expert_feed_forward_length
@ -8392,8 +8645,15 @@ class DeepseekV2Model(TextModel):
    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # skip vision tensors and remove "language_model." for Kimi-VL and Kimi-K2.5
-        if "vision_tower" in name or "multi_modal_projector" in name or "mm_projector" in name:
+        # skip vision tensors and remove "language_model." for Kimi-VL, Kimi-K2.5, and DeepSeek-OCR
+        if ("vision_tower" in name
+                or "multi_modal_projector" in name
+                or "mm_projector" in name
+                or "vision_model" in name
+                or "image_newline" in name
+                or "model.projector" in name
+                or "sam_model" in name
+                or "view_seperator" in name):
            return
        if name.startswith("siglip2.") or name.startswith("merger."):
            return
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -154,6 +154,7 @@ models = [
    {"name": "qwen35",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
    {"name": "joyai-llm",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
    {"name": "kanana2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
+    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@ -177,6 +178,7 @@ pre_computed_hashes = [
    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
    # jina-v2-de variants
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
+    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/evilfreelancer/ruGPT3XL", "chkhsh": "0fe1cf6eda062318a1af7270f3331a85c539a01778ff948e24388e949c5282f4"},
 ]


--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@ -42,12 +42,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ### Ascend NPU

-**Verified devices**
+You can retrieve your Ascend device IDs using the following command:

-| Ascend NPU                    | Status  |
-|:-----------------------------:|:-------:|
-| Atlas 300T A2                 | Support |
-| Atlas 300I Duo                | Support |
+```sh
+lspci -n | grep -Eo '19e5:d[0-9a-f]{3}' | cut -d: -f2
+```
+
+**Devices**
+
+| Device Id | Product Series | Product Models | Chip Model | Verified Status |
+|:---------:|----------------|----------------|:----------:|:---------------:|
+|    d803   | Atlas A3 Train |                |    910C    |                 |
+|    d803   | Atlas A3 Infer |                |    910C    |                 |
+|    d802   | Atlas A2 Train |                |    910B    |                 |
+|    d802   | Atlas A2 Infer | Atlas 300I A2  |    910B    |     Support     |
+|    d801   | Atlas Train    |                |     910    |                 |
+|    d500   | Atlas Infer    | Atlas 300I Duo |    310P    |     Support     |

 *Notes:*

@ -57,6 +67,9 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi

 ## Model Supports

+<details>
+<summary>Text-only</summary>
+
 | Model Name                  | FP16  | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
 | Llama-2                     |   √   |   √  |   √  |
@ -118,8 +131,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 | Trillion-7B-preview         |   √   |   √  |   √  |
 | Ling models                 |   √   |   √  |   √  |

+</details>
+
+<details>
+<summary>Multimodal</summary>

-**Multimodal**
 | Model Name                  | FP16  | Q4_0 | Q8_0 |
 |:----------------------------|:-----:|:----:|:----:|
 | LLaVA 1.5 models, LLaVA 1.6 models      |   x   |   x  |   x  |
@ -134,15 +150,22 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi
 |  GLM-EDGE                   |   √   |   √  |   √  |
 |  Qwen2-VL                   |   √   |   √  |   √  |

+</details>
+


 ## DataType Supports

-| DataType               | Status  |
-|:----------------------:|:-------:|
-| FP16                   | Support |
-| Q8_0                   | Support |
-| Q4_0                   | Support |
+| DataType               | 910B    | 310P    |
+|:----------------------:|:-------:|:-------:|
+| FP16                   | Support | Support |
+| Q8_0                   | Support | Partial |
+| Q4_0                   | Support | Partial |
+| BF16                   | Support |         |
+
+> **310P note**
+> - `Q8_0`: data transform / buffer path is implemented, and `GET_ROWS` is supported, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.
+> - `Q4_0`: data transform / buffer path is implemented, but quantized `MUL_MAT` / `MUL_MAT_ID` are not supported.

 ## Docker

@ -160,7 +183,20 @@ npu-smi info

 # Select the cards that you want to use, make sure these cards are not used by someone.
 # Following using cards of device0.
-docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
+docker run --name llamacpp \
+  --device /dev/davinci0 \
+  --device /dev/davinci_manager \
+  --device /dev/devmm_svm \
+  --device /dev/hisi_hdc \
+  -v /usr/local/dcmi:/usr/local/dcmi \
+  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+  -v /PATH_TO_YOUR_MODELS/:/app/models \
+  -it llama-cpp-cann \
+  -m /app/models/MODEL_PATH \
+  -ngl 32 \
+  -p "Building a website can be done in 10 simple steps:"
 ```

 *Notes:*
@ -171,69 +207,57 @@ docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager

 ### I. Setup Environment

-1. **Install Ascend Driver and firmware**
+1. **Configure Ascend user and group**

    ```sh
-    # create driver running user.
-    sudo groupadd -g HwHiAiUser
+    sudo groupadd HwHiAiUser
    sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
    sudo usermod -aG HwHiAiUser $USER
-
-    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
-    # and install driver.
-    sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
    ```

-    Once installed, run `npu-smi info` to check whether driver is installed successfully.
+2. **Install dependencies**
+
+    **Ubuntu/Debian:**
    ```sh
-    +-------------------------------------------------------------------------------------------+
-    | npu-smi 24.1.rc2               Version: 24.1.rc2                                          |
-    +----------------------+---------------+----------------------------------------------------+
-    | NPU   Name           | Health        | Power(W)    Temp(C)           Hugepages-Usage(page)|
-    | Chip                 | Bus-Id        | AICore(%)   Memory-Usage(MB)  HBM-Usage(MB)        |
-    +======================+===============+====================================================+
-    | 2     xxx            | OK            | 64.4        51                15   / 15            |
-    | 0                    | 0000:01:00.0  | 0           1873 / 15077      0    / 32768         |
-    +======================+===============+====================================================+
-    | 5     xxx            | OK            | 64.0        52                15   / 15            |
-    | 0                    | 0000:81:00.0  | 0           1874 / 15077      0    / 32768         |
-    +======================+===============+====================================================+
-    | No running processes found in NPU 2                                                       |
-    +======================+===============+====================================================+
-    | No running processes found in NPU 5                                                       |
-    +======================+===============+====================================================+
+    sudo apt-get update
+    sudo apt-get install -y gcc python3 python3-pip linux-headers-$(uname -r)
    ```

-2. **Install Ascend Firmware**
+    **RHEL/CentOS:**
    ```sh
-    # download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
-    # and install driver.
-    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
+    sudo yum makecache
+    sudo yum install -y gcc python3 python3-pip kernel-headers-$(uname -r) kernel-devel-$(uname -r)
    ```
-    If the following message appears, firmware is installed successfully.
+
+3. **Install CANN (driver + toolkit)**
+
+    > The `Ascend-cann` package includes both the driver and toolkit.
+    > `$ARCH` can be `x86_64` or `aarch64`, `$CHIP` can be `910b` or `310p`.
+
    ```sh
-    Firmware package installed successfully!
+    wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann_8.5.0_linux-$ARCH.run
+    sudo bash ./Ascend-cann_8.5.0_linux-$ARCH.run --install
+
+    wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.5.T63/Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run
+    sudo bash ./Ascend-cann-$CHIP-ops_8.5.0_linux-$ARCH.run --install
    ```

+4. **Verify installation**

-3. **Install CANN toolkit and kernels**
-
-    CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
-
-    Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
    ```sh
-    pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
-    sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
-    sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
+    npu-smi info
    ```

-    Set Ascend Variables:
+    If device information is displayed correctly, the driver is functioning properly.
+
    ```sh
-    echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
-    source ~/.bashrc
+    # Set environment variables (adjust path if needed)
+    source /usr/local/Ascend/cann/set_env.sh
+
+    python3 -c "import acl; print(acl.get_soc_name())"
    ```

-Upon a successful installation, CANN is enabled for the available ascend devices.
+    If the command outputs the chip model, the installation was successful.

 ### II. Build llama.cpp

--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@ -1,6 +1,9 @@
 # OpenVINO Backend for llama.cpp
-[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
-This document describes the [OpenVINO backend for llama.cpp](../../src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
+
+> [!NOTE]
+> Performance and memory optimizations, accuracy validation, broader quantization coverage, and broader operator and model support are work in progress.
+
+[OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge. [OpenVINO backend for llama.cpp](../../src/ggml-openvino) enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.

 The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:

@ -179,31 +182,73 @@ curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/L

 When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.

+> [!NOTE]
+> The default context size is set to the model training context, which may be very large (for example, 131072 for Llama 3.2 1B). This may result in lower performance, especially on edge/laptop devices. Use `-c` to limit the context size in supported llama.cpp tools for better performance, for example `-c 512`.
+
 ```bash
 # If device is unset or unavailable, defaults to CPU.
 # If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.

 # Linux
 export GGML_OPENVINO_DEVICE=GPU
+# Enable stateful execution with GPU device to avoid known stateless execution failures.
+export GGML_OPENVINO_STATEFUL_EXECUTION=1
 # To run llama-simple:
 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
 # To run in chat mode:
-./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
+# To run llama-bench, -fa 1 is needed
+GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1
+
+# NPU: keep context small to avoid failures from very large model context windows.
+export GGML_OPENVINO_DEVICE=NPU
+./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512

 # Windows Command Line
 set GGML_OPENVINO_DEVICE=GPU
+# Enable stateful execution with GPU device to avoid known stateless execution failures.
+set GGML_OPENVINO_STATEFUL_EXECUTION=1
 # Windows PowerShell
 $env:GGML_OPENVINO_DEVICE = "GPU"
+$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

 # To run llama-simple
 build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
 # To run in chat mode:
-build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024
+# To run llama-bench, -fa 1 is needed
+build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1

+# NPU: keep context small to avoid failures from very large model context windows.
+# Windows Command Line
+set GGML_OPENVINO_DEVICE=NPU
+# Windows PowerShell
+$env:GGML_OPENVINO_DEVICE = "NPU"
+build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512
 ```
 > [!NOTE]
 > On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.

+### Known Issues and Current Workarounds
+
+- GPU stateless execution is currently affected by a known issue.
+  - Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device.
+- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`.
+  - Workaround: explicitly set the context size, for example `-c 1024`, for NPU runs. Performance will be better with a lower context size.
+- Additional NPU limitations:
+  - Model caching is not yet supported.
+  - `llama-server -np > 1` (multiple parallel sequences) is not supported.
+  - `llama-perplexity` is only supported with `-b 512` or smaller.
+- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices.
+- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation.
+- `-fa 1` is required when running llama-bench with the OpenVINO backend.
+  - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1`
+- `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
+- For Intel GPU and NPU detection in containers, the GPU and NPU user-space drivers/libraries must be present inside the image. This will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile)
+
+> [!NOTE]
+> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved.
+

 ### Docker Build

@ -229,31 +274,42 @@ docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_p
 Run llama.cpp with OpenVINO backend Docker container.
 Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.

+> [!NOTE]
+> Intel GPU and NPU detection in containers will be included in a future PR. Until then, you can use this reference Dockerfile: [openvino.Dockerfile](https://github.com/ravi9/llama.cpp/blob/ov-docker-update/.devops/openvino.Dockerfile).
+
 ```bash
 #  Run Docker container
-docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf

 # With Intel GPU access (iGPU or dGPU)
 docker run --rm -it -v ~/models:/models \
 --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+--env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf

 # With Intel NPU access
-docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
+docker run --rm -it -v ~/models:/models \
 --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
-llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
+--env=GGML_OPENVINO_DEVICE=NPU \
+llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
 ```

-Run Llama.cpp Server with OpenVINO Backend:
+Run Llama.cpp Server with OpenVINO Backend.
+> [!NOTE]
+> `llama-server` with the OpenVINO backend supports only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled.
+
 ```bash
 # Run the Server Docker container
-docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
-
-# In a NEW terminal, test the server with curl
+docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024
+# Or, using the llama-server executable directly:
+./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024

 # If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
 export NO_PROXY=localhost,127.0.0.1

+# Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server.
+# Option 2: In a NEW terminal, test the server with curl
+
 # Test health endpoint
 curl -f http://localhost:8080/health

@ -295,6 +351,7 @@ The OpenVINO backend can be configured using the following environment variables
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 export GGML_OPENVINO_PROFILING=1
 export GGML_OPENVINO_DEVICE=GPU
+export GGML_OPENVINO_STATEFUL_EXECUTION=1

 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "

@ -302,38 +359,27 @@ export GGML_OPENVINO_DEVICE=GPU
 set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
 set GGML_OPENVINO_PROFILING=1
 set GGML_OPENVINO_DEVICE=GPU
+set GGML_OPENVINO_STATEFUL_EXECUTION=1

 # Windows PowerShell
 $env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
 $env:GGML_OPENVINO_PROFILING = "1"
 $env:GGML_OPENVINO_DEVICE = "GPU"
+$env:GGML_OPENVINO_STATEFUL_EXECUTION = "1"

 build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "

 ```

-#### llama-bench
-
-```bash
-# -fa 1 is required when running llama-bench with the OpenVINO backend.
-GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
-```
-
-### NPU Notes
-
- Model caching is not yet supported
- Does not support llama-server -np > 1 (multiple parallel sequences)
- Only supports llama-perplexity -b 512 or smaller
-
 ## Llama.cpp Tools

 The following tools work with the OpenVINO backend on CPU, GPU, NPU:
- llama-simple
- llama-run
- llama-cli
- llama-server
 - llama-bench
+- llama-cli
+- llama-completion
 - llama-perplexity
+- llama-server
+- llama-simple

 ## Work in Progress

--- a/docs/docker.md
+++ b/docs/docker.md
@ -13,24 +13,30 @@ We have three Docker images available for this project:

 Additionally, there the following images, similar to the above:

- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:full-cuda13`: Same as `full` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:light-cuda13`: Same as `light` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA 12 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:server-cuda13`: Same as `server` but compiled with CUDA 13 support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:full-intel`: Same as `full` but compiled with SYCL support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-intel`: Same as `light` but compiled with SYCL support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-intel`: Same as `server` but compiled with SYCL support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`)
 - `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-s390x`: Identical to `full`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
+- `ghcr.io/ggml-org/llama.cpp:light-s390x`: Identical to `light`, an alias for the `s390x` platform. (platforms: `linux/s390x`)
+- `ghcr.io/ggml-org/llama.cpp:server-s390x`: Identical to `server`, an alias for the `s390x` platform. (platforms: `linux/s390x`)

 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).

@ -82,7 +88,7 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment

 The defaults are:

- `CUDA_VERSION` set to `12.4.0`
+- `CUDA_VERSION` set to `12.8.1`
 - `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:
--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@ -31,6 +31,13 @@ llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.g
 llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
 ```

+> [!IMPORTANT]
+>
+> OCR models are trained with a specific prompt and input structure; please refer to these discussions for more info:
+> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
+> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
+> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
+
 ## Pre-quantized models

 These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -24,12 +24,12 @@ int main(int argc, char ** argv) {
    params.prompt = "Hello my name is";
    params.n_predict = 32;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BATCHED, print_usage)) {
        return 1;
    }

-    common_init();
-
    // number of parallel batches
    int n_parallel = params.n_parallel;

--- a/examples/debug/debug.cpp
+++ b/examples/debug/debug.cpp
@ -213,12 +213,12 @@ static bool run(llama_context * ctx, const common_params & params) {
 int main(int argc, char ** argv) {
    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DEBUG, print_usage)) {
        return 1;
    }

-    common_init();
-
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@ -545,11 +545,12 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) {
        return 1;
    }

-    common_init();
    llama_backend_init();

    llama_model_params model_params = llama_model_default_params();
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -99,12 +99,12 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
        return 1;
    }

-    common_init();
-
    params.embedding = true;

    // get max number of sequences per batch
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -37,12 +37,12 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

-    common_init();
-
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/examples/idle/idle.cpp
+++ b/examples/idle/idle.cpp
@ -19,12 +19,12 @@ static void print_usage(int /*argc*/, char ** argv) {
 int main(int argc, char ** argv) {
    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

-    common_init();
-
    // init LLM

    llama_backend_init();
--- a/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
+++ b/examples/llama.android/lib/src/main/cpp/ai_chat.cpp
@ -365,13 +365,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processSystemPrompt(
    const auto *system_prompt = env->GetStringUTFChars(jsystem_prompt, nullptr);
    LOGd("%s: System prompt received: \n%s", __func__, system_prompt);
    std::string formatted_system_prompt(system_prompt);
-    env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);

    // Format system prompt if applicable
    const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
    if (has_chat_template) {
        formatted_system_prompt = chat_add_and_format(ROLE_SYSTEM, system_prompt);
    }
+    env->ReleaseStringUTFChars(jsystem_prompt, system_prompt);

    // Tokenize system prompt
    const auto system_tokens = common_tokenize(g_context, formatted_system_prompt,
@ -414,13 +414,13 @@ Java_com_arm_aichat_internal_InferenceEngineImpl_processUserPrompt(
    const auto *const user_prompt = env->GetStringUTFChars(juser_prompt, nullptr);
    LOGd("%s: User prompt received: \n%s", __func__, user_prompt);
    std::string formatted_user_prompt(user_prompt);
-    env->ReleaseStringUTFChars(juser_prompt, user_prompt);

    // Format user prompt if applicable
    const bool has_chat_template = common_chat_templates_was_explicit(g_chat_templates.get());
    if (has_chat_template) {
        formatted_user_prompt = chat_add_and_format(ROLE_USER, user_prompt);
    }
+    env->ReleaseStringUTFChars(juser_prompt, user_prompt);

    // Decode formatted user prompts
    auto user_tokens = common_tokenize(g_context, formatted_user_prompt, has_chat_template, has_chat_template);
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -43,12 +43,12 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

-    common_init();
-
    const int W = 15; // lookahead window
    const int N = 5;  // n-gram size
    const int G = 15; // max verification n-grams
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@ -12,6 +12,8 @@ int main(int argc, char ** argv){

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@ -18,12 +18,12 @@ int main(int argc, char ** argv){

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

-    common_init();
-
    const int n_draft = params.speculative.n_max;

    // init llama.cpp
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -18,12 +18,12 @@ int main(int argc, char ** argv){

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

-    common_init();
-
    // max. number of additional tokens to draft if match is found
    const int n_draft = params.speculative.n_max;

--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@ -7,7 +7,7 @@ import os

 # Add utils directory to path for direct script execution
 sys.path.insert(0, str(Path(__file__).parent.parent / "utils"))
-from common import get_model_name_from_env_path, compare_tokens, exit_with_warning  # type: ignore[import-not-found]
+from common import get_model_name_from_env_path, compare_tokens, exit_with_warning  # type: ignore[import-not-found, ty:unresolved-import]

 def quick_logits_check(pytorch_file, llamacpp_file):
    """Lightweight sanity check before NMSE"""
--- a/examples/model-conversion/scripts/utils/check-nmse.py
+++ b/examples/model-conversion/scripts/utils/check-nmse.py
@ -5,7 +5,7 @@ import sys
 import os
 import argparse
 from pathlib import Path
-from common import get_model_name_from_env_path  # type: ignore[import-not-found]
+from common import get_model_name_from_env_path  # type: ignore[import-not-found, ty:unresolved-import]

 def calculate_nmse(reference, test):
    mse = np.mean((test - reference) ** 2)
--- a/examples/model-conversion/scripts/utils/compare_tokens.py
+++ b/examples/model-conversion/scripts/utils/compare_tokens.py
@ -2,7 +2,7 @@

 import argparse
 import sys
-from common import compare_tokens  # type: ignore[import-not-found]
+from common import compare_tokens  # type: ignore[import-not-found, ty:unresolved-import]


 def parse_arguments():
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@ -7,7 +7,7 @@ import importlib
 from pathlib import Path

 from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
-from common import compare_tokens, exit_with_warning  # type: ignore[import-not-found]
+from common import compare_tokens, exit_with_warning  # type: ignore[import-not-found, ty:unresolved-import]

 unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -163,12 +163,12 @@ int main(int argc, char ** argv) {
    params.n_predict = 128;
    params.n_junk = 1;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
    }

-    common_init();
-
    // number of simultaneous "clients" to simulate
    const int32_t n_clients = params.n_parallel;

--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -25,12 +25,12 @@ int main(int argc, char ** argv) {
    params.n_keep = 32;
    params.i_pos  = -1;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
        return 1;
    }

-    common_init();
-
    int n_junk = params.n_junk;
    int n_keep = params.n_keep;
    int n_grp  = params.grp_attn_n;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -117,12 +117,12 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
        return 1;
    }

-    common_init();
-
    // For BERT models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
    params.embedding = true;
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -17,6 +17,8 @@ int main(int argc, char ** argv) {

    const std::string_view state_file = "dump_state.bin";

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
@ -27,8 +29,6 @@ int main(int argc, char ** argv) {
        params.kv_unified = true;
    }

-    common_init();
-
    if (params.n_predict < 0) {
        params.n_predict = 16;
    }
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@ -16,6 +16,8 @@ int main(int argc, char ** argv) {

    common_params params;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
    }
@ -25,8 +27,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    common_init();
-
    if (params.speculative.mparams_dft.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -38,6 +38,8 @@ int main(int argc, char ** argv) {
    // needed to get candidate probs even for temp <= 0.0
    params.sampling.n_probs = 128;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
        return 1;
    }
@ -47,8 +49,6 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    common_init();
-
    if (params.speculative.mparams_dft.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@ -20,4 +20,4 @@ cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA
 #cmake --build . --config Release --target llama-bench

 #build all binary
-cmake --build . --config Release -j -v
+cmake --build . --config Release -j$((($(nproc)+1)/2)) -v
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@ -23,9 +23,9 @@ if [ $# -gt 0 ]; then
    GGML_SYCL_DEVICE=$1
    echo "use $GGML_SYCL_DEVICE as main GPU"
    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none ${LOAD_MODE}

 else
    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-completion -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s 0 -c ${CONTEXT} ${LOAD_MODE}
 fi
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@ -20,6 +20,8 @@ int main(int argc, char ** argv) {
    common_params params;
    params.escape = false;

+    common_init();
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
        return 1;
    }
@ -38,7 +40,6 @@ int main(int argc, char ** argv) {
        params.cache_type_v = GGML_TYPE_F32;
    }

-    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);
    // load the model and apply lora adapter, if any
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
 ### GGML Version
 set(GGML_VERSION_MAJOR 0)
 set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 8)
+set(GGML_VERSION_PATCH 9)
 set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

 find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
--- a/ggml/include/gguf.h
+++ b/ggml/include/gguf.h
@ -77,6 +77,7 @@ extern "C" {
    };

    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params);
    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
    //GGML_API struct gguf_context * gguf_init_from_buffer(..);

@ -189,6 +190,7 @@ extern "C" {
    //

    // write the entire context to a binary file
+    GGML_API bool gguf_write_to_file_ptr(const struct gguf_context * ctx, FILE * file, bool only_meta);
    GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);

    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@ -434,6 +434,9 @@ void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
 void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src = dst->src[0];

+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
    acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);

@ -456,6 +459,13 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    float          p_value  = 2.0f;
    acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
    GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
+
+    // Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
+    acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
+    float          flt_max = FLT_MAX;
+    acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
+
    GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
 }

--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@ -216,14 +216,16 @@ struct ggml_cann_pool_alloc {
 #ifdef USE_ACL_GRAPH
 struct ggml_graph_node_properties {
    // dst tensor
-    void *  node_address;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t  nb[GGML_MAX_DIMS];
+    void *    node_address;
+    ggml_type node_type;
+    int64_t   ne[GGML_MAX_DIMS];
+    size_t    nb[GGML_MAX_DIMS];

    // src tensor
-    void *  src_address[GGML_MAX_SRC];
-    int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
-    size_t  src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];
+    void *    src_address[GGML_MAX_SRC];
+    ggml_type src_type[GGML_MAX_SRC];
+    int64_t   src_ne[GGML_MAX_SRC][GGML_MAX_DIMS];
+    size_t    src_nb[GGML_MAX_SRC][GGML_MAX_DIMS];

    // op
    ggml_op node_op;
@ -247,6 +249,10 @@ struct ggml_graph_node_properties {
            return false;
        }

+        if (node->type != this->node_type) {
+            return false;
+        }
+
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            if (node->ne[i] != this->ne[i]) {
                return false;
@ -262,6 +268,10 @@ struct ggml_graph_node_properties {
                    return false;
                }

+                if (node->src[i]->type != this->src_type[i]) {
+                    return false;
+                }
+
                for (int d = 0; d < GGML_MAX_DIMS; d++) {
                    if (node->src[i]->ne[d] != this->src_ne[i][d]) {
                        return false;
@ -277,10 +287,7 @@ struct ggml_graph_node_properties {
            }
        }

-        if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU || node->op == GGML_OP_ROPE){
-            return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
-        }
-        return true;
+        return memcmp(this->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
    }
 };

@ -322,6 +329,7 @@ struct ggml_cann_graph {

            prop.node_address = node->data;
            prop.node_op      = node->op;
+            prop.node_type    = node->type;

            std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
            std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
@ -329,10 +337,12 @@ struct ggml_cann_graph {
            for (int src = 0; src < GGML_MAX_SRC; ++src) {
                if (node->src[src]) {
                    prop.src_address[src] = node->src[src]->data;
+                    prop.src_type[src]    = node->src[src]->type;
                    std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
                    std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
                } else {
                    prop.src_address[src] = nullptr;
+                    prop.src_type[src]    = GGML_TYPE_COUNT;
                    std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
                    std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
                }
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -36,10 +36,13 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <memory>
 #include <mutex>
 #include <optional>
 #include <queue>
+#include <unordered_map>
 #include <unordered_set>
+#include <vector>

 #define GGML_COMMON_DECL_C

@ -770,6 +773,21 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(i
 }

 // cann buffer
+
+/**
+ * @brief Tracks multi-threaded write progress for a single tensor.
+ *
+ * When multiple threads call set_tensor on different chunks of the same tensor,
+ * this tracker accumulates progress and defers post-processing (quantized format
+ * transform or ND-to-NZ conversion) until all data has been written.
+ */
+struct TensorSetTracker {
+    std::mutex mtx;                   ///< Held by set_tensor while it records a chunk and runs the deferred post-processing
+    size_t bytes_written = 0;         ///< Sum of the chunk sizes received so far (overlapping chunks are counted more than once)
+    size_t total_bytes = 0;           ///< ggml_nbytes() of the tensor; post-processing runs once bytes_written reaches it
+    std::vector<uint8_t> host_buffer; ///< Host staging copy for quantized tensors, resized to total_bytes on the first chunk
+};
+
 /**
 * @brief Context for managing a CANN buffer associated with a specific device.
 *
@ -780,6 +798,9 @@ struct ggml_backend_cann_buffer_context {
    int32_t device;             ///< The device ID associated with this buffer context.
    void *  dev_ptr = nullptr;  ///< Pointer to the device memory allocated for the buffer.

+    std::mutex tracker_mutex;   ///< Protects the trackers map
+    std::unordered_map<void *, std::unique_ptr<TensorSetTracker>> trackers;
+
    /**
     * @brief Constructor to initialize the CANN buffer context.
     *
@ -792,6 +813,31 @@ struct ggml_backend_cann_buffer_context {
     * @brief Destructor to free the device memory allocated for the buffer.
     */
    ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
+
+    /**
+     * @brief Return the tracker keyed by tensor->data, creating it (with total_bytes = ggml_nbytes(tensor)) if absent.
+     */
+    TensorSetTracker * get_or_create_tracker(ggml_tensor * tensor) {
+        std::lock_guard<std::mutex> lock(tracker_mutex);  // guards the trackers map during lookup/insert
+        auto key = tensor->data;
+        auto it = trackers.find(key);
+        if (it == trackers.end()) {
+            auto tracker = std::make_unique<TensorSetTracker>();
+            tracker->total_bytes = ggml_nbytes(tensor);
+            auto * ptr = tracker.get();  // take the raw pointer before ownership moves into the map
+            trackers[key] = std::move(tracker);
+            return ptr;
+        }
+        return it->second.get();  // the map keeps ownership; the pointer dangles once the entry is erased
+    }
+
+    /**
+     * @brief Erase the tracker keyed by tensor->data, destroying it (mutex and host buffer included); no-op if none exists.
+     */
+    void remove_tracker(ggml_tensor * tensor) {
+        std::lock_guard<std::mutex> lock(tracker_mutex);  // guards the trackers map during erase
+        trackers.erase(tensor->data);
+    }
 };

 // cann buffer type
@ -1124,6 +1170,7 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer
 * designed to be used with a global array, one per device.
 */
 struct ggml_cann_nz_workspace {
+    std::mutex mtx;    // Protects ptr/allocated from concurrent access
    void * ptr;        // Pointer to allocated device buffer
    size_t allocated;  // Size of currently allocated buffer in bytes

@ -1190,13 +1237,15 @@ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
 * @note The workspace buffer used in this function is managed globally and reused
 *       across calls. This reduces overhead from repeated memory allocation and deallocation.
 */
-static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
-    acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
+static void weight_format_to_nz(ggml_tensor * tensor, int device) {
+    acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, 0);
    uint64_t       workspaceSize    = 0;
    aclOpExecutor * executor;

    // TransMatmulWeight
    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
+
+    std::lock_guard<std::mutex> lock(g_nz_workspaces[device].mtx);
    // Avoid frequent malloc/free of the workspace.
    g_nz_workspaces[device].realloc(workspaceSize);

@ -1210,7 +1259,13 @@ static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device)
 * @brief Set tensor data in a CANN buffer.
 *
 * This function sets tensor data in a CANN buffer, handling transformations
- * if needed based on the tensor's type.
+ * if needed based on the tensor's type. It supports multi-threaded calls
+ * where different threads write different chunks of the same tensor.
+ *
+ * For quantized tensors (Q4_0/Q8_0), data is staged in a host buffer and
+ * the format transform is deferred until all chunks are written.
+ * For NZ weight tensors, chunks are uploaded directly but the ND-to-NZ
+ * conversion is deferred until all chunks are written.
 *
 * @param buffer The CANN buffer where the tensor data will be set.
 * @param tensor Pointer to the tensor whose data will be set.
@ -1226,26 +1281,72 @@ static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
    ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;

    ggml_cann_set_device(ctx->device);
-    // TODO: refer to cann(#6017), it use thread's default stream.
-    // For acl, synchronous functions use this default stream.
-    // Why aclrtSynchronizeDevice?

    // Only check env once.
    static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
-    if (!need_transform(tensor->type)) {
+
+    bool is_quantized = need_transform(tensor->type);
+    bool is_nz        = !is_quantized && tensor->type != GGML_TYPE_BF16 && weight_to_nz &&
+                 is_matmul_weight((const ggml_tensor *) tensor);
+
+    // Plain tensor (not quantized, not NZ): direct copy, no tracking needed
+    if (!is_quantized && !is_nz) {
        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weight_to_nz && tensor->type != GGML_TYPE_BF16
-            && is_matmul_weight((const ggml_tensor *) tensor)) {
+        return;
+    }
+
+    // Single-shot write (full tensor at once): handle directly without tracking overhead
+    if (offset == 0 && size == ggml_nbytes(tensor)) {
+        if (is_quantized) {
+            void * transform_buffer = malloc(size);
+            ggml_backend_cann_transform(tensor, data, transform_buffer);
+            ACL_CHECK(aclrtMemcpy(tensor->data, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
+            free(transform_buffer);
+        } else {
+            // NZ weight
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);
-            weight_format_to_nz(tensor, offset, ctx->device);
+            ACL_CHECK(aclrtMemcpy(tensor->data, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+            weight_format_to_nz(tensor, ctx->device);
        }
-    } else {
-        void * transform_buffer = malloc(size);
-        ggml_backend_cann_transform(tensor, data, transform_buffer);
+        return;
+    }

-        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        free(transform_buffer);
+    // Chunked write: use tracker to accumulate progress and defer transform/conversion
+    TensorSetTracker * tracker = ctx->get_or_create_tracker(tensor);
+    std::unique_lock<std::mutex> lock(tracker->mtx);
+
+    if (is_quantized) {
+        // Stage data in host buffer; transform requires full tensor data
+        if (tracker->host_buffer.empty()) {
+            tracker->host_buffer.resize(tracker->total_bytes);
+        }
+        memcpy(tracker->host_buffer.data() + offset, data, size);
+    } else {
+        // NZ weight: upload chunk to device immediately, defer conversion
+        ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+
+    tracker->bytes_written += size;
+
+    // All chunks received: perform deferred transform/conversion
+    if (tracker->bytes_written >= tracker->total_bytes) {
+        if (is_quantized) {
+            void * transform_buffer = malloc(tracker->total_bytes);
+            ggml_backend_cann_transform(tensor, tracker->host_buffer.data(), transform_buffer);
+            ACL_CHECK(aclrtMemcpy(tensor->data, tracker->total_bytes, transform_buffer, tracker->total_bytes, ACL_MEMCPY_HOST_TO_DEVICE));
+            free(transform_buffer);
+        }
+
+        if (is_nz) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
+            weight_format_to_nz(tensor, ctx->device);
+        }
+
+        // Unlock before removing tracker, as remove_tracker destroys the mutex
+        lock.unlock();
+        ctx->remove_tracker(tensor);
    }
 }

--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@ -460,6 +460,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()
        if(NOT GGML_CPU_ALL_VARIANTS)
            set(MARCH_STR "rv64gc")
+            if (GGML_RVV)
+                string(APPEND MARCH_STR "v")
+            endif()
+
            if (GGML_RV_ZFH)
                string(APPEND MARCH_STR "_zfh")
            endif()
@ -467,7 +471,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            if (GGML_XTHEADVECTOR)
                string(APPEND MARCH_STR "_xtheadvector")
            elseif (GGML_RVV)
-                string(APPEND MARCH_STR "_v")
                if (GGML_RV_ZVFH)
                    string(APPEND MARCH_STR "_zvfh")
                endif()
@ -475,12 +478,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
                    string(APPEND MARCH_STR "_zvfbfwma")
                endif()
            endif()
+
            if (GGML_RV_ZICBOP)
                string(APPEND MARCH_STR "_zicbop")
            endif()
            if (GGML_RV_ZIHINTPAUSE)
                string(APPEND MARCH_STR "_zihintpause")
            endif()
+
            list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
        else()
            # Begin with the lowest baseline
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -2871,8 +2871,12 @@ struct ggml_cplan ggml_graph_plan(
                        const int64_t ne11 = node->src[1]->ne[1]; // H
                        const int64_t ne12 = node->src[1]->ne[2]; // Channels In

-                        cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
-                        cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
+                        GGML_ASSERT(node->src[0]->type == GGML_TYPE_F16 || node->src[0]->type == GGML_TYPE_F32);
+                        GGML_ASSERT(node->src[1]->type == GGML_TYPE_F32);
+
+                        cur += ggml_type_size(node->src[0]->type) * ne00 * ne01 * ne02 * ne03;
+                        cur += ggml_type_size(node->src[0]->type) * ne10 * ne11 * ne12;
+
                    } break;
                case GGML_OP_TOP_K:
                    {
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -6923,16 +6923,15 @@ void ggml_compute_forward_conv_3d(
    ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type);
 }

-// ggml_compute_forward_conv_transpose_2d
-
-void ggml_compute_forward_conv_transpose_2d(
-        const ggml_compute_params * params,
-              ggml_tensor * dst) {
+template <typename kernel_t>
+static void ggml_compute_forward_conv_transpose_2d_impl(
+    const ggml_compute_params * params,
+          ggml_tensor * dst) {

    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

@ -6943,7 +6942,7 @@ void ggml_compute_forward_conv_transpose_2d(

    const int nk = ne00*ne01*ne02*ne03;

-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == ggml_type_size(src0->type));
    GGML_ASSERT(nb10 == sizeof(float));

    if (ith == 0) {
@ -6951,12 +6950,12 @@ void ggml_compute_forward_conv_transpose_2d(

        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
+            kernel_t * const wdata = (kernel_t *) params->wdata + 0;

            for (int64_t i03 = 0; i03 < ne03; i03++) {
                for (int64_t i02 = 0; i02 < ne02; i02++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02);
-                    ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03;
+                    const kernel_t * const src = (kernel_t *)((char *) src0->data + i03*nb03 + i02*nb02);
+                    kernel_t * dst_data = wdata + i02*ne01*ne00*ne03;
                    for (int64_t i01 = 0; i01 < ne01; i01++) {
                        for (int64_t i00 = 0; i00 < ne00; i00++) {
                            dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00];
@ -6968,13 +6967,17 @@ void ggml_compute_forward_conv_transpose_2d(

        // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh)
        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk;
+            kernel_t * const wdata = (kernel_t *) params->wdata + nk;
            for (int i12 = 0; i12 < ne12; i12++) {
                for (int i11 = 0; i11 < ne11; i11++) {
                    const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11);
-                    ggml_fp16_t * dst_data = wdata + i11*ne10*ne12;
+                    kernel_t * dst_data = wdata + i11*ne10*ne12;
                    for (int i10 = 0; i10 < ne10; i10++) {
-                        dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
+                        if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
+                            dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]);
+                        } else {
+                            dst_data[i10*ne12 + i12] = src[i10];
+                        }
                    }
                }
            }
@ -6996,21 +6999,27 @@ void ggml_compute_forward_conv_transpose_2d(
    const int ip0 = dp*ith;
    const int ip1 = MIN(ip0 + dp, np);

-    ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-    ggml_fp16_t * const wdata_src = wdata + nk;
+    kernel_t * const wdata = (kernel_t *) params->wdata + 0;
+    kernel_t * const wdata_src = wdata + nk;

    for (int i2 = ip0; i2 < ip1; i2++) { // Cout
        float * dst_data = (float *)((char *) dst->data + i2*nb2);
-        ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
+        kernel_t * wdata_kernel = wdata + i2*ne01*ne00*ne03;
        for (int i11 = 0; i11 < ne11; i11++) {
            for (int i10 = 0; i10 < ne10; i10++) {
                const int i1n = i11*ne10*ne12 + i10*ne12;
                for (int i01 = 0; i01 < ne01; i01++) {
                    for (int i00 = 0; i00 < ne00; i00++) {
                        float v = 0;
-                        ggml_vec_dot_f16(ne03, &v, 0,
-                                wdata_src + i1n, 0,
-                                wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        if constexpr (std::is_same_v<kernel_t, ggml_fp16_t>) {
+                            ggml_vec_dot_f16(ne03, &v, 0,
+                                    wdata_src + i1n, 0,
+                                    wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        } else {
+                            ggml_vec_dot_f32(ne03, &v, 0,
+                                    wdata_src + i1n, 0,
+                                    wdata_kernel + i01*ne00*ne03 + i00*ne03, 0, 1);
+                        }
                        dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v;
                    }
                }
@ -7019,6 +7028,28 @@ void ggml_compute_forward_conv_transpose_2d(
    }
 }

+void ggml_compute_forward_conv_transpose_2d(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+    // ggml_compute_forward_conv_transpose_2d: pick the templated implementation matching the kernel (src0) element type
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_conv_transpose_2d_impl<ggml_fp16_t>(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_conv_transpose_2d_impl<float>(params, dst);
+            } break;
+        default: // any other src0 type is unsupported
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_conv_2d_dw

 struct ggml_conv_2d_dw_params {
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@ -47,9 +47,11 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
 #ifdef STRIDED_ITERATOR_AVAILABLE
    auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
 #else
-    ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
+    // offset_iterator needs to populate nrows + 1 elements, so we also have to ceildiv nrows + 1 by block_size
+    const int                 nrows_offset = nrows + 1;
+    ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows_offset);
    int *                     offset_iterator = offsets_alloc.get();
-    const dim3                offset_grid((nrows + block_size - 1) / block_size);
+    const dim3                offset_grid((nrows_offset + block_size - 1) / block_size);
    init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows);
 #endif
    CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -799,6 +799,22 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 #endif // CUDART_VERSION >= 12050
 }

+static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
+#ifdef FP8_AVAILABLE
+    const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
+#if defined(GGML_USE_HIP) && defined(CDNA3)
+    // ROCm does not support fp8 in software on devices with fp8 hardware,
+    // but CDNA3 supports only e4m3_fnuz (no inf).
+    const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
+#else
+    const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
+#endif // defined(GGML_USE_HIP) && defined(CDNA3)
+    return static_cast<float>(xf) / 2; // NOTE(review): the halving presumably mirrors the CPU conversion's scaling - confirm
+#else
+    NO_DEVICE_CODE;
+#endif // FP8_AVAILABLE
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
    const uint8_t sign_bit = (x < 0.0f) << 3;
    float         ax       = fabsf(x) * e;
@ -931,6 +947,13 @@ struct ggml_cuda_type_traits<GGML_TYPE_MXFP4> {
    static constexpr int qi = QI_MXFP4;
 };

+template<>
+struct ggml_cuda_type_traits<GGML_TYPE_NVFP4> { // exposes the NVFP4 QK/QR/QI constants under the generic qk/qr/qi names
+    static constexpr int qk = QK_NVFP4;
+    static constexpr int qr = QR_NVFP4;
+    static constexpr int qi = QI_NVFP4;
+};
+
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
--- a/ggml/src/ggml-cuda/conv2d-transpose.cu
+++ b/ggml/src/ggml-cuda/conv2d-transpose.cu
@ -1,12 +1,20 @@
-#include <algorithm>
-
 #include "conv2d-transpose.cuh"
-#include "ggml.h"
+#include "convert.cuh"

-__global__ void conv2d_transpose_kernel(const float * __restrict__ input, const half * __restrict__ kernel,
-                                        float * __restrict__ output, const int in_w, const int in_h, const int out_w,
-                                        const int out_h, const int kernel_w, const int kernel_h, const int stride,
-                                        const int c_in, const int c_out, const int batches) {
+template <typename kernel_t>
+static __global__ void conv2d_transpose_kernel(const float * __restrict__ input,
+                                               const kernel_t * __restrict__ kernel,
+                                               float * __restrict__ output,
+                                               const int in_w,
+                                               const int in_h,
+                                               const int out_w,
+                                               const int out_h,
+                                               const int kernel_w,
+                                               const int kernel_h,
+                                               const int stride,
+                                               const int c_in,
+                                               const int c_out,
+                                               const int batches) {
    const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;

    const int total_elements = out_w * out_h * c_out * batches;
@ -26,24 +34,32 @@ __global__ void conv2d_transpose_kernel(const float * __restrict__ input, const
    for (int c_in_idx = 0; c_in_idx < c_in; c_in_idx++) {
        for (int kh = 0; kh < kernel_h; ++kh) {
            int in_y = out_y_idx - kh;
-            if (in_y < 0 || in_y % stride) continue;
+            if (in_y < 0 || in_y % stride) {
+                continue;
+            }
            in_y /= stride;
-            if (in_y >= in_h) continue;
+            if (in_y >= in_h) {
+                continue;
+            }

            for (int kw = 0; kw < kernel_w; ++kw) {
                int in_x = out_x_idx - kw;
-                if (in_x < 0 || in_x % stride) continue;
+                if (in_x < 0 || in_x % stride) {
+                    continue;
+                }
                in_x /= stride;
-                if (in_x >= in_w) continue;
+                if (in_x >= in_w) {
+                    continue;
+                }

                const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + (in_w) *in_y + in_x;
                const int kernel_idx =
                    (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + (kernel_w) *kh + kw;

-                float input_val = input[input_idx];
-                half  kern_val  = kernel[kernel_idx];
+                float    input_val = input[input_idx];
+                kernel_t kern_val  = kernel[kernel_idx];

-                accumulator += input_val * (float) kern_val;
+                accumulator += input_val * ggml_cuda_cast<float>(kern_val);
            }
        }
    }
@ -56,11 +72,12 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
    const ggml_tensor * kernel = dst->src[0];
    const ggml_tensor * input  = dst->src[1];

-    GGML_ASSERT(kernel->type == GGML_TYPE_F16 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
+    GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);

    const float * input_data  = (const float *) input->data;
    float *       output_data = (float *) dst->data;
-    const half * kernel_data = (const half *) kernel->data;
+    const void *  kernel_data = kernel->data;

    const int input_w      = input->ne[0];
    const int input_h      = input->ne[1];
@ -82,10 +99,17 @@ void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor
    GGML_ASSERT(ggml_is_contiguous(kernel));
    GGML_ASSERT(ggml_is_contiguous(dst));

-    const int total  = (output_w * output_h * channels_out * batches);
+    const int total  = output_w * output_h * channels_out * batches;
    const int blocks = (total + CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE;

-    conv2d_transpose_kernel<<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
-        input_data, kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w, kernel_h, stride,
-        channels_in, channels_out, batches);
+    if (kernel->type == GGML_TYPE_F16) {
+        conv2d_transpose_kernel<half><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+            input_data, (const half *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
+            kernel_h, stride, channels_in, channels_out, batches);
+
+    } else {
+        conv2d_transpose_kernel<float><<<blocks, CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE, 0, st>>>(
+            input_data, (const float *) kernel_data, output_data, input_w, input_h, output_w, output_h, kernel_w,
+            kernel_h, stride, channels_in, channels_out, batches);
+    }
 }
--- a/ggml/src/ggml-cuda/conv2d-transpose.cuh
+++ b/ggml/src/ggml-cuda/conv2d-transpose.cuh
@ -1,4 +1,5 @@
 #include "common.cuh"

 #define CUDA_CONV2D_TRANSPOSE_BLOCK_SIZE 256
+
 void ggml_cuda_conv_2d_transpose_p0(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@ -617,6 +617,45 @@ static void dequantize_row_mxfp4_cuda(const void * vx, dst_t * y, const int64_t
    dequantize_block_mxfp4<<<nb, 32, 0, stream>>>(vx, y);
 }

+template <typename dst_t> // dst_t: output element type; values are produced via ggml_cuda_cast<dst_t>
+static __global__ void dequantize_block_nvfp4( // each CUDA block dequantizes one block_nvfp4; each thread writes two outputs
+        const void * __restrict__ vx,
+        dst_t * __restrict__ yy,
+        const int64_t ne) { // ne: number of output elements, used only for the per-block bound check below
+    const int64_t i = blockIdx.x; // index of the block_nvfp4 handled by this CUDA block
+    const int     tid = threadIdx.x;
+
+    const int64_t base = i * QK_NVFP4; // first output element of this block
+    if (base >= ne) { // the whole block starts past the end of the output
+        return;
+    }
+
+    const block_nvfp4 * x = (const block_nvfp4 *) vx;
+    const block_nvfp4 & xb = x[i];
+
+    const int sub = tid / (QK_NVFP4_SUB / 2); // sub-block index, selects the scale xb.d[sub]
+    const int j = tid % (QK_NVFP4_SUB / 2); // byte index within the sub-block
+
+    const float d = ggml_cuda_ue4m3_to_fp32(xb.d[sub]); // per-sub-block scale converted to float
+    const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j]; // one byte holds two 4-bit table indices
+
+    const int64_t y0 = base + sub * QK_NVFP4_SUB + j; // output slot for the low nibble
+    const int64_t y1 = y0 + QK_NVFP4_SUB / 2; // output slot for the high nibble, half a sub-block later
+
+    yy[y0] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]); // nibbles index the kvalues_mxfp4 table
+    yy[y1] = ggml_cuda_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
+}
+
+template <typename dst_t> // host launcher: dequantizes k NVFP4 values from vx into y on the given stream
+static void dequantize_row_nvfp4_cuda(
+        const void * vx,
+        dst_t * y,
+        const int64_t k,
+        cudaStream_t stream) {
+    GGML_ASSERT(k % QK_NVFP4 == 0); // only whole blocks are supported
+    const int nb = k / QK_NVFP4; // number of blocks; NOTE(review): int64_t narrowed to int here
+    dequantize_block_nvfp4<<<nb, 32, 0, stream>>>(vx, y, k); // one CUDA block of 32 threads per block_nvfp4
+}
 template <typename src_t, typename dst_t>
 static __global__ void convert_unary(
        const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
@ -715,6 +754,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_MXFP4:
            return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_NVFP4:
+            return dequantize_row_nvfp4_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cont_cuda<float>;
        case GGML_TYPE_BF16:
@ -766,6 +807,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_MXFP4:
            return dequantize_row_mxfp4_cuda;
+        case GGML_TYPE_NVFP4:
+            return dequantize_row_nvfp4_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cont_cuda<half>;
        case GGML_TYPE_BF16:
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -1297,7 +1297,12 @@ static void ggml_cuda_op_mul_mat_cublas(
    const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) ||
        (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2);

-    const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
+    const bool use_fp16 =
+        src0->type != GGML_TYPE_NVFP4 &&
+        (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        ggml_is_contiguous(src0) &&
+        row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT;

    if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
        ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
@ -2338,7 +2343,8 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE);
        if (ne2 <= MMVQ_MAX_BATCH_SIZE) {
            if (ggml_is_quantized(src0->type)) {
-                if (ne2 <= MMVQ_MMID_MAX_BATCH_SIZE) {
+                const int mmvq_mmid_max = get_mmvq_mmid_max_batch(src0->type, cc);
+                if (ne2 <= mmvq_mmid_max) {
                    ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
                    return;
                }
@ -2941,14 +2947,18 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
        }

        // [TAG_MUL_MAT_ID_CUDA_GRAPHS]
-        if (node->op == GGML_OP_MUL_MAT_ID && (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > MMVQ_MMID_MAX_BATCH_SIZE)) {
-            // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
-            // TODO: figure out a way to enable for larger batch sizes, without hurting performance
-            // ref: https://github.com/ggml-org/llama.cpp/pull/18958
-            use_cuda_graph = false;
+        if (node->op == GGML_OP_MUL_MAT_ID) {
+            const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+            const int mmvq_mmid_max = get_mmvq_mmid_max_batch(node->src[0]->type, cc);
+            if (!ggml_is_quantized(node->src[0]->type) || node->ne[2] > mmvq_mmid_max) {
+                // under these conditions, the mul_mat_id operation will need to synchronize the stream, so we cannot use CUDA graphs
+                // TODO: figure out a way to enable for larger batch sizes, without hurting performance
+                // ref: https://github.com/ggml-org/llama.cpp/pull/18958
+                use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
 #endif
+            }
        }

        if (!use_cuda_graph) {
@ -4781,6 +4791,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                    case GGML_TYPE_Q5_1:
                    case GGML_TYPE_Q8_0:
                    case GGML_TYPE_MXFP4:
+#ifdef FP8_AVAILABLE
+                    case GGML_TYPE_NVFP4:
+#endif // FP8_AVAILABLE
                    case GGML_TYPE_Q2_K:
                    case GGML_TYPE_Q3_K:
                    case GGML_TYPE_Q4_K:
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@ -15,6 +15,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
        case GGML_TYPE_Q5_1:    return vec_dot_q5_1_q8_1;
        case GGML_TYPE_Q8_0:    return vec_dot_q8_0_q8_1;
        case GGML_TYPE_MXFP4:   return vec_dot_mxfp4_q8_1;
+        case GGML_TYPE_NVFP4:   return vec_dot_nvfp4_q8_1;
        case GGML_TYPE_Q2_K:    return vec_dot_q2_K_q8_1;
        case GGML_TYPE_Q3_K:    return vec_dot_q3_K_q8_1;
        case GGML_TYPE_Q4_K:    return vec_dot_q4_K_q8_1;
@ -41,6 +42,7 @@ static constexpr __host__ __device__ int get_vdr_mmvq(ggml_type type) {
        case GGML_TYPE_Q5_1:    return VDR_Q5_1_Q8_1_MMVQ;
        case GGML_TYPE_Q8_0:    return VDR_Q8_0_Q8_1_MMVQ;
        case GGML_TYPE_MXFP4:   return VDR_MXFP4_Q8_1_MMVQ;
+        case GGML_TYPE_NVFP4:   return VDR_NVFP4_Q8_1_MMVQ;
        case GGML_TYPE_Q2_K:    return VDR_Q2_K_Q8_1_MMVQ;
        case GGML_TYPE_Q3_K:    return VDR_Q3_K_Q8_1_MMVQ;
        case GGML_TYPE_Q4_K:    return VDR_Q4_K_Q8_1_MMVQ;
@ -95,6 +97,194 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
    return MMVQ_PARAMETERS_GENERIC;
 }

+// Per-architecture maximum batch size for which MMVQ should be used for MUL_MAT_ID.
+// Returns a value <= MMVQ_MAX_BATCH_SIZE. Default is MMVQ_MAX_BATCH_SIZE.
+// Check https://github.com/ggml-org/llama.cpp/pull/20905#issuecomment-4145835627 for details
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(ggml_type type) { // limits for NVIDIA devices below Turing (Volta excluded by the callers); also the device-side #else fallback
+    switch (type) {
+        case GGML_TYPE_IQ1_S:   return 6;
+        case GGML_TYPE_IQ1_M:   return 6;
+        case GGML_TYPE_IQ2_S:   return 4;
+        case GGML_TYPE_IQ2_XS:  return 5;
+        case GGML_TYPE_IQ2_XXS: return 5;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 4;
+        case GGML_TYPE_IQ4_NL:  return 6;
+        case GGML_TYPE_IQ4_XS:  return 5;
+        case GGML_TYPE_MXFP4:   return 4;
+        case GGML_TYPE_Q2_K:    return 4;
+        case GGML_TYPE_Q3_K:    return 4;
+        case GGML_TYPE_Q4_0:    return 6;
+        case GGML_TYPE_Q4_1:    return 6;
+        case GGML_TYPE_Q4_K:    return 5;
+        case GGML_TYPE_Q5_0:    return 6;
+        case GGML_TYPE_Q5_1:    return 6;
+        case GGML_TYPE_Q5_K:    return 5;
+        case GGML_TYPE_Q6_K:    return 4;
+        case GGML_TYPE_Q8_0:    return 4;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggml_type type) { // limits used for cc >= Turing that is below Ada Lovelace
+    switch (type) {
+        case GGML_TYPE_IQ2_S:   return 7;
+        case GGML_TYPE_IQ3_S:   return 6;
+        case GGML_TYPE_IQ3_XXS: return 7;
+        case GGML_TYPE_MXFP4:   return 7;
+        case GGML_TYPE_Q2_K:    return 7;
+        case GGML_TYPE_Q3_K:    return 5;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_gcn(ggml_type type) { // limits selected by GGML_CUDA_CC_IS_GCN(cc) on the host and by the GCN macro in device code
+    switch (type) {
+        case GGML_TYPE_IQ1_S:   return 5;
+        case GGML_TYPE_IQ1_M:   return 5;
+        case GGML_TYPE_IQ2_S:   return 4;
+        case GGML_TYPE_IQ2_XS:  return 4;
+        case GGML_TYPE_IQ2_XXS: return 4;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 4;
+        case GGML_TYPE_IQ4_NL:  return 6;
+        case GGML_TYPE_IQ4_XS:  return 4;
+        case GGML_TYPE_Q2_K:    return 4;
+        case GGML_TYPE_Q3_K:    return 4;
+        case GGML_TYPE_Q4_0:    return 5;
+        case GGML_TYPE_Q4_1:    return 5;
+        case GGML_TYPE_Q4_K:    return 4;
+        case GGML_TYPE_Q5_K:    return 4;
+        case GGML_TYPE_Q6_K:    return 4;
+        case GGML_TYPE_Q8_0:    return 4;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_cdna(ggml_type type) { // limits selected by GGML_CUDA_CC_IS_CDNA(cc) on the host and by the CDNA macro in device code
+    switch (type) {
+        case GGML_TYPE_IQ2_S:   return 5;
+        case GGML_TYPE_IQ2_XS:  return 5;
+        case GGML_TYPE_IQ2_XXS: return 5;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 5;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna1_rdna2(ggml_type type) { // limits shared by RDNA1 and RDNA2 (host: GGML_CUDA_CC_IS_RDNA1/2, device: RDNA1/RDNA2 macros)
+    switch (type) {
+        case GGML_TYPE_IQ2_S:   return 4;
+        case GGML_TYPE_IQ2_XS:  return 4;
+        case GGML_TYPE_IQ2_XXS: return 4;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 4;
+        case GGML_TYPE_Q2_K:    return 7;
+        case GGML_TYPE_Q3_K:    return 4;
+        case GGML_TYPE_Q4_K:    return 5;
+        case GGML_TYPE_Q5_K:    return 6;
+        case GGML_TYPE_Q6_K:    return 5;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna3(ggml_type type) { // limits selected by GGML_CUDA_CC_IS_RDNA3(cc) on the host and by the RDNA3 macro in device code
+    switch (type) {
+        case GGML_TYPE_IQ1_S:   return 6;
+        case GGML_TYPE_IQ1_M:   return 6;
+        case GGML_TYPE_IQ2_S:   return 4;
+        case GGML_TYPE_IQ2_XS:  return 4;
+        case GGML_TYPE_IQ2_XXS: return 4;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 4;
+        case GGML_TYPE_IQ4_NL:  return 6;
+        case GGML_TYPE_IQ4_XS:  return 6;
+        case GGML_TYPE_Q4_K:    return 4;
+        case GGML_TYPE_Q5_K:    return 4;
+        case GGML_TYPE_Q6_K:    return 4;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type type) { // limits selected by GGML_CUDA_CC_IS_RDNA4(cc) on the host and by the RDNA4 macro in device code
+    switch (type) {
+        case GGML_TYPE_IQ1_S:   return 7;
+        case GGML_TYPE_IQ1_M:   return 7;
+        case GGML_TYPE_IQ2_S:   return 4;
+        case GGML_TYPE_IQ2_XS:  return 4;
+        case GGML_TYPE_IQ2_XXS: return 4;
+        case GGML_TYPE_IQ3_S:   return 4;
+        case GGML_TYPE_IQ3_XXS: return 4;
+        case GGML_TYPE_IQ4_NL:  return 7;
+        case GGML_TYPE_IQ4_XS:  return 5;
+        case GGML_TYPE_MXFP4:   return 5;
+        case GGML_TYPE_Q3_K:    return 4;
+        case GGML_TYPE_Q4_0:    return 7;
+        case GGML_TYPE_Q4_1:    return 7;
+        case GGML_TYPE_Q4_K:    return 4;
+        case GGML_TYPE_Q5_0:    return 7;
+        case GGML_TYPE_Q5_1:    return 7;
+        case GGML_TYPE_Q5_K:    return 5;
+        case GGML_TYPE_Q6_K:    return 5;
+        case GGML_TYPE_Q8_0:    return 7;
+        default:                return MMVQ_MAX_BATCH_SIZE; // types not listed keep the general MMVQ limit
+    }
+}
+
+// Host function: returns the max batch size for the given arch (cc) + type at runtime. Must agree with get_mmvq_mmid_max_batch_for_device(), which sizes the MoE kernel's __launch_bounds__.
+int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
+    if (GGML_CUDA_CC_IS_NVIDIA(cc)) { // vendor gate: non-NVIDIA cc values are presumably offset above the NVIDIA range (verify in common.cuh) and would otherwise pass the raw >= checks
+        // NVIDIA: Volta, Ada Lovelace, and Blackwell always use MMVQ for MUL_MAT_ID.
+        if (cc == GGML_CUDA_CC_VOLTA || cc >= GGML_CUDA_CC_ADA_LOVELACE) {
+            return MMVQ_MAX_BATCH_SIZE;
+        }
+        if (cc >= GGML_CUDA_CC_TURING) {
+            return get_mmvq_mmid_max_batch_turing_plus(type);
+        }
+        return get_mmvq_mmid_max_batch_pascal_older(type);
+    }
+    // AMD: per-architecture tables, checked in the same order as the device-side preprocessor chain.
+    if (GGML_CUDA_CC_IS_RDNA4(cc)) {
+        return get_mmvq_mmid_max_batch_rdna4(type);
+    }
+    if (GGML_CUDA_CC_IS_RDNA3(cc)) {
+        return get_mmvq_mmid_max_batch_rdna3(type);
+    }
+    if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA2(cc)) {
+        return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
+    }
+    if (GGML_CUDA_CC_IS_CDNA(cc)) {
+        return get_mmvq_mmid_max_batch_cdna(type);
+    }
+    if (GGML_CUDA_CC_IS_GCN(cc)) {
+        return get_mmvq_mmid_max_batch_gcn(type);
+    }
+    return MMVQ_MAX_BATCH_SIZE; // any other device: no tuned limit, use the general MMVQ cap
+}
+
+// Device constexpr: returns the max batch size for the current arch+type at compile time. Compile-time counterpart of the runtime get_mmvq_mmid_max_batch(); used in the __launch_bounds__ of mul_mat_vec_q_moe.
+template <ggml_type type>
+static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
+#if defined(RDNA4) // AMD arch macros are tested before the NVIDIA __CUDA_ARCH__ checks
+    return get_mmvq_mmid_max_batch_rdna4(type);
+#elif defined(RDNA3)
+    return get_mmvq_mmid_max_batch_rdna3(type);
+#elif defined(RDNA2) || defined(RDNA1)
+    return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
+#elif defined(CDNA)
+    return get_mmvq_mmid_max_batch_cdna(type);
+#elif defined(GCN)
+    return get_mmvq_mmid_max_batch_gcn(type);
+#elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ >= GGML_CUDA_CC_ADA_LOVELACE)
+    return MMVQ_MAX_BATCH_SIZE;
+#elif defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
+    return get_mmvq_mmid_max_batch_turing_plus(type);
+#else // no branch above matched; this includes compilation passes where __CUDA_ARCH__ is not defined
+    return get_mmvq_mmid_max_batch_pascal_older(type);
+#endif
+}
+
 static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_dst, mmvq_parameter_table_id table_id) {
    if (table_id == MMVQ_PARAMETERS_GENERIC) {
        switch (ncols_dst) {
@ -193,7 +383,7 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
    return 1;
 }

-template <ggml_type type, int ncols_dst, bool has_fusion, bool is_multi_token_id = false, bool small_k = false>
+template <ggml_type type, int ncols_dst, bool has_fusion, bool small_k = false>
 __launch_bounds__(calc_nwarps(type, ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void mul_mat_vec_q(
        const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
@ -220,22 +410,13 @@ static __global__ void mul_mat_vec_q(

    const uint32_t channel_dst = blockIdx.y;

-    uint32_t token_idx = 0;
    uint32_t channel_x;
    uint32_t channel_y;
    uint32_t sample_dst;

-    if constexpr (is_multi_token_id) {
-        // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case
-        token_idx  = blockIdx.z;
-        channel_x  = ids[channel_dst + token_idx * ids_stride];
-        channel_y  = fastmodulo(channel_dst, nchannels_y);
-        sample_dst = 0;
-    } else {
-        channel_x  = ncols_dst == 1 && ids ? ids[channel_dst]                     : fastdiv(channel_dst, channel_ratio);
-        channel_y  = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
-        sample_dst = blockIdx.z;
-    }
+    channel_x  = ncols_dst == 1 && ids ? ids[channel_dst]                     : fastdiv(channel_dst, channel_ratio);
+    channel_y  = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
+    sample_dst = blockIdx.z;

    const uint32_t sample_x    = fastdiv(sample_dst, sample_ratio);
    const uint32_t sample_y    = sample_dst;
@ -292,9 +473,6 @@ static __global__ void mul_mat_vec_q(
    float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};

    const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
-    if constexpr (is_multi_token_id) {
-        y += token_idx*stride_col_y;
-    }
    const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;

    for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
@ -348,10 +526,6 @@ static __global__ void mul_mat_vec_q(

    dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0;

-    if constexpr (is_multi_token_id) {
-        dst += token_idx*stride_col_dst;
-    }
-
    // sum up partial sums and write back result
 #pragma unroll
    for (int j = 0; j < ncols_dst; ++j) {
@ -411,6 +585,69 @@ static __global__ void mul_mat_vec_q(
    }
 }

+// Dedicated MoE multi-token kernel.
+// Grid: (ceil(nrows_x / c_rows_per_block), nchannels_dst)
+// Block: (warp_size, ncols_dst) - each warp handles one token independently.
+// No shared memory reduction needed since each warp works alone.
+template <ggml_type type, int c_rows_per_block>
+__launch_bounds__(get_mmvq_mmid_max_batch_for_device<type>()*ggml_cuda_get_physical_warp_size(), 1)
+static __global__ void mul_mat_vec_q_moe(
+        const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids,
+        float * __restrict__ dst,
+        const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
+        const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
+        const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
+        const uint32_t ncols_dst, const uint32_t ids_stride) {
+
+    constexpr int qk  = ggml_cuda_type_traits<type>::qk;
+    constexpr int qi  = ggml_cuda_type_traits<type>::qi;
+    constexpr int vdr = get_vdr_mmvq(type);
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
+
+    const uint32_t token_idx   = threadIdx.y; // the y index of the thread selects the token
+    const int      row0        = c_rows_per_block*blockIdx.x; // first row of x handled by this block
+    const int      blocks_per_row_x = ncols_x / qk;
+    constexpr int  blocks_per_iter  = vdr * warp_size / qi; // quantized blocks consumed per loop iteration by one warp
+
+    const uint32_t channel_dst = blockIdx.y;
+
+    if (token_idx >= ncols_dst) { // threads whose y index is not a valid token do no work
+        return;
+    }
+
+    const uint32_t channel_x = ids[channel_dst + token_idx * ids_stride]; // x channel is looked up in ids per (channel_dst, token)
+    const uint32_t channel_y = fastmodulo(channel_dst, nchannels_y);
+
+    const block_q8_1 * y = ((const block_q8_1 *) vy) + channel_y*stride_channel_y + token_idx*stride_col_y;
+    const int kbx_offset  = channel_x*stride_channel_x + row0*stride_row_x;
+
+    // partial sum for each thread
+    float tmp[c_rows_per_block] = {0.0f};
+
+    for (int kbx = threadIdx.x / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
+        const int kby = kbx * (qk/QK8_1); // matching block offset into y
+        const int kqs = vdr * (threadIdx.x % (qi/vdr)); // position within the quantized block for this thread
+
+#pragma unroll
+        for (int i = 0; i < c_rows_per_block; ++i) {
+            tmp[i] += vec_dot_q_cuda(vx, &y[kby], kbx_offset + i*stride_row_x + kbx, kqs);
+        }
+    }
+
+    // Warp-level reduction only - no shared memory needed
+#pragma unroll
+    for (int i = 0; i < c_rows_per_block; ++i) {
+        tmp[i] = warp_reduce_sum<warp_size>(tmp[i]);
+    }
+
+    // Write results
+    if (threadIdx.x < c_rows_per_block && (c_rows_per_block == 1 || uint32_t(row0 + threadIdx.x) < nrows_x)) { // thread i stores row row0+i; the row bound is only checked when a block covers several rows
+        dst[channel_dst*stride_channel_dst + token_idx*stride_col_dst + row0 + threadIdx.x] = tmp[threadIdx.x];
+    }
+}
+
 template<ggml_type type>
 static std::pair<dim3, dim3> calc_launch_params(
        const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens,
@ -423,7 +660,7 @@ static std::pair<dim3, dim3> calc_launch_params(
    return {block_nums, block_dims};
 }

-template<ggml_type type, int c_ncols_dst, bool is_multi_token_id = false, bool small_k = false>
+template<ggml_type type, int c_ncols_dst, bool small_k = false>
 static void mul_mat_vec_q_switch_fusion(
        const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
        const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
@ -436,7 +673,7 @@ static void mul_mat_vec_q_switch_fusion(
    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
    if constexpr (c_ncols_dst == 1) {
        if (has_fusion) {
-            mul_mat_vec_q<type, c_ncols_dst, true, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
+            mul_mat_vec_q<type, c_ncols_dst, true, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
                (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
@ -446,12 +683,33 @@ static void mul_mat_vec_q_switch_fusion(

    GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");

-    mul_mat_vec_q<type, c_ncols_dst, false, is_multi_token_id, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
+    mul_mat_vec_q<type, c_ncols_dst, false, small_k><<<block_nums, block_dims, nbytes_shared, stream>>>
        (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
        channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
        sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride);
 }

+template <ggml_type type> // host-side launcher for the mul_mat_vec_q_moe kernel
+static void mul_mat_vec_q_moe_launch(
+        const void * vx, const void * vy, const int32_t * ids, float * dst,
+        const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t nrows_x,
+        const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst,
+        const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst,
+        const uint32_t ncols_dst, const uint32_t ids_stride,
+        const int warp_size, const int nchannels_dst, cudaStream_t stream) {
+
+    constexpr int rows_per_block = 2; // 2 gives best perf based on tuning
+    const int64_t nblocks_rows = (nrows_x + rows_per_block - 1) / rows_per_block; // ceil(nrows_x / rows_per_block)
+    const dim3 block_nums(nblocks_rows, nchannels_dst); // grid: x = row blocks, y = dst channels
+    const dim3 block_dims(warp_size, ncols_dst); // block: warp_size threads per token; ncols_dst must stay within the kernel's __launch_bounds__
+
+    mul_mat_vec_q_moe<type, rows_per_block><<<block_nums, block_dims, 0, stream>>>( // launched without dynamic shared memory
+        vx, vy, ids, dst, ncols_x, nchannels_y, nrows_x,
+        stride_row_x, stride_col_y, stride_col_dst,
+        stride_channel_x, stride_channel_y, stride_channel_dst,
+        ncols_dst, ids_stride);
+}
+
 template <ggml_type type>
 static void mul_mat_vec_q_switch_ncols_dst(
        const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
@ -470,20 +728,62 @@ static void mul_mat_vec_q_switch_ncols_dst(
    const uint3 sample_ratio_fd  = init_fastdiv_values(nsamples_dst  / nsamples_x);

    const int device = ggml_cuda_get_device();
+    const int                     cc        = ggml_cuda_info().devices[device].cc;
    const int warp_size = ggml_cuda_info().devices[device].warp_size;
-    const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);
+    const mmvq_parameter_table_id table_id  = get_device_table_id(cc);

    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
    const bool has_ids = ids != nullptr;

+    const auto should_use_small_k = [&](int c_ncols_dst) {
+        // When K is small, increase rows_per_block to match nwarps so each warp has more work to do
+        // Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
+        constexpr int qk                    = ggml_cuda_type_traits<type>::qk;
+        constexpr int qi                    = ggml_cuda_type_traits<type>::qi;
+        constexpr int vdr                   = get_vdr_mmvq(type);
+        const int     blocks_per_row_x      = ncols_x / qk;
+        const int     blocks_per_iter_1warp = vdr * warp_size / qi;
+        const int     nwarps                = calc_nwarps(type, c_ncols_dst, table_id);
+        bool          use                   = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
+
+        constexpr std::array<ggml_type, 2> iq_slow_turing = {
+            GGML_TYPE_IQ3_XXS,
+            GGML_TYPE_IQ3_S,
+        };
+        constexpr std::array<ggml_type, 8> iq_slow_other = {
+            GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,   GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
+            GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S,   GGML_TYPE_IQ4_XS,
+        };
+        constexpr std::array<ggml_type, 3> slow_pascal = {
+            GGML_TYPE_IQ3_S,
+            GGML_TYPE_Q2_K,
+            GGML_TYPE_Q3_K,
+        };
+
+        const bool is_nvidia_turing_plus  = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_TURING;
+        const bool is_nvidia_pascal_older = GGML_CUDA_CC_IS_NVIDIA(cc) && cc < GGML_CUDA_CC_VOLTA;
+
+        if (is_nvidia_turing_plus) {
+            if (ncols_dst == 1 &&
+                    std::find(iq_slow_turing.begin(), iq_slow_turing.end(), type) != iq_slow_turing.end()) {
+                use = false;
+            }
+        } else if ((ncols_dst == 1 && std::find(iq_slow_other.begin(), iq_slow_other.end(), type) != iq_slow_other.end()) ||
+                (is_nvidia_pascal_older && std::find(slow_pascal.begin(), slow_pascal.end(), type) != slow_pascal.end()) ||
+                GGML_CUDA_CC_IS_RDNA(cc)) {
+            use = false;
+        }
+
+        return use;
+    };
+
    if (has_ids && ncols_dst > 1) {
-        // Multi-token MUL_MAT_ID path only - single-token goes through regular path below
-        constexpr int c_ncols_dst = 1;
-        std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id);
-        mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
-             channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
-             sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
-             dims.first, dims.second, 0, ids_stride, stream);
+        // Multi-token MUL_MAT_ID path - dedicated MoE kernel
+        mul_mat_vec_q_moe_launch<type>(
+            vx, vy, ids, dst, ncols_x, nchannels_y_fd, nrows_x,
+            stride_row_x, stride_col_y, stride_col_dst,
+            stride_channel_x, stride_channel_y, stride_channel_dst,
+            ncols_dst, ids_stride, warp_size, nchannels_dst, stream);
        return;
    }

@ -491,31 +791,24 @@ static void mul_mat_vec_q_switch_ncols_dst(
        case 1: {
            constexpr int c_ncols_dst = 1;

-            // When K is small, increase rows_per_block to match nwarps so each warp has more work to do
-            // Trigger when the full thread block covers all K blocks in a single loop iteration and few threads remain idle.
-            constexpr int qk  = ggml_cuda_type_traits<type>::qk;
-            constexpr int qi  = ggml_cuda_type_traits<type>::qi;
-            constexpr int vdr = get_vdr_mmvq(type);
-            const int blocks_per_row_x = ncols_x / qk;
-            const int blocks_per_iter_1warp = vdr * warp_size / qi;
-            const int nwarps = calc_nwarps(type, c_ncols_dst, table_id);
-            const bool use_small_k = nwarps > 1 && blocks_per_row_x < nwarps * blocks_per_iter_1warp;
+            bool use_small_k = should_use_small_k(c_ncols_dst);
+
            if (use_small_k) {
-                std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
-                                                                    warp_size, table_id, true);
-                mul_mat_vec_q_switch_fusion<type, c_ncols_dst, false, true>(
+                std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
+                                                                        nsamples_dst, warp_size, table_id, true);
+                mul_mat_vec_q_switch_fusion<type, c_ncols_dst, true>(
                    vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
-                    channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
-                    sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
-                    dims.first, dims.second, 0, ids_stride, stream);
+                    channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
+                    stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
+                    stream);
            } else {
-                std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst,
-                                                                    warp_size, table_id);
+                std::pair<dim3, dim3> dims = calc_launch_params<type>(c_ncols_dst, nrows_x, nchannels_dst,
+                                                                        nsamples_dst, warp_size, table_id);
                mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(
                    vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
-                    channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
-                    sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
-                    dims.first, dims.second, 0, ids_stride, stream);
+                    channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd,
+                    stride_sample_x, stride_sample_y, stride_sample_dst, dims.first, dims.second, 0, ids_stride,
+                    stream);
            }
        } break;
        case 2: {
@ -626,6 +919,12 @@ static void mul_mat_vec_q_switch_type(
                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
            break;
+        case GGML_TYPE_NVFP4:
+            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_NVFP4>
+                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
+                 nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
+                 nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
+            break;
        case GGML_TYPE_Q2_K:
            mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
                (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
--- a/ggml/src/ggml-cuda/mmvq.cuh
+++ b/ggml/src/ggml-cuda/mmvq.cuh
@ -1,7 +1,10 @@
 #include "common.cuh"

 #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
-#define MMVQ_MMID_MAX_BATCH_SIZE 4 // Max. batch size for which to use MMVQ kernels for MUL_MAT_ID
+
+// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
+// based on the quantization type and GPU architecture (compute capability).
+int get_mmvq_mmid_max_batch(ggml_type type, int cc);

 void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
--- a/ggml/src/ggml-cuda/vecdotq.cuh
+++ b/ggml/src/ggml-cuda/vecdotq.cuh
@ -322,6 +322,38 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
    return d * sumi;
 }

+#define VDR_NVFP4_Q8_1_MMVQ 4
+#define VDR_NVFP4_Q8_1_MMQ  8
+
+// Partial dot product between one NVFP4 weight block (block index kbx inside vbq) and
+// q8_1-quantized activations, starting at int offset iqs inside the block's packed quants.
+// The loop runs VDR_NVFP4_Q8_1_MMVQ/2 times; every iteration consumes two consecutive ints
+// of packed 4-bit codes (iqs0, iqs1) that share the single scale bq4->d[is].
+// Returns the float sum of the per-iteration scaled integer dot products.
+// NOTE(review): block_nvfp4 / block_q8_1 are declared elsewhere; their layout is not verified here.
+static __device__ __forceinline__ float vec_dot_nvfp4_q8_1(
+                                        const void * __restrict__ vbq,
+                                        const block_q8_1 * __restrict__ bq8_1,
+                                        const int32_t & kbx,
+                                        const int32_t & iqs) {
+
+    const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq + kbx;
+    float sum = 0.0f;
+#pragma unroll
+    for (int i = 0; i < VDR_NVFP4_Q8_1_MMVQ/2; i++) {
+        const int32_t iqs0 = iqs + 2*i;
+        const int32_t iqs1 = iqs0 + 1;
+        // One scale entry per pair of quant ints.
+        const int32_t is = iqs0 >> 1;
+        // 4-bit codes are mapped through the kvalues_mxfp4 table (same table as the MXFP4 path).
+        const int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4);
+        const int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4);
+        // Two scale groups map onto one q8_1 block: (is >> 1) selects the block,
+        // (is & 1) selects an int offset of 0 or 4 inside its qs.
+        const block_q8_1 * bq8 = bq8_1 + (is >> 1);
+        const int32_t i8 = ((is & 1) << 2);
+
+        // The .x/.y halves of each lookup pair with q8 ints two apart (0/2 and 1/3);
+        // presumably the lookup separates low and high nibbles — TODO confirm against get_int_from_table_16.
+        int sumi = ggml_cuda_dp4a(v0.x, get_int_b4(bq8->qs, i8 + 0), 0);
+        sumi = ggml_cuda_dp4a(v0.y, get_int_b4(bq8->qs, i8 + 2), sumi);
+        sumi = ggml_cuda_dp4a(v1.x, get_int_b4(bq8->qs, i8 + 1), sumi);
+        sumi = ggml_cuda_dp4a(v1.y, get_int_b4(bq8->qs, i8 + 3), sumi);
+
+        // Combined scale: weight scale (converted by ggml_cuda_ue4m3_to_fp32) times the low half of bq8->ds.
+        const float d = ggml_cuda_ue4m3_to_fp32(bq4->d[is]) * __low2float(bq8->ds);
+        sum += d * float(sumi);
+    }
+
+    return sum;
+}
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4

--- a/ggml/src/ggml-cuda/vendors/cuda.h
+++ b/ggml/src/ggml-cuda/vendors/cuda.h
@ -6,9 +6,10 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>

-#if CUDART_VERSION >= 12050
+#if CUDART_VERSION >= 11080
 #include <cuda_fp8.h>
-#endif // CUDART_VERSION >= 12050
+#define FP8_AVAILABLE
+#endif // CUDART_VERSION >= 11080

 #if CUDART_VERSION >= 12080
 #include <cuda_fp4.h>
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@ -235,6 +235,12 @@
 typedef __hip_bfloat16 nv_bfloat16;
 typedef __hip_bfloat162 nv_bfloat162;

+#if HIP_VERSION >= 60200000
+#include <hip/hip_fp8.h>
+typedef __hip_fp8_e4m3 __nv_fp8_e4m3;
+#define FP8_AVAILABLE
+#endif // HIP_VERSION >= 60200000
+
 typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
 typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
 static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@ -1406,6 +1406,13 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
            repack_q8_0_q8x4x2(tensor, data, size);
            break;

+        case GGML_TYPE_IQ4_NL:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16])
+            repack_q4_0_q4x4x2(tensor, data, size);
+            break;
+
        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@ -1442,6 +1449,12 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
            repack_q8x4x2_q8_0(data, tensor, size);
            break;

+        case GGML_TYPE_IQ4_NL:
+            GGML_ASSERT(offset == 0);
+            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
+            repack_q4x4x2_q4_0(data, tensor, size);
+            break;
+
        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
@ -1819,6 +1832,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
            if (src0->ne[0] % 32) {
                return false;
@ -1868,6 +1882,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
        case GGML_TYPE_MXFP4:
            if ((src0->ne[0] % 32)) {
                return false;
@ -2596,8 +2611,26 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) {
    delete backend;
 }

+// Map weight type to its activation quantization family.
+// Types in the same family produce identical Q8 formats in VTCM and can
+// safely share quantized activation data via SKIP_QUANTIZE.
+// When adding a new quantized type, assign it the correct family here.
+//
+// Returns a non-zero family id for the listed types and 0 for everything else.
+// op_reuse_src1() treats 0 as "never reuse", so an unlisted type always gets
+// its activations re-quantized.
+static inline int act_quant_family(enum ggml_type wtype) {
+    switch (wtype) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_MXFP4:
+            return 1;  // Q8x4x2
+        default:
+            return 0;  // unknown / not quantized
+    }
+}
+
 static inline bool op_reuse_src1(const ggml_tensor * op1, const ggml_tensor * op0) {
-    return (op0 && op0->src[1] == op1->src[1] && ggml_is_quantized(op0->src[0]->type));
+    return (op0 && op0->src[1] == op1->src[1] &&
+            act_quant_family(op0->src[0]->type) == act_quant_family(op1->src[0]->type) &&
+            act_quant_family(op0->src[0]->type) != 0);
 }

 static inline bool is_compute_op(ggml_tensor *node)
@ -3364,6 +3397,8 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
                  "please update hexagon_type to match ggml_type");
    static_assert((unsigned int) HTP_TYPE_MXFP4 == (unsigned int) GGML_TYPE_MXFP4,
                  "please update hexagon_type to match ggml_type");
+    static_assert((unsigned int) HTP_TYPE_IQ4_NL == (unsigned int) GGML_TYPE_IQ4_NL,
+                  "please update hexagon_type to match ggml_type");

    const char * str_experimental = getenv("GGML_HEXAGON_EXPERIMENTAL");
    const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
--- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c
@ -346,6 +346,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *

    const HVX_Vector logit_cap = hvx_vec_splat_f32(factx->logit_softcap);

+    dma_cache m_cache;
+    dma_cache_init(&m_cache, spad_m, factx->size_m_block, DMA_CACHE_MAX_SIZE);
+
    for (uint32_t ir = ir0; ir < ir1; ++ir) {
        const uint32_t iq3 = fastdiv(ir, &factx->src0_div21);
        const uint32_t iq2 = fastdiv(ir - iq3*neq2*neq1, &factx->src0_div1);
@ -389,9 +392,8 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
            // Mask
            if (mask) {
                const uint8_t * m_src = (const uint8_t *) (mp_base + ic_start);
-                uint8_t * m_dst = spad_m + (ib % 2) * factx->size_m_block;
                // Mask is 1D contiguous for this row
-                dma_queue_push(dma, dma_make_ptr(m_dst, m_src), current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
+                dma_cache_push(dma, &m_cache, m_src, current_block_size * 2, current_block_size * 2, current_block_size * 2, 1);
            }

            // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
@ -554,7 +556,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void *
                // Mask
                if (mask) {
                    const uint8_t * m_src = (const uint8_t *) (mp_base + next_ic_start);
-                    dma_queue_push(dma, dma_make_ptr(m_base, m_src), next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
+                    dma_cache_push(dma, &m_cache, m_src, next_block_size * 2, next_block_size * 2, next_block_size * 2, 1);
                }

                // FARF(HIGH, "fa %u: prefetch KVM: ir %u ib %u : iq1 %u iq2 %u iq3 %u : size_k_row %u size_v_row %u bs %u: usec %u",
@ -684,7 +686,7 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
    octx->src0_spad.size_per_thread = size_q_block * 1;
    octx->src1_spad.size_per_thread = factx.size_k_block * 2;
    octx->src2_spad.size_per_thread = factx.size_v_block * 2;
-    octx->src3_spad.size_per_thread = mask ? factx.size_m_block * 2 : 0;
+    octx->src3_spad.size_per_thread = mask ? factx.size_m_block * DMA_CACHE_MAX_SIZE : 0;
    octx->dst_spad.size_per_thread  = size_vkq_acc;

    octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads;
@ -705,6 +707,8 @@ int op_flash_attn_ext(struct htp_ops_context * octx) {
    octx->src3_spad.data = octx->src2_spad.data + octx->src2_spad.size;
    octx->dst_spad.data  = octx->src3_spad.data + octx->src3_spad.size;

+    // FARF(ERROR, "fa: qrows-per-thread %u", factx.qrows_per_thread);
+
    if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
        worker_pool_run_func(octx->ctx->worker_pool, flash_attn_ext_f16_thread, &factx, octx->n_threads);
    }
--- a/ggml/src/ggml-hexagon/htp/hex-dma.h
+++ b/ggml/src/ggml-hexagon/htp/hex-dma.h
@ -143,7 +143,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t
    desc->desc_size  = 0; // 1D mode
    desc->src_bypass = dma_src_l2_bypass_on;
    desc->dst_bypass = dma_dst_l2_bypass_on;
-    desc->order      = 1;
+    desc->order      = 0;
    desc->done       = 0;
    desc->src        = (void *) dptr.src;
    desc->dst        = (void *) dptr.dst;
@ -151,8 +151,12 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t

    q->dptr[q->push_idx] = dptr;

-    dmlink(q->tail, desc);
-    q->tail = (dma_descriptor_2d *) desc;
+    if (size) {
+        dmlink(q->tail, desc);
+        q->tail = (dma_descriptor_2d *) desc;
+    } else {
+        desc->done = 1;
+    }

    // FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
    q->push_idx = (q->push_idx + 1) & q->idx_mask;
@ -175,7 +179,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t
    desc->dst_bypass     = dma_dst_l2_bypass_on;
    desc->src_comp       = 0;
    desc->dst_comp       = 0;
-    desc->order          = 1;
+    desc->order          = 0;
    desc->done           = 0;
    desc->src_stride     = src_stride;
    desc->dst_stride     = dst_stride;
@ -197,8 +201,12 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t

    q->dptr[q->push_idx] = dptr;

-    dmlink(q->tail, desc);
-    q->tail = desc;
+    if (nrows) {
+        dmlink(q->tail, desc);
+        q->tail = desc;
+    } else {
+        desc->done = 1;
+    }

    // FARF(ERROR, "dma-push: i %u row-size %u nrows %d dst %p src %p\n", q->push_idx, row_size, nrows, dptr.dst, dptr.src);
    q->push_idx = (q->push_idx + 1) & q->idx_mask;
@ -215,12 +223,9 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) {
    dma_descriptor_2d * desc = &q->desc[q->pop_idx];

    // Wait for desc to complete
-    while (1) {
-        dmpoll();
-        if (desc->done) {
-            break;
-        }
+    while (!desc->done) {
        // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx);
+        dmpoll();
    }

    dptr = q->dptr[q->pop_idx];
@ -312,6 +317,54 @@ static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, dma_ptr dptr, size_
    return dma_queue_push(q, dptr, dst_row_size, src_row_size, dst_row_size, nrows);
 }

+#define DMA_CACHE_MAX_SIZE 64U
+
+// Small software cache of DMA destinations, keyed by source address.
+// Line i lives at base + i * line_size; see dma_cache_push() for lookup and replacement.
+typedef struct {
+    uint8_t *base;                     // start of the destination buffer that holds all lines
+    uint32_t line_size;                // byte distance between consecutive lines
+    uint32_t capacity;                 // number of lines in use (clamped to DMA_CACHE_MAX_SIZE by dma_cache_init)
+    uint32_t src[DMA_CACHE_MAX_SIZE];  // source address currently held by each line, stored as uint32_t (0 after init)
+    uint16_t age[DMA_CACHE_MAX_SIZE];  // pushes since the line was last hit or replaced
+} dma_cache;
+
+// Initialise cache c over the buffer at base.
+//   line_size: byte size reserved for each line
+//   capacity : requested number of lines; silently clamped to DMA_CACHE_MAX_SIZE
+// The src/age entries of the first `capacity` lines are zeroed; nothing is allocated.
+// NOTE(review): the caller is expected to provide at least capacity * line_size bytes at base — confirm at call sites.
+static inline void dma_cache_init(dma_cache *c, uint8_t *base, uint32_t line_size, uint32_t capacity)
+{
+    c->capacity  = (capacity > DMA_CACHE_MAX_SIZE) ? DMA_CACHE_MAX_SIZE : capacity;
+    c->base      = base;
+    c->line_size = line_size;
+
+    // src == 0 marks a line that has not been filled yet.
+    for (unsigned i=0; i < c->capacity; i++) {
+        c->src[i] = 0;
+        c->age[i] = 0;
+    }
+}
+
+// Queue a DMA of `src` into the cache, skipping the actual transfer when src is already cached.
+//   hit : the matching line becomes the destination and nrows is forced to 0, so the queue
+//         entry is a placeholder ("dummy dma") that still yields a dst pointer when popped.
+//   miss: the line with the largest age is replaced and a normal transfer with the
+//         caller's nrows is queued into it.
+// The remaining arguments are passed through to dma_queue_push(); its result is returned.
+// NOTE(review): lines are matched by source address only (cast to uint32_t, which assumes
+// 32-bit pointers); stale data is returned if the memory behind a cached address changed — confirm callers.
+// NOTE(review): age is uint16_t and is incremented without a bound, so it can wrap for a
+// line that is neither hit nor replaced for a long time.
+static inline bool dma_cache_push(dma_queue *q, dma_cache *c, const uint8_t * src, uint32_t dst_stride, uint32_t src_stride, uint32_t row_size, uint32_t nrows)
+{
+    uint32_t o_idx = 0;   // index of the oldest line seen so far (replacement candidate)
+    uint16_t o_age = 0;   // its age
+    uint8_t *  dst = 0;   // stays null unless a line already holds src
+
+    // Single pass: a hit resets that line's age, every other line ages by one.
+    // The loop does not stop at a hit, so the remaining lines keep ageing.
+    for (unsigned i=0; i < c->capacity; i++) {
+        if (c->src[i] == (uint32_t) src) {
+            c->age[i] = 0;
+            dst = c->base + (i * c->line_size); nrows = 0; // dummy dma
+            // FARF(ERROR, "dma-cache: found %p", src);
+        } else {
+            c->age[i]++;
+            // Strict '>' keeps the lowest index among equally old lines.
+            if (c->age[i] > o_age) { o_age = c->age[i]; o_idx = i; }
+        }
+    }
+    if (!dst) {
+        // FARF(ERROR, "dma-cache: replacing #%u : age %u %p -> %p", o_idx, c->age[o_idx], (void *) c->src[o_idx], src);
+        c->age[o_idx] = 0;
+        c->src[o_idx] = (uint32_t) src;
+        dst = c->base + o_idx * c->line_size; // normal nrows dma
+    }
+
+    return dma_queue_push(q, dma_make_ptr(dst, src), dst_stride, src_stride, row_size, nrows);
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@ -30,6 +30,12 @@ static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
    -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0,
 };

+// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value
+// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6
+static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
+    0, 0, 0.5, 0, 1, 0, 1.5, 0, 2, 0, 3, 0, 4, 0, 6, 0, 0, 0, -0.5, 0, -1, 0, -1.5, 0, -2, 0, -3, 0, -4, 0, -6, 0,
+};
+
 static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
    -127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0,
    1,    0, 13,   0, 25,  0, 38,  0, 53,  0, 69,  0, 89,  0, 113, 0,
@ -46,7 +52,8 @@ static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned

 // Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
 #define HMX_X4X2_SCALES_PER_BLK  8
-#define HMX_X4X2_DBLK_SIZE       16  // 8 * 2 bytes
+#define HMX_X4X2_DBLK_SIZE       16  // 8 * 2 bytes (fp16 scales for Q4_0/Q8_0/IQ4_NL)
+#define HMX_X4X2_MXFP4_EBLK_SIZE 8   // 8 * 1 byte  (E8M0 scales for MXFP4)

 static inline void swap_ptr(void **p1, void **p2) {
    void *t = *p1;
@ -78,9 +85,11 @@ static inline size_t get_x4x2_row_stride(int weight_type, int k) {
    switch (weight_type) {
        case HTP_TYPE_Q4_0:
        case HTP_TYPE_IQ4_NL:
-            return (size_t)nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE);  // 144 * nb
+            return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE);         // 144 * nb
        case HTP_TYPE_Q8_0:
-            return (size_t)nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE);      // 272 * nb
+            return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE);             // 272 * nb
+        case HTP_TYPE_MXFP4:
+            return (size_t) nb * (QK_MXFP4x4x2 / 2 + HMX_X4X2_MXFP4_EBLK_SIZE);  // 136 * nb
        default:
            return 0;
    }
@ -284,6 +293,87 @@ static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(
    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales));
 }

+// --- MXFP4 E8M0 scale conversion and dequantization ---
+//
+// HVX batch-convert 8 E8M0 bytes (one x4x2 block's scales) to __fp16[8] on stack.
+// Scalar loads from the stack array execute on the scalar pipeline, in parallel
+// with HVX vlut16/vmpy/vscatter — freeing HVX slots in the hot loop.
+// Arithmetic: fp16_bits = clamp(e - 112, 0, 30) << 10
+// e=0..112 -> 0 (underflow), e=113..142 -> valid fp16, e>=143 -> clamped to 2^15.
+
+// The 8 scales of one x4x2 MXFP4 block after conversion to fp16, 16-byte aligned.
+// Filled by mxfp4_convert_scales() and read element-wise by mxfp4_extract_splat().
+typedef struct {
+    __fp16 v[8] __attribute__((aligned(16)));
+} mxfp4_scales_t;
+
+// Convert the 8 E8M0 scale bytes at e8m0_8 into 8 fp16 values, returned by value.
+// Each byte e becomes the fp16 bit pattern clamp(e - 112, 0, 30) << 10, i.e. only the
+// exponent field is set.
+// NOTE(review): hvx_vmemu looks like a full-vector unaligned load, which would read beyond
+// the 8 scale bytes — confirm the source buffer is padded/readable past this point.
+static inline mxfp4_scales_t mxfp4_convert_scales(const uint8_t * e8m0_8) {
+    mxfp4_scales_t s;
+    HVX_Vector     v  = hvx_vmemu(e8m0_8);
+    // Widen unsigned bytes to halfwords; the low vector of the pair is kept for the scales.
+    HVX_Vector     vh = Q6_V_lo_W(Q6_Wuh_vunpack_Vub(v));
+    // Re-bias the exponent: subtract 112, then clamp to [0, 30].
+    vh                = Q6_Vh_vsub_VhVh(vh, Q6_Vh_vsplat_R(112));
+    vh                = Q6_Vh_vmax_VhVh(vh, Q6_V_vzero());
+    vh                = Q6_Vh_vmin_VhVh(vh, Q6_Vh_vsplat_R(30));
+    // Shift the exponent into place (bits 10 and up of the halfword).
+    vh                = Q6_Vh_vasl_VhR(vh, 10);
+    // Store 16 bytes = 8 halfwords into the result struct.
+    hvx_vec_store_u(s.v, 16, vh);
+    return s;
+}
+
+// Broadcast scale number idx of `scales` into an fp16 vector via hvx_vec_splat_f16.
+// idx is not range-checked; callers must pass 0..7.
+static inline HVX_Vector mxfp4_extract_splat(mxfp4_scales_t scales, int idx) {
+    return hvx_vec_splat_f16(scales.v[idx]);
+}
+
+// Dequantize one x4x2 MXFP4 group (32 elements from 32 packed bytes) -> 32 FP16.
+//   packed_32    : start of the packed bytes for this group
+//   upper_nibbles: true -> use the high 4 bits of every byte, false -> the low 4 bits
+//   sub_blk      : index (0..7) of the scale in `scales` applied to this group
+//   vlut_cvt     : 4-bit code -> fp16 lookup table, passed to vlut16
+//   scales       : fp16 scales of the enclosing x4x2 block
+// Returns a vector of fp16 products (table value * scale).
+static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t *  packed_32,
+                                                         bool             upper_nibbles,
+                                                         int              sub_blk,
+                                                         const HVX_Vector vlut_cvt,
+                                                         mxfp4_scales_t   scales) {
+    HVX_Vector       vq       = hvx_vmemu(packed_32);
+    const HVX_Vector mask_h4  = Q6_Vb_vsplat_R(0x0F);
+    // Select the requested nibble of each byte, then mask to 4 bits.
+    HVX_Vector       v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    v_quants                  = Q6_V_vand_VV(v_quants, mask_h4);
+
+    HVX_Vector v_sc = mxfp4_extract_splat(scales, sub_blk);
+
+    // Byte shuffle ahead of the lookup; presumably orders the indices so the low half of
+    // the vlut16 result holds this group's values in sequence — TODO confirm.
+    v_quants            = Q6_Vb_vshuff_Vb(v_quants);
+    HVX_VectorPair vp   = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector     v_hf = Q6_V_lo_W(vp);
+
+    // Multiply in qf16 and convert the result back to IEEE fp16.
+    return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_sc));
+}
+
+// Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes).
+//   packed_128   : start of the packed bytes covering the 4 groups
+//   upper_nibbles: true -> use the high 4 bits of every byte, false -> the low 4 bits
+//   sub_blk_base : index of the first of 4 consecutive scales in `scales` (callers pass 0 or 4)
+//   vlut_cvt     : 4-bit code -> fp16 lookup table, passed to vlut16
+//   scales       : fp16 scales of the enclosing x4x2 block
+//   out          : receives 4 vectors, one per group; the caller consumes the group's values
+//                  from the low 64 bytes of each (presumably — the scatter mask is set outside this function)
+static inline void dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t *  packed_128,
+                                                      bool             upper_nibbles,
+                                                      int              sub_blk_base,
+                                                      const HVX_Vector vlut_cvt,
+                                                      mxfp4_scales_t   scales,
+                                                      HVX_Vector       out[4]) {
+    HVX_Vector       vq       = hvx_vmemu(packed_128);
+    const HVX_Vector mask_h4  = Q6_Vb_vsplat_R(0x0F);
+    // Select the requested nibble of each byte, then mask to 4 bits.
+    HVX_Vector       v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    v_quants                  = Q6_V_vand_VV(v_quants, mask_h4);
+
+    v_quants = Q6_Vb_vshuff_Vb(v_quants);
+
+    // One lookup for all 128 codes: the result pair holds groups 0/1 in lo and 2/3 in hi.
+    HVX_VectorPair vp   = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0);
+    HVX_Vector     v_lo = Q6_V_lo_W(vp);
+    HVX_Vector     v_hi = Q6_V_hi_W(vp);
+
+    // Build per-half scale vectors: the first 64 bytes take the first scale of the pair,
+    // the remaining bytes take the second.
+    HVX_VectorPred q64    = Q6_Q_vsetq_R(64);
+    HVX_Vector     v_sc01 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 0),
+                                          mxfp4_extract_splat(scales, sub_blk_base + 1));
+    HVX_Vector     v_sc23 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 2),
+                                          mxfp4_extract_splat(scales, sub_blk_base + 3));
+
+    v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01));
+    v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
+
+    // Rotating by 64 bytes brings the second group of each vector into the low half.
+    out[0] = v_lo;
+    out[1] = Q6_V_vror_VR(v_lo, 64);
+    out[2] = v_hi;
+    out[3] = Q6_V_vror_VR(v_hi, 64);
+}
+
 // Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16.
 // Input:  vtcm_src has n_cols rows of x4x2 data, each row_stride bytes.
 // Output: vtcm_dst in tile-major FP16 layout.
@ -295,11 +385,11 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        int start_tile, int end_tile) {

    const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
-    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
-    const int qrow_size = is_q4 ? (k_block / 2) : k_block;
+    const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);

-    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL)
-        ? hvx_vmem(iq4_nl_to_fp16_lut) : hvx_vmem(q4_0_to_fp16_lut);
+    const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
+                                (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
+                                                                   hvx_vmem(q4_0_to_fp16_lut);

    // vscatter setup: write dequantized K-values directly to transposed [K][N] tile positions.
    // Each int32 element holds a K-row-pair (2 adjacent fp16 values).  word[i] at offset i*128
@ -312,8 +402,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
        int ct = t / n_k_tiles;  // column tile index
        int kt = t % n_k_tiles;  // K tile index

-        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
-        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+        // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
+        if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
+            ((t + 3) / n_k_tiles == ct)) {
            int blk_idx      = (kt * 32) / QK_Q4_0x4x2;
            int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
            bool upper       = (sub_blk_base >= 4);
@ -351,10 +442,60 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
            continue;
        }

+        // --- Batch-4 fast path for MXFP4: same nibble layout but E8M0 scales ---
+        if (weight_type == HTP_TYPE_MXFP4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+            int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
+            int  sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32;                 // 0 or 4
+            bool upper        = (sub_blk_base >= 4);
+            int  packed_off   = blk_idx * (QK_MXFP4x4x2 / 2);                    // 128 contiguous packed bytes
+            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;  // all 8 E8M0 scales
+
+            __fp16 * tile_bases[4];
+            for (int g = 0; g < 4; g++) {
+                tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS;
+            }
+
+            HVX_Vector v_off = v_scat_base;
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
+                int             row0 = ct * HMX_FP16_TILE_N_COLS + r;
+                int             row1 = row0 + 1;
+                const uint8_t * r0   = vtcm_src + row0 * row_stride;
+                const uint8_t * r1   = vtcm_src + row1 * row_stride;
+
+                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
+                mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
+
+                HVX_Vector v0[4], v1[4];
+                dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8, v0);
+                if (row1 < n_cols) {
+                    mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
+                    dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8, v1);
+                } else {
+                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
+                }
+
+                for (int g = 0; g < 4; g++) {
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]);
+                }
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                for (int g = 0; g < 4; g++) {
+                    Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]);
+                }
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+            }
+
+            for (int g = 0; g < 4; g++) {
+                (void) *(volatile HVX_Vector *) (tile_bases[g]);
+            }
+
+            t += 4;
+            continue;
+        }
+
        // --- Single-tile fallback ---
        __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;

-        if (is_q4) {
+        if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
            int blk_idx  = (kt * 32) / QK_Q4_0x4x2;
            int sub_blk  = ((kt * 32) % QK_Q4_0x4x2) / 32;
            bool upper   = (sub_blk >= 4);
@ -382,6 +523,39 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
            }
            (void) *(volatile HVX_Vector *)(tile_base);
+        } else if (weight_type == HTP_TYPE_MXFP4) {
+            int  blk_idx      = (kt * 32) / QK_MXFP4x4x2;
+            int  sub_blk      = ((kt * 32) % QK_MXFP4x4x2) / 32;
+            bool upper        = (sub_blk >= 4);
+            int  byte_off     = blk_idx * (QK_MXFP4x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
+            int  e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE;
+
+            HVX_Vector v_off = v_scat_base;
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
+                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
+                int row1 = row0 + 1;
+
+                const uint8_t * r0 = vtcm_src + row0 * row_stride;
+                const uint8_t * r1 = vtcm_src + row1 * row_stride;
+
+                // Batch-convert all 8 E8M0 scales once per row (stays in HVX register)
+                mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off);
+
+                HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8);
+                HVX_Vector v1;
+                if (row1 < n_cols) {
+                    mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off);
+                    v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8);
+                } else {
+                    v1 = Q6_V_vzero();
+                }
+
+                Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0);
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+                Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1);
+                v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
+            }
+            (void) *(volatile HVX_Vector *) (tile_base);
        } else {
            // Q8_0
            int blk_idx  = (kt * 32) / QK_Q8_0x4x2;
@ -1455,21 +1629,24 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
                {
                    qweight_fetch_task_state_t s;

-                    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
                    const int blk_start = kk / QK_Q4_0x4x2;
                    const int nb_sub = (k_blk_sz + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2;
-                    const int full_qrow = is_q4 ? (k / 2) : k;
+                    const int    full_qrow      = (weight_type == HTP_TYPE_Q8_0) ? k : (k / 2);
                    const size_t sub_row_stride = get_x4x2_row_stride(weight_type, k_blk_sz);
+                    const int    scale_blk_size =
+                        (weight_type == HTP_TYPE_MXFP4) ? HMX_X4X2_MXFP4_EBLK_SIZE : HMX_X4X2_DBLK_SIZE;

                    s.dst         = vtcm_scratch0;
                    s.src         = w + nc * row_stride;
                    s.n_rows      = n_blk_sz;
                    s.src_stride  = row_stride;
                    s.dst_stride  = sub_row_stride;
-                    s.quant_off   = is_q4 ? (blk_start * (QK_Q4_0x4x2 / 2)) : (blk_start * QK_Q8_0x4x2);
-                    s.quant_width = is_q4 ? (nb_sub    * (QK_Q4_0x4x2 / 2)) : (nb_sub * QK_Q8_0x4x2);
-                    s.scale_off   = full_qrow + blk_start * HMX_X4X2_DBLK_SIZE;
-                    s.scale_width = nb_sub * HMX_X4X2_DBLK_SIZE;
+                    s.quant_off =
+                        (weight_type == HTP_TYPE_Q8_0) ? (blk_start * QK_Q8_0x4x2) : (blk_start * (QK_Q4_0x4x2 / 2));
+                    s.quant_width =
+                        (weight_type == HTP_TYPE_Q8_0) ? (nb_sub * QK_Q8_0x4x2) : (nb_sub * (QK_Q4_0x4x2 / 2));
+                    s.scale_off   = full_qrow + blk_start * scale_blk_size;
+                    s.scale_width = nb_sub * scale_blk_size;

                    // 2D DMA: quants sub-range
                    dma_queue_push(ctx->dma[0], dma_make_ptr(s.dst, s.src + s.quant_off),
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@ -31,6 +31,12 @@ struct htp_context {

    uint32_t opmask;

+    // Cached src1 spad position from the last quantize pass.
+    // When SKIP_QUANTIZE is set the Q8 activation data is already in VTCM
+    // at this address; the matmul must read from here instead of recomputing
+    // the offset (which depends on the current op's src0 size).
+    uint8_t * prev_src1_spad;
+
    // HMX acceleration fields (v73+, enabled by compile-time HTP_HAS_HMX)
 #ifdef HTP_HAS_HMX
    int        hmx_enabled;       // Runtime flag: HMX initialisation succeeded
--- a/Show More
+++ b/Show More