codeowners : use teams (#20526 )

* use teams * update * update * update * update * update
ci : split build.yml + server.yml (#20546 )
2026-03-15 14:26:10 +01:00 · 2026-03-15 15:11:17 +02:00 · 2026-03-15 12:15:12 +01:00 · 2026-03-15 10:47:28 +02:00 · 2026-03-15 08:33:39 +01:00 · 2026-03-15 08:18:54 +01:00
793 changed files with 76116 additions and 40476 deletions
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@ -0,0 +1,138 @@
 # Pinned OpenVINO release: MAJOR selects the package directory on the download
 # server, FULL is the exact build identifier embedded in the archive name.
 ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 # Ubuntu tag used for the base image of both the build and the runtime stages
 ARG UBUNTU_VERSION=24.04
 # Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=
 ## Build Image
 FROM ubuntu:${UBUNTU_VERSION} AS build
 # Pass proxy args to build stage
 ARG http_proxy
 ARG https_proxy
 # Build toolchain (git, cmake, ninja, compilers) plus TBB, OpenSSL headers and
 # the OpenCL headers / ICD packages; the apt lists are removed in the same layer.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        gnupg \
        wget \
        git \
        cmake \
        ninja-build \
        build-essential \
        libtbb12 \
        libssl-dev \
        ocl-icd-opencl-dev \
        opencl-headers \
        opencl-clhpp-headers \
        intel-opencl-icd && \
    rm -rf /var/lib/apt/lists/*
 # Install OpenVINO for Ubuntu 24.04
 # ARGs declared before the first FROM must be re-declared to be usable inside a stage
 ARG OPENVINO_VERSION_MAJOR
 ARG OPENVINO_VERSION_FULL
 # Download and unpack the toolkit archive, move it to /opt/intel/openvino_<major>,
 # run its dependency installer (answering "Y" to the prompt) and expose it under
 # the version-independent symlink /opt/intel/openvino.
 RUN mkdir -p /opt/intel && \
    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
    cd - && \
    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
 # Points at the symlink created above; read by the build and library-copy steps below
 ENV OpenVINO_DIR=/opt/intel/openvino
 WORKDIR /app
 # Copies the whole build context into /app
 COPY . .
 # Build Stage
 # bash is invoked explicitly because setupvars.sh is loaded with `source`;
 # the build tree is build/ReleaseOV, which the later copy steps rely on.
 RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
    cmake -B build/ReleaseOV -G Ninja \
        -DCMAKE_BUILD_TYPE=Release \
        -DGGML_OPENVINO=ON && \
    cmake --build build/ReleaseOV -j$(nproc)"
 # Copy all necessary libraries
 # Collects the freshly built shared objects and the OpenVINO runtime libraries
 # (symlinks preserved via cp -P) into /app/lib.
 # The two OpenVINO lookups are grouped in a subshell so that the second path is
 # tried only when the first OpenVINO location cannot be searched. Without the
 # grouping, a failure of mkdir or of the build-tree copy would also fall through
 # to the fallback and could be masked by its success.
 RUN mkdir -p /app/lib && \
    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
    ( find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
      find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; )
 # Create runtime directories and copy binaries
 # Everything the "full" image ships is staged under /app/full: the built
 # binaries, the top-level Python scripts, gguf-py, the requirements files and
 # tools.sh.
 RUN mkdir -p /app/full && \
    cp build/ReleaseOV/bin/* /app/full/ && \
    cp *.py /app/full && \
    cp -r gguf-py /app/full && \
    cp -r requirements /app/full && \
    cp requirements.txt /app/full && \
    cp .devops/tools.sh /app/full/tools.sh
 ## Base Runtime Image
 FROM ubuntu:${UBUNTU_VERSION} AS base
 # Pass proxy args to runtime stage
 ARG http_proxy
 ARG https_proxy
 # Runtime dependencies shared by every final image; curl is what the server
 # image's HEALTHCHECK invokes. apt-get is used for all steps (rather than apt,
 # which warns that its CLI is not stable for scripts), matching the "full" stage.
 # Package caches and temp files are removed in the same layer.
 RUN apt-get update \
    && apt-get install -y libgomp1 libtbb12 curl \
    && apt-get autoremove -y \
    && apt-get clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete
 # The shared libraries collected by the build stage are placed directly in /app
 COPY --from=build /app/lib/ /app/
 ### Full (all binaries)
 FROM base AS full
 ARG http_proxy
 ARG https_proxy
 # Binaries, Python scripts, requirements and tools.sh staged by the build stage
 COPY --from=build /app/full /app/
 WORKDIR /app
 # Install git and Python, create a virtualenv at /ov-venv and install the
 # project's requirements.txt into it; apt caches and temp files are cleaned
 # up in the same layer.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    git \
    python3 \
    python3-venv \
    python3-pip && \
    python3 -m venv /ov-venv && \
    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /tmp/* /var/tmp/* && \
    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
    find /var/cache -type f -delete
 # Activate the virtualenv, then exec tools.sh with the container arguments.
 # The trailing "--" fills $0 of the bash -c script so that "$@" carries only
 # the arguments passed to the container.
 ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
 ### Light, CLI only
 # Runtime base plus the single llama-cli binary, which is also the entrypoint
 FROM base AS light
 COPY --from=build /app/full/llama-cli /app/
 WORKDIR /app
 ENTRYPOINT ["/app/llama-cli"]
 ### Server, Server only
 # Runtime base plus the llama-server binary; LLAMA_ARG_HOST is preset to
 # 0.0.0.0 and the health check probes /health on localhost:8080 with curl.
 FROM base AS server
 ENV LLAMA_ARG_HOST=0.0.0.0
 COPY --from=build /app/full/llama-server /app/
 WORKDIR /app
 HEALTHCHECK CMD ["curl", "-f", "http://localhost:8080/health"]
 ENTRYPOINT ["/app/llama-server"]
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@ -53,10 +53,11 @@ RUN apt-get update \
    && apt-get install -y \
    build-essential \
    git \
-    python3 \
+    python3.13 \
-    python3-dev \
+    python3.13-dev \
    python3-pip \
    python3-wheel \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
--- a/.github/actions/linux-setup-openvino/action.yml
+++ b/.github/actions/linux-setup-openvino/action.yml
@ -0,0 +1,25 @@
 # Composite action: downloads the OpenVINO toolkit archive for the requested
 # version and unpacks it into the given path via the unarchive-tar action.
 name: 'Linux - Setup OpenVINO Toolkit'
 description: 'Setup OpenVINO Toolkit for Linux'
 inputs:
  # Directory the archive is extracted into
  path:
    description: 'Installation path'
    required: true
  # Selects the package directory in the download URL
  version_major:
    description: 'OpenVINO major version (e.g., 2025.3)'
    required: true
  # Exact build identifier embedded in the archive file name
  version_full:
    description: 'OpenVINO full version (e.g., 2025.3.0.19807.44526285f24)'
    required: true
 runs:
  using: 'composite'
  steps:
    - name: Setup OpenVINO Toolkit
      id: setup
      uses: ./.github/actions/unarchive-tar
      with:
        url: https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/linux/openvino_toolkit_ubuntu24_${{ inputs.version_full }}_x86_64.tgz
        path: ${{ inputs.path }}
        type: z
        strip: 1
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@ -0,0 +1,57 @@
 # CI for third-party integrations: runs on manual dispatch and on pushes to
 # master that touch this workflow file or C/C++/CMake sources.
 name: CI (3rd-party)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # '**/*.cmake' matches every CMake module in the tree (a bare '**/.cmake'
    # would only match files literally named ".cmake").
    paths: [
      '.github/workflows/build-3rd-party.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]
 # When head_ref is set (pull requests) the group is keyed by the ref, so a newer
 # run cancels the older one; otherwise the unique run id keeps runs independent.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # Configures with -DLLAMA_LLGUIDANCE=ON and fatal warnings, builds, then runs
  # the ctest suite labelled "main".
  ubuntu-24-llguidance:
    # NOTE: `||` yields its first truthy operand, so this expression always
    # selects 'ubuntu-24.04-arm'; the second literal is presumably kept as a
    # quick manual switch.
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Dependencies
        id: depends
        # -y keeps apt-get non-interactive if it has to pull in additional packages
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libssl-dev
      - name: Build
        id: cmake_build
        # NOTE(review): no CMAKE_BUILD_TYPE is set at configure time; with a
        # single-config generator `--config Release` has no effect - confirm intent.
        run: |
          cmake -B build \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_LLGUIDANCE=ON
          cmake --build build --config Release -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@ -0,0 +1,140 @@
 # Android CI: runs on manual dispatch, on pushes to master that touch this
 # workflow or C/C++/CMake sources, and on pull requests that touch this
 # workflow or the Android example.
 name: CI (android)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # '**/*.cmake' matches every CMake module in the tree (a bare '**/.cmake'
    # would only match files literally named ".cmake").
    paths: [
      '.github/workflows/build-android.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-android.yml',
      'examples/llama.android/**'
    ]
 # When head_ref is set (pull requests) the group is keyed by the ref, so a newer
 # run cancels the older one; otherwise the unique run id keeps runs independent.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # Builds the examples/llama.android Gradle project with JDK 17 (zulu) and the
  # Android SDK set up by android-actions/setup-android.
  android:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v6
      # Disabled due to size (400MB) and always 0 cache hits
      # - name: ccache
      #   uses: ggml-org/ccache-action@v1.2.16
      #   with:
      #     key: android-build
      #     evict-old-files: 1d
      - name: Set up JDK
        uses: actions/setup-java@v5
        with:
          java-version: 17
          distribution: zulu
      - name: Setup Android SDK
        uses: android-actions/setup-android@v3
        with:
          log-accepted-android-sdk-licenses: false
      # --no-daemon: the Gradle daemon is not kept alive after the build
      - name: Build
        run: |
          cd examples/llama.android
          ./gradlew build --no-daemon
  # NDK cross-builds driven by a two-entry matrix: a plain arm64 CPU build that
  # passes its CMake defines inline, and a Snapdragon build that uses the
  # 'arm64-android-snapdragon-release' preset. The OpenCL, Hexagon SDK and preset
  # steps run only for the Snapdragon entry.
  android-ndk:
    runs-on: ubuntu-latest
    env:
      # Release tag shared by the three Khronos OpenCL repositories downloaded below
      OPENCL_VERSION: 2025.07.22
    strategy:
      matrix:
        include:
          - build: 'arm64-cpu'
            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
          - build: 'arm64-snapdragon'
            defines: '--preset arm64-android-snapdragon-release'
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      # Downloads the OpenCL headers, C++ bindings and ICD loader sources, copies
      # the headers into the NDK sysroot, cross-builds libOpenCL.so for arm64-v8a
      # and installs it into the sysroot's aarch64 library directory.
      - name: Install OpenCL Headers and Libs
        id: install_opencl
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        run: |
          mkdir opencl
          curl -L -o opencl/clhpp.tar.gz      https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          curl -L -o opencl/headers.tar.gz    https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
          tar -xaf opencl/headers.tar.gz    -C opencl
          tar -xaf opencl/clhpp.tar.gz      -C opencl
          tar -xaf opencl/icd-loader.tar.gz -C opencl
          sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL         ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
          sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
          cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
          cmake --build build
          sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
          rm -rf opencl
      # Unpacks the Hexagon SDK release into /opt/hexagon and exports the SDK/tool
      # locations and DEFAULT_* settings to later steps through GITHUB_ENV.
      # NOTE(review): the archive is fetched from a .tar.xz URL but saved under a
      # .tar.gz name; extraction presumably relies on tar detecting the
      # compression from the file contents - confirm.
      - name: Install Hexagon SDK
        id: install_hexsdk
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        env:
          HEXSDK_VER: 6.4.0.2
          HEXTLS_VER: 19.0.04
        run: |
          curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
          mkdir hex-sdk
          tar -xaf hex-sdk.tar.gz -C hex-sdk
          ls -l hex-sdk
          sudo mv hex-sdk /opt/hexagon
          echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER"                                     >> "$GITHUB_ENV"
          echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER"   >> "$GITHUB_ENV"
          echo "DEFAULT_HLOS_ARCH=64"                                                          >> "$GITHUB_ENV"
          echo "DEFAULT_TOOLS_VARIANT=toolv19"                                                 >> "$GITHUB_ENV"
          echo "DEFAULT_NO_QURT_INC=0"                                                         >> "$GITHUB_ENV"
          echo "DEFAULT_DSP_ARCH=v73"                                                          >> "$GITHUB_ENV"
      # Copies the Snapdragon CMakeUserPresets.json into the repository root,
      # where the --preset argument of the Snapdragon matrix entry looks for it
      - name: Update CMake presets
        id: update_presets
        if: ${{ matrix.build == 'arm64-snapdragon' }}
        run: |
          cp docs/backend/snapdragon/CMakeUserPresets.json .
      # Configure with the matrix entry's defines, build, and install into pkg-adb/llama.cpp
      - name: Build
        id: ndk_build
        run: |
          cmake ${{ matrix.defines }} -B build
          cmake --build build
          cmake --install build --prefix pkg-adb/llama.cpp
      # Placeholder: no tests are executed yet
      - name: Test
        id: cmake_test
        run: |
          echo "FIXME: test on devices"
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@ -0,0 +1,214 @@
 # Apple platform CI: runs on manual dispatch, on pushes to master that touch
 # this workflow or C/C++/CMake/Swift/Objective-C/Metal sources, and on pull
 # requests that touch this workflow or the Metal backend.
 name: CI (apple)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # '**/*.cmake' matches every CMake module in the tree (a bare '**/.cmake'
    # would only match files literally named ".cmake").
    paths: [
      '.github/workflows/build-apple.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.swift',
      '**/*.m',
      '**/*.metal'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-apple.yml',
      'ggml/src/ggml-metal/**'
    ]
 # When head_ref is set (pull requests) the group is keyed by the ref, so a newer
 # run cancels the older one; otherwise the unique run id keeps runs independent.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # iOS build through the Xcode generator: common, examples, tools, tests and
  # server are all switched off and code signing is disabled for the build.
  macOS-latest-ios:
    runs-on: macos-latest
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      # The cache is saved only for pushes to master
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-ios
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      # `sysctl -a` dumps the runner's system information into the log before building
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
  # iOS build with the latest stable Xcode, followed by the xcframework build,
  # an artifact upload and a build of the llama.swiftui example project.
  # The job id is referenced by the `needs:` of macOS-latest-swift.
  macos-latest-ios-xcode:
    runs-on: macos-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
      - name: Setup Xcode
        uses: ggml-org/setup-xcode@v1
        with:
          xcode-version: latest-stable
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=iOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
          ./build-xcframework.sh
      # Published under the name 'llama-xcframework' and kept for one day;
      # the macOS-latest-swift job downloads it under the same name.
      - name: Upload xcframework artifact
        uses: actions/upload-artifact@v6
        with:
          name: llama-xcframework
          path: build-apple/llama.xcframework/
          retention-days: 1
      # Downloads the iOS platform, then builds the SwiftUI example without code signing
      - name: Build Xcode project
        run: |
          xcodebuild -downloadPlatform iOS
          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
  # tvOS build through the Xcode generator; same options as the iOS job except
  # for CMAKE_SYSTEM_NAME=tvOS.
  macOS-latest-tvos:
    runs-on: macos-latest
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      # The cache is saved only for pushes to master
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-tvos
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=tvOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
  # visionOS build through the Xcode generator; unlike the iOS/tvOS jobs it has
  # no ccache step and uses a deployment target of 1.0.
  macOS-latest-visionos:
    runs-on: macos-latest
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=visionOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
  # Runs after macos-latest-ios-xcode: downloads the xcframework artifact that
  # job uploaded and builds llama.cpp as a universal (arm64 + x86_64) binary.
  macOS-latest-swift:
    runs-on: macos-latest
    needs: macos-latest-ios-xcode
    strategy:
      matrix:
        # NOTE(review): matrix.destination is not referenced by any step below,
        # so the three matrix entries run identical steps - confirm whether a
        # step consuming it is missing.
        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      # The same cache key is used by every matrix entry; saved only for pushes to master
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      - name: Download xcframework artifact
        uses: actions/download-artifact@v7
        with:
          name: llama-xcframework
          path: build-apple/llama.xcframework/
      - name: Build llama.cpp with CMake
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@ -37,12 +37,39 @@ jobs:
          path: ./vulkan_sdk
          version: ${{ env.VULKAN_SDK_VERSION }}
-  ubuntu-24-spacemit-cache:
+  #ubuntu-24-spacemit-cache:
  #  runs-on: ubuntu-24.04
  #  env:
  #    # Make sure this is in sync with build-linux-cross.yml
  #    SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
  #  steps:
  #    - name: Clone
  #      id: checkout
  #      uses: actions/checkout@v6
  #    - name: Setup Cache
  #      uses: actions/cache@v5
  #      id: cache-toolchain
  #      with:
  #        path: ./spacemit_toolchain
  #        key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
  #    - name: Setup SpacemiT Toolchain
  #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
  #      uses: ./.github/actions/linux-setup-spacemit
  #      with:
  #        path: ./spacemit_toolchain
  #        version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
  ubuntu-24-openvino-cache:
    runs-on: ubuntu-24.04
    env:
-      # Make sure this is in sync with build-linux-cross.yml
+      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+      OPENVINO_VERSION_MAJOR: "2026.0"
      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
    steps:
      - name: Clone
@ -51,17 +78,18 @@ jobs:
      - name: Setup Cache
        uses: actions/cache@v5
-        id: cache-toolchain
+        id: cache-openvino
        with:
-          path: ./spacemit_toolchain
+          path: ./openvino_toolkit
-          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-      - name: Setup SpacemiT Toolchain
+      - name: Setup OpenVINO Toolkit
-        if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-spacemit
+        uses: ./.github/actions/linux-setup-openvino
        with:
-          path: ./spacemit_toolchain
+          path: ./openvino_toolkit
-          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
+          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}
  windows-2022-rocm-cache:
    runs-on: windows-2022
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@ -0,0 +1,102 @@
 # CANN backend CI: runs on manual dispatch, on pushes to master that touch this
 # workflow or C/C++/CMake sources, and on pull requests that touch this
 # workflow or the CANN backend.
 name: CI (cann)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # '**/*.cmake' matches every CMake module in the tree (a bare '**/.cmake'
    # would only match files literally named ".cmake").
    paths: [
      '.github/workflows/build-cann.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-cann.yml',
      'ggml/src/ggml-cann/**'
    ]
 # When head_ref is set (pull requests) the group is keyed by the ref, so a newer
 # run cancels the older one; otherwise the unique run id keeps runs independent.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # Builds the CANN backend inside an ascendai/cann openEuler container for each
  # combination of host architecture, chip type and ACL-graph setting.
  openEuler-latest-cann:
    defaults:
      run:
        shell: bash -el {0}
    strategy:
      matrix:
        arch: [x86, aarch64]
        chip_type: ['910b', '310p']
        build: ['Release']
        use_acl_graph: ['on', 'off']
        exclude:
          # 310P does not support USE_ACL_GRAPH=on
          - chip_type: '310p'
            use_acl_graph: 'on'
    # aarch64 entries run on the ARM runner, everything else on the x86 runner
    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Free up disk space
        uses: ggml-org/free-disk-space@v1.3.1
        with:
          tool-cache: true
      # Selects the container tag by chip type and exposes it as the step output `image`
      - name: Set container image
        id: cann-image
        run: |
          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
          echo "image=${image}" >> "${GITHUB_OUTPUT}"
      - name: Pull container image
        run: docker pull "${{ steps.cann-image.outputs.image }}"
      # Mounts the checkout at /workspace, installs the build dependencies with
      # yum inside the container, configures and builds with GGML_CANN=on, then
      # chowns /workspace/build to the runner's uid/gid captured before docker run.
      - name: Build
        env:
          BUILD_TYPE: ${{ matrix.build }}
          SOC_TYPE: ascend${{ matrix.chip_type }}
          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
        run: |
          HOST_UID=$(id -u)
          HOST_GID=$(id -g)
          docker run --rm \
            -v "${PWD}:/workspace" \
            -w /workspace \
            -e SOC_TYPE=${SOC_TYPE} \
            -e BUILD_TYPE=${BUILD_TYPE} \
            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
            "${{ steps.cann-image.outputs.image }}" \
            bash -lc '
              set -e
              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
              yum clean all && rm -rf /var/cache/yum
              git config --global --add safe.directory "/workspace"
              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
              cmake -S . -B build \
                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
                  -DGGML_CANN=on \
                  -DSOC_TYPE=${SOC_TYPE} \
                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
              cmake --build build -j $(nproc)
              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
            '
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@ -5,7 +5,7 @@ on:
 jobs:
  linux:
-    runs-on: ubuntu-24.04
+    runs-on: ubuntu-slim
    steps:
      - uses: actions/checkout@v6
        with:
@ -14,7 +14,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install -y build-essential tcl
+          sudo apt install -y build-essential tcl cmake
      - name: Build
        run: |
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@ -1,7 +1,24 @@
-name: Build on Linux using cross-compiler
+name: CI (cross)
 on:
  # limited triggers due to low-importance of the workflows: manual dispatch,
  # pushes to master touching the paths below, and the weekly schedule
  # TODO: for regular runs, provision dedicated self-hosted runners
  workflow_dispatch:
-  workflow_call:
+  push:
    branches:
      - master
    # The first entry must name this workflow file itself so that edits to the
    # workflow re-trigger it.
    # NOTE(review): confirm that the two source globs match the current layout
    # of the ggml source tree.
    paths: [
      '.github/workflows/build-linux-cross.yml',
      'ggml/src/spacemit/*',
      'ggml/src/arch/loongarch/*'
    ]
  # run once every week
  schedule:
    - cron: '0 0 * * 0'
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 jobs:
  # ubuntu-24-riscv64-cpu-cross:
@ -142,7 +159,7 @@ jobs:
  #         cmake --build build --config Release -j $(nproc)
  debian-13-loongarch64-cpu-cross:
-    runs-on: ubuntu-24.04
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
    steps:
@ -197,7 +214,7 @@ jobs:
          cmake --build build --config Release -j $(nproc)
  debian-13-loongarch64-vulkan-cross:
-    runs-on: ubuntu-24.04
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
    steps:
@ -264,15 +281,15 @@ jobs:
    steps:
      - uses: actions/checkout@v6
-      - name: Use SpacemiT Toolchain Cache
+      #- name: Use SpacemiT Toolchain Cache
-        uses: actions/cache@v5
+      #  uses: actions/cache@v5
-        id: cache-toolchain
+      #  id: cache-toolchain
-        with:
+      #  with:
-          path: ./spacemit_toolchain
+      #    path: ./spacemit_toolchain
-          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+      #    key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
      - name: Setup SpacemiT Toolchain
-        if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        #if: steps.cache-toolchain.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-spacemit
        with:
          path: ./spacemit_toolchain
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@ -0,0 +1,72 @@
 # MSYS2 CI: started manually or by the weekly schedule; there are no push or
 # pull_request triggers.
 name: CI (msys)
 on:
  # no push/PR triggers due to low-importance of the workflows
  # TODO: for regular runs, provision dedicated self-hosted runners
  workflow_dispatch:
  # run once every week
  schedule:
    - cron: '0 0 * * 0'
 # When head_ref is set the group is keyed by the ref; otherwise the unique run
 # id is used, which keeps runs independent of each other.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # Builds under two MSYS2 environments (UCRT64 and CLANG64), first with the
  # default configuration and then again from a clean tree with OpenBLAS enabled.
  windows-msys2:
    runs-on: windows-2025
    strategy:
      # One environment failing does not cancel the other
      fail-fast: false
      matrix:
        include:
          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
          - { sys: CLANG64, env: clang-x86_64, build: Release }
    steps:
      - name: Clone
        uses: actions/checkout@v6
      #- name: ccache
      #  uses: ggml-org/ccache-action@v1.2.16
      #  with:
      #    key: windows-msys2
      #    variant: ccache
      #    evict-old-files: 1d
      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      # Installs the toolchain, cmake and openblas packages of the selected environment
      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
        with:
          update: true
          msystem: ${{matrix.sys}}
          install: >-
            base-devel
            git
            mingw-w64-${{matrix.env}}-toolchain
            mingw-w64-${{matrix.env}}-cmake
            mingw-w64-${{matrix.env}}-openblas
      # NOTE(review): no CMAKE_BUILD_TYPE is passed at configure time; if the
      # default generator here is single-config, `--config` has no effect and
      # matrix.build is not applied - confirm intent.
      - name: Build using CMake
        shell: msys2 {0}
        run: |
            cmake -B build
            cmake --build build --config ${{ matrix.build }} -j $(nproc)
      # Removes the build tree so the OpenBLAS build below starts from scratch
      - name: Clean after building using CMake
        shell: msys2 {0}
        run: |
            rm -rf build
      - name: Build using CMake w/ OpenBLAS
        shell: msys2 {0}
        run: |
            cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
            cmake --build build --config ${{ matrix.build }} -j $(nproc)
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@ -0,0 +1,136 @@
 # RISC-V CI: runs on manual dispatch, on pushes to master that touch this
 # workflow or C/C++/CMake sources, and on pull requests that touch this
 # workflow or the RISC-V CPU backend sources.
 name: CI (riscv)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # '**/*.cmake' matches every CMake module in the tree (a bare '**/.cmake'
    # would only match files literally named ".cmake").
    paths: [
      '.github/workflows/build-riscv.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-riscv.yml',
      'ggml/src/ggml-cpu/arch/riscv/**'
    ]
 # When head_ref is set (pull requests) the group is keyed by the ref, so a newer
 # run cancels the older one; otherwise the unique run id keeps runs independent.
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 # Environment shared by every job of this workflow
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  # Debug sanitizer builds (ADDRESS, THREAD, UNDEFINED) on the runner labelled
  # RISCV64, compiled with gcc-14 through ccache. A failing matrix entry does not
  # fail the workflow (continue-on-error).
  ubuntu-riscv64-native-sanitizer:
    runs-on: RISCV64
    continue-on-error: true
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug]
    steps:
      # Installs the toolchain and helpers, makes gcc-14/g++-14 the default
      # compilers, installs the stable Rust toolchain and initialises git-lfs
      - name: Install dependencies
        run: |
          sudo apt-get update
          # Install necessary packages
          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
          # Set gcc-14 and g++-14 as the default compilers
          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
          # Install Rust stable version
          rustup install stable
          rustup default stable
          git lfs install
      - name: GCC version check
        run: |
          gcc --version
          g++ --version
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      # Configures a per-matrix-entry ccache directory under $HOME/.ccache and
      # exports CCACHE_DIR and a PATH that starts with /usr/lib/ccache to the
      # following steps through GITHUB_ENV
      - name: Setup ccache
        run: |
          # Unique cache directory per matrix combination
          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
          mkdir -p "$CCACHE_DIR"
          # Configure ccache
          ccache --set-config=max_size=5G
          ccache --set-config=compression=true
          ccache --set-config=compression_level=6
          ccache --set-config=cache_dir="$CCACHE_DIR"
          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
          ccache --set-config=hash_dir=false
          # Export for subsequent steps
          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
      # Used for every sanitizer except THREAD; builds with OpenMP enabled
      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
            -DLLAMA_OPENSSL=OFF \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=ON \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DLLAMA_BUILD_TOOLS=ON \
            -DLLAMA_BUILD_TESTS=OFF \
            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
      # THREAD sanitizer variant: identical to the step above except GGML_OPENMP=OFF
      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
            -DLLAMA_OPENSSL=OFF \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DLLAMA_BUILD_TOOLS=ON \
            -DLLAMA_BUILD_TESTS=OFF \
            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
      # NOTE(review): both configure steps pass -DLLAMA_BUILD_TESTS=OFF, so
      # confirm that ctest actually finds tests with the "main" label here.
      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@ -0,0 +1,87 @@
 name: CI (sanitize)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # Run only when this workflow or C/C++/CMake sources change.
    # '**/*.cmake' matches CMake modules at any depth ('**/.cmake' would only
    # match files literally named ".cmake").
    paths: [
      '.github/workflows/build-sanitize.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  ubuntu-latest-sanitizer:
    runs-on: ubuntu-latest
    continue-on-error: true
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libssl-dev
      - name: Build
        id: cmake_build
        if: ${{ matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest -L main --verbose --timeout 900
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@ -0,0 +1,250 @@
 name: CI (self-hosted)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # Run when this workflow file or any backend/source file changes.
    # The first entry must name this workflow's own file so that edits to it
    # re-trigger the run, matching the pull_request filter below.
    # '**/*.cmake' matches CMake modules at any depth ('**/.cmake' would only
    # match files literally named ".cmake").
    paths: [
      '.github/workflows/build-self-hosted.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.cuh',
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
      '**/*.comp',
      '**/*.glsl',
      '**/*.wgsl'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    # Same filter as for pushes to master.
    paths: [
      '.github/workflows/build-self-hosted.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.cuh',
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
      '**/*.comp',
      '**/*.glsl',
      '**/*.wgsl'
    ]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  ggml-ci-nvidia-cuda:
    runs-on: [self-hosted, Linux, NVIDIA]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  ggml-ci-nvidia-vulkan-cm:
    runs-on: [self-hosted, Linux, NVIDIA]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  ggml-ci-nvidia-vulkan-cm2:
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  ggml-ci-cpu-amx:
    runs-on: [self-hosted, Linux, CPU, AMX]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  # ggml-ci-amd-vulkan:
  #   runs-on: [self-hosted, Linux, AMD]
  #   steps:
  #     - name: Clone
  #       id: checkout
  #       uses: actions/checkout@v6
  #     - name: Test
  #       id: ggml-ci
  #       run: |
  #         vulkaninfo --summary
  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  # ggml-ci-amd-rocm:
  #   runs-on: [self-hosted, Linux, AMD]
  #   steps:
  #     - name: Clone
  #       id: checkout
  #       uses: actions/checkout@v6
  #     - name: Test
  #       id: ggml-ci
  #       run: |
  #         amd-smi static
  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
  ggml-ci-mac-webgpu:
    runs-on: [self-hosted, macOS, ARM64]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Dawn Dependency
        id: dawn-depends
        run: |
          DAWN_VERSION="v2.0.0"
          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          curl -L -o artifact.zip \
            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          mkdir dawn
          unzip artifact.zip
          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
      - name: Test
        id: ggml-ci
        run: |
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
  ggml-ci-mac-vulkan:
    runs-on: [self-hosted, macOS, ARM64]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
  ggml-ci-linux-intel-vulkan:
    runs-on: [self-hosted, Linux, Intel]
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          persist-credentials: false
      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
  ggml-ci-intel-openvino-gpu-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: Use OpenVINO Toolkit Cache
        uses: actions/cache@v5
        id: cache-openvino
        with:
          path: ./openvino_toolkit
          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-openvino
        with:
          path: ./openvino_toolkit
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}
      - name: Install OpenVINO dependencies
        run: |
          cd ./openvino_toolkit
          chmod +x ./install_dependencies/install_openvino_dependencies.sh
          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
      - name: Test
        id: ggml-ci
        run: |
          source ./openvino_toolkit/setupvars.sh
          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@ -0,0 +1,96 @@
 name: CI (vulkan)
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    # On master, run for changes to this workflow, C/C++/CMake sources and shaders.
    # '**/*.cmake' matches CMake modules at any depth ('**/.cmake' would only
    # match files literally named ".cmake").
    paths: [
      '.github/workflows/build-vulkan.yml',
      '**/CMakeLists.txt',
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.comp',
      '**/*.glsl'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    # On pull requests, run only for changes to this workflow or the Vulkan backend.
    paths: [
      '.github/workflows/build-vulkan.yml',
      'ggml/src/ggml-vulkan/**'
    ]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
 jobs:
  ubuntu-24-vulkan-llvmpipe:
    runs-on: ubuntu-24.04
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-vulkan-llvmpipe
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
      - name: Dependencies
        id: depends
        run: |
          sudo add-apt-repository -y ppa:kisak/kisak-mesa
          sudo apt-get update -y
          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
      - name: Get latest Vulkan SDK version
        id: vulkan_sdk_version
        run: |
          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
      - name: Use Vulkan SDK Cache
        uses: actions/cache@v5
        id: cache-sdk
        with:
          path: ./vulkan_sdk
          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-vulkan-llvmpipe
        with:
          path: ./vulkan_sdk
          version: ${{ env.VULKAN_SDK_VERSION }}
      - name: Build
        id: cmake_build
        run: |
          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
          cd build
          export GGML_VK_VISIBLE_DEVICES=0
          export GGML_VK_DISABLE_F16=1
          export GGML_VK_DISABLE_COOPMAT=1
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 4800
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -47,6 +47,7 @@ jobs:
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
          - { tag: "rocm",   dockerfile: ".devops/rocm.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v6
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@ -21,7 +21,7 @@ on:
 jobs:
  deploy:
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@ -4,10 +4,16 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+    paths: [
      '.github/workflows/python-lint.yml',
      '**/*.py'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/python-lint.yml', '**/*.py']
+    paths: [
      '.github/workflows/python-lint.yml',
      '**/*.py'
    ]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -10,7 +10,22 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/release.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
+    paths: [
      '.github/workflows/release.yml',
      '**/CMakeLists.txt',
      '**/.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.cuh',
      '**/*.swift',
      '**/*.m',
      '**/*.metal',
      '**/*.comp',
      '**/*.glsl'
    ]
 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -34,7 +49,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: macOS-latest-cmake-arm64
+          key: macOS-latest-arm64
          evict-old-files: 1d
      - name: Build
@ -81,7 +96,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: macOS-latest-cmake-x64
+          key: macOS-latest-x64
          evict-old-files: 1d
      - name: Build
@ -140,7 +155,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-cpu-cmake-${{ matrix.build }}
+          key: ubuntu-cpu-${{ matrix.build }}
          evict-old-files: 1d
      - name: Dependencies
@ -191,7 +206,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-22-cmake-vulkan
+          key: ubuntu-22-vulkan
          evict-old-files: 1d
      - name: Dependencies
@ -231,6 +246,86 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
          name: llama-bin-ubuntu-vulkan-x64.tar.gz
  ubuntu-24-openvino:
    runs-on: ubuntu-24.04
    outputs:
      openvino_version: ${{ steps.openvino_version.outputs.value }}
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
    steps:
      - name: Set OpenVINO version output
        id: openvino_version
        run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-openvino-release-no-preset-v1
          evict-old-files: 1d
      # Install the build toolchain plus the OpenCL headers/ICD loader.
      # Both installs use non-interactive `apt-get install -y` so the step does not
      # depend on the runner image being configured to auto-confirm prompts.
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
      - name: Use OpenVINO Toolkit Cache
        uses: actions/cache@v5
        id: cache-openvino
        with:
          path: ./openvino_toolkit
          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
        uses: ./.github/actions/linux-setup-openvino
        with:
          path: ./openvino_toolkit
          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
          version_full: ${{ env.OPENVINO_VERSION_FULL }}
      - name: Install OpenVINO dependencies
        run: |
          cd ./openvino_toolkit
          chmod +x ./install_dependencies/install_openvino_dependencies.sh
          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
      - name: Build
        id: cmake_build
        run: |
          source ./openvino_toolkit/setupvars.sh
          cmake -B build/ReleaseOV -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_OPENVINO=ON
          cmake --build build/ReleaseOV --config Release -j $(nproc)
      - name: Determine tag name
        id: tag
        uses: ./.github/actions/get-tag-name
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/ReleaseOV/bin/
          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .
      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz
  windows-cpu:
    runs-on: windows-2025
@ -249,7 +344,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-cpu-${{ matrix.arch }}
+          key: windows-latest-cpu-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d
@ -310,7 +405,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
+          key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
          evict-old-files: 1d
@ -456,7 +551,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-sycl
+          key: windows-latest-sycl
          variant: ccache
          evict-old-files: 1d
@ -536,7 +631,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
+          key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
          evict-old-files: 1d
      - name: Dependencies
@ -646,7 +741,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
+          key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
          evict-old-files: 1d
      - name: Install ROCm
@ -872,7 +967,7 @@ jobs:
    permissions:
        contents: write # for creating release
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-slim
    needs:
      - windows
@ -883,6 +978,7 @@ jobs:
      - ubuntu-22-rocm
      - ubuntu-22-cpu
      - ubuntu-22-vulkan
      - ubuntu-24-openvino
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
@ -967,6 +1063,7 @@ jobs:
            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@ -0,0 +1,105 @@
 name: Server (sanitize)
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: [
      '.github/workflows/server-sanitize.yml',
      '**/CMakeLists.txt',
      '**/Makefile',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      'tools/server/**.*'
    ]
 env:
  LLAMA_LOG_COLORS: 1
  LLAMA_LOG_PREFIX: 1
  LLAMA_LOG_TIMESTAMPS: 1
  LLAMA_LOG_VERBOSITY: 10
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
 jobs:
  server:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
        build_type: [RelWithDebInfo]
      fail-fast: false
    steps:
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
            curl \
            wget \
            language-pack-en \
            libssl-dev
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_SCHED_NO_REALLOC=ON \
            -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
            -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
            -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
            -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
            -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
            -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Python setup
        id: setup_python
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'
          pip-install: -r tools/server/tests/requirements.txt
      # Runs the server integration tests with pytest, excluding tests marked "slow".
      # NOTE(review): this job's matrix only defines `sanitizer` and `build_type`, so
      # `matrix.disabled_on_pr` and `matrix.extra_args` expand to empty values here:
      # the `if` guard is always true and `export` receives no argument.
      - name: Tests
        id: server_integration_tests
        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
        run: |
          cd tools/server/tests
          export ${{ matrix.extra_args }}
          pytest -v -x -m "not slow"
      # Runs pytest with SLOW_TESTS=1 and without the "not slow" marker filter.
      # NOTE(review): the guard requires matrix.build_type == 'Release', but the only
      # build_type in this matrix is RelWithDebInfo, and this workflow declares no
      # `schedule` trigger, so as written this step looks unreachable. Confirm intent.
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd tools/server/tests
          export ${{ matrix.extra_args }}
          SLOW_TESTS=1 pytest -v -x
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@ -1,4 +1,4 @@
-name: Server-Metal
+name: Server (self-hosted)
 on:
  workflow_dispatch: # allows manual triggering
@ -14,7 +14,19 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server-metal.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: [
      '.github/workflows/server-self-hosted.yml',
      '**/CMakeLists.txt',
      '**/Makefile',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.swift',
      '**/*.m',
      'tools/server/**.*'
    ]
 env:
  LLAMA_LOG_COLORS: 1
@ -28,7 +40,7 @@ concurrency:
 jobs:
  server-metal:
-    runs-on: [self-hosted, macOS, ARM64]
+    runs-on: [self-hosted, llama-server, macOS, ARM64]
    name: server-metal (${{ matrix.wf_name }})
    strategy:
@ -71,3 +83,42 @@ jobs:
          pip install -r requirements.txt
          export ${{ matrix.extra_args }}
          pytest -v -x -m "not slow"
  server-cuda:
    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
    name: server-cuda (${{ matrix.wf_name }})
    strategy:
      matrix:
        build_type: [Release]
        wf_name: ["GPUx1"]
        include:
          - build_type: Release
            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name:    "GPUx1, backend-sampling"
      fail-fast: false
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
      # Configure and build only the llama-server target.
      # This job runs on a Linux runner, so the parallel job count comes from
      # `nproc`; `sysctl -n hw.logicalcpu` exists only on macOS and would leave
      # `-j` without a value here.
      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Tests
        id: server_integration_tests
        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          export ${{ matrix.extra_args }}
          pytest -v -x -m "not slow"
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@ -1,4 +1,3 @@
 # Server WebUI build and tests
 name: Server WebUI
 on:
@ -11,10 +10,20 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
      'tools/server/tests/**.*',
      'tools/server/public/**'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server-webui.yml', 'tools/server/webui/**.*', 'tools/server/tests/**.*', 'tools/server/public/**']
+    paths: [
      '.github/workflows/server-webui.yml',
      'tools/server/webui/**.*',
      'tools/server/tests/**.*',
      'tools/server/public/**'
    ]
 env:
  LLAMA_LOG_COLORS: 1
@ -29,7 +38,7 @@ concurrency:
 jobs:
  webui-check:
    name: WebUI Checks
-    runs-on: ubuntu-latest
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    continue-on-error: true
    steps:
      - name: Checkout code
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -1,4 +1,3 @@
 # Server build and tests
 name: Server
 on:
@ -15,10 +14,34 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: [
      '.github/workflows/server.yml',
      '**/CMakeLists.txt',
      '**/Makefile',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.swift',
      '**/*.m',
      'tools/server/**.*'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'tools/server/**.*']
+    paths: [
      '.github/workflows/server.yml',
      '**/CMakeLists.txt',
      '**/Makefile',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp',
      '**/*.cu',
      '**/*.swift',
      '**/*.m',
      'tools/server/**.*'
    ]
 env:
  LLAMA_LOG_COLORS: 1
@ -34,17 +57,18 @@ jobs:
  server:
    runs-on: ubuntu-latest
    name: server (${{ matrix.wf_name }})
    strategy:
      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow
+        build_type: [Release]
-        build_type: [RelWithDebInfo]
+        wf_name: ["default"]
        include:
          - build_type: Release
            sanitizer: ""
            extra_args: ""
            wf_name:    "default"
          - build_type: Release
            sanitizer: ""
            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
            wf_name:    "backend-sampling"
      fail-fast: false
    steps:
@ -74,13 +98,7 @@ jobs:
        run: |
          cmake -B build \
            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_SCHED_NO_REALLOC=ON \
+            -DGGML_SCHED_NO_REALLOC=ON
            -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
            -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
            -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \
            -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \
            -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \
            -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }}
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
      - name: Python setup
--- a/.gitignore
+++ b/.gitignore
@ -124,6 +124,11 @@ poetry.toml
 # Scripts
 !/scripts/install-oneapi.bat
 # Generated by scripts
 /hellaswag_val_full.txt
 /winogrande-debiased-eval.csv
 /wikitext-2-raw/
 # Test models for lora adapters
 /lora-tests
--- a/57
+++ b/57
@ -2,27 +2,13 @@
 # multiple collaborators per item can be specified
 /.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @CISC
+/.github/actions/                       @ggml-org/ci
-/.github/workflows/                     @CISC
+/.github/workflows/                     @ggml-org/ci
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
-/common/CMakeLists.txt                  @ggerganov
+/common/                                @ggml-org/llama-common
-/common/arg.*                           @ggerganov
+/common/jinja/                          @CISC
 /common/base64.hpp.*                    @ggerganov
 /common/build-info.*                    @ggerganov
 /common/chat.*                          @pwilkin
 /common/chat-peg-parser.*               @aldehir
 /common/common.*                        @ggerganov
 /common/console.*                       @ggerganov
 /common/http.*                          @angt
 /common/jinja/                          @ngxson @CISC @aldehir
 /common/llguidance.*                    @ggerganov
 /common/log.*                           @ggerganov
 /common/ngram-map.*                     @srogmann
 /common/peg-parser.*                    @aldehir
 /common/sampling.*                      @ggerganov
 /common/speculative.*                   @ggerganov
 /common/unicode.*                       @aldehir
 /convert_*.py                           @CISC
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
@ -49,29 +35,28 @@
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
 /ggml/include/                          @ggerganov
 /ggml/src/ggml-cann/                    @ggml-org/ggml-cann
 /ggml/src/ggml-common.h                 @ggerganov
 /ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
-/ggml/src/ggml-cuda/fattn*              @JohannesGaessler
+/ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
 /ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
 /ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
 /ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
 /ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
 /ggml/src/ggml-hip/                     @IMbackK
 /ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-impl.h                   @ggerganov
-/ggml/src/ggml-metal/                   @ggerganov
+/ggml/src/ggml-metal/                   @ggml-org/ggml-metal
-/ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
+/ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-hexagon/                 @max-krasnyansky @lhez
+/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
-/ggml/src/ggml-rpc/                     @rgerganov
+/ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
 /ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
 /ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-vulkan/                  @0cc4m
+/ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
 /ggml/src/ggml-virtgpu/                 @kpouget
-/ggml/src/ggml-webgpu/                  @reeselevine
+/ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
-/ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
+/ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
@ -89,16 +74,18 @@
 /src/llama-vocab.*                      @CISC
 /src/models/                            @CISC
 /tests/                                 @ggerganov
-/tests/test-chat-.*                     @pwilkin
+/tests/test-chat.*                      @pwilkin
 /tests/test-llama-archs.cpp             @JohannesGaessler
 /tools/batched-bench/                   @ggerganov
 /tools/cli/                             @ngxson
 /tools/completion/                      @ggerganov
-/tools/mtmd/                            @ngxson
+/tools/mtmd/                            @ggml-org/llama-mtmd
 /tools/perplexity/                      @ggerganov
 /tools/parser/                          @pwilkin
 /tools/quantize/                        @ggerganov
-/tools/rpc/                             @rgerganov
+/tools/rpc/                             @ggml-org/ggml-rpc
-/tools/server/*                         @ngxson @ggerganov # no subdir
+/tools/server/*                         @ggml-org/llama-server # no subdir
-/tools/server/webui/                    @allozaur
+/tools/server/webui/                    @ggml-org/llama-webui
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -38,7 +38,13 @@ Before submitting your PR:
  - Avoid combining unrelated changes in a single PR
  - For intricate features, consider opening a feature request first to discuss and align expectations
  - When adding support for a new model or feature, focus on **CPU support only** in the initial PR unless you have a good reason not to. Add support for other backends like CUDA in follow-up PRs
  - In particular, adding new data types (extension of the `ggml_type` enum) carries with it a disproportionate maintenance burden. As such, to add a new quantization type you will need to meet the following *additional* criteria *at minimum*:
    - convert a small model to GGUF using the new type and upload it to HuggingFace
    - provide [perplexity](https://github.com/ggml-org/llama.cpp/tree/master/tools/perplexity) comparisons to FP16/BF16 (whichever is the native precision) as well as to types of similar size
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If you are a new contributor, limit your open PRs to 1.
 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
@ -159,7 +165,7 @@ Maintainers reserve the right to decline review or close pull requests for any r
 # Code maintenance
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
+- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
  - Reviewing and merging related PRs
  - Fixing related bugs
  - Providing developer guidance/support
--- a/README.md
+++ b/README.md
@ -259,6 +259,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 - [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
 - [LLMKube](https://github.com/defilantech/llmkube) - Kubernetes operator for llama.cpp with multi-GPU and Apple Silicon Metal
  support
 </details>
 <details>
@ -277,6 +279,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
 | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
@ -287,7 +290,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
-| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |
+| [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
 ## Obtaining and quantizing models
--- a/benches/nemotron/nemotron-dgx-spark.md
+++ b/benches/nemotron/nemotron-dgx-spark.md
@ -0,0 +1,72 @@
 # NVIDIA DGX Spark
 ## System info
 ```bash
 uname --all
 Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
 g++ --version
 g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
 nvidia-smi
 Fri Mar  6 11:39:45 2026
 +-----------------------------------------------------------------------------------------+
 | NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
 +-----------------------------------------+------------------------+----------------------+
 | GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
 | Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
 |                                         |                        |               MIG M. |
 |=========================================+========================+======================|
 |   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
 | N/A   52C    P0             13W /  N/A  | Not Supported          |      0%      Default |
 |                                         |                        |                  N/A |
 +-----------------------------------------+------------------------+----------------------+
 ```
 ## ggml-org/nemotron-3-super-120b-GGUF
 Model: https://huggingface.co/ggml-org/nemotron-3-super-120b-GGUF
 - `llama-batched-bench`
 main: n_kv_max = 303104, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, is_tg_separate = 0, n_gpu_layers = 99, n_threads = 20, n_threads_batch = 20
 |    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
 |-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
 |   512 |     32 |    1 |    544 |    1.094 |   468.05 |    1.621 |    19.74 |    2.715 |   200.37 |
 |   512 |     32 |    2 |   1088 |    1.463 |   700.16 |    2.437 |    26.26 |    3.900 |   279.01 |
 |   512 |     32 |    4 |   2176 |    2.647 |   773.76 |    4.043 |    31.66 |    6.689 |   325.29 |
 |   512 |     32 |    8 |   4352 |    5.291 |   774.14 |    6.151 |    41.62 |   11.442 |   380.37 |
 |   512 |     32 |   16 |   8704 |   10.603 |   772.62 |   10.385 |    49.30 |   20.987 |   414.72 |
 |   512 |     32 |   32 |  17408 |   21.231 |   771.69 |   18.235 |    56.16 |   39.466 |   441.09 |
 |  4096 |     32 |    1 |   4128 |    5.340 |   767.05 |    1.616 |    19.81 |    6.956 |   593.47 |
 |  4096 |     32 |    2 |   8256 |   10.673 |   767.55 |    2.454 |    26.08 |   13.127 |   628.94 |
 |  4096 |     32 |    4 |  16512 |   21.348 |   767.46 |    4.072 |    31.44 |   25.420 |   649.57 |
 |  4096 |     32 |    8 |  33024 |   42.714 |   767.15 |    6.277 |    40.78 |   48.991 |   674.08 |
 |  4096 |     32 |   16 |  66048 |   85.385 |   767.54 |   10.596 |    48.32 |   95.981 |   688.14 |
 |  4096 |     32 |   32 | 132096 |  170.819 |   767.32 |   18.619 |    55.00 |  189.437 |   697.31 |
 |  8192 |     32 |    1 |   8224 |   10.690 |   766.32 |    1.619 |    19.76 |   12.310 |   668.10 |
 |  8192 |     32 |    2 |  16448 |   21.382 |   766.24 |    2.467 |    25.94 |   23.850 |   689.65 |
 |  8192 |     32 |    4 |  32896 |   42.782 |   765.92 |    4.098 |    31.23 |   46.881 |   701.69 |
 |  8192 |     32 |    8 |  65792 |   85.582 |   765.77 |    6.368 |    40.20 |   91.951 |   715.52 |
 |  8192 |     32 |   16 | 131584 |  171.066 |   766.21 |   10.774 |    47.52 |  181.840 |   723.62 |
 |  8192 |     32 |   32 | 263168 |  342.140 |   766.19 |   18.969 |    53.98 |  361.109 |   728.78 |
 - `llama-bench`
 | model                   |       size |     params | backend    | n_ubatch | fa |            test |                  t/s |
 | ----------------------- | ---------: | ---------: | ---------- | -------: | -: | --------------: | -------------------: |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |          pp2048 |        768.84 ± 0.90 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |            tg32 |         19.94 ± 0.16 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d4096 |        764.51 ± 0.50 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d4096 |         19.95 ± 0.18 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |  pp2048 @ d8192 |        759.53 ± 0.71 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |    tg32 @ d8192 |         19.83 ± 0.18 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d16384 |        747.98 ± 1.58 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d16384 |         19.84 ± 0.18 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 | pp2048 @ d32768 |        724.40 ± 2.70 |
 | nemotron 120B.A12B Q4_K |  65.10 GiB |   120.67 B | CUDA       |     2048 |  1 |   tg32 @ d32768 |         19.45 ± 0.18 |
 build: 04a65daab (8268)
--- a/ci/run.sh
+++ b/ci/run.sh
@ -25,6 +25,9 @@
 # # with KLEIDIAI support
 # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 # # with OPENVINO support
 # GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -46,6 +49,7 @@ cd $sd/../
 SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
 CTEST_EXTRA=""
 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@ -165,6 +169,18 @@ if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
        -DBUILD_SHARED_LIBS=OFF"
 fi
 if [ ! -z ${GG_BUILD_OPENVINO} ]; then
    if [ -z ${OpenVINO_DIR} ]; then
        echo "OpenVINO_DIR not found, please install OpenVINO via archives and enable it by:"
        echo "source /opt/intel/openvino/setupvars.sh"
        exit 1
    fi
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_OPENVINO=ON"
    # TODO: fix and re-enable the `test-llama-archs` test below
    CTEST_EXTRA="-E test-llama-archs"
 fi
 ## helpers
 # download a file if it does not exist or if it is outdated
@ -222,7 +238,7 @@ function gg_run_ctest_debug {
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log
-    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
 }
@ -254,9 +270,9 @@ function gg_run_ctest_release {
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi
    set +e
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -47,10 +47,10 @@ add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
-    chat-parser.cpp
+    chat-auto-parser-generator.cpp
-    chat-parser.h
+    chat-auto-parser-helpers.cpp
-    chat-parser-xml-toolcall.h
+    chat-auto-parser.h
-    chat-parser-xml-toolcall.cpp
+    chat-diff-analyzer.cpp
    chat-peg-parser.cpp
    chat-peg-parser.h
    chat.cpp
@ -81,6 +81,8 @@ add_library(${TARGET} STATIC
    preset.cpp
    preset.h
    regex-partial.cpp
    reasoning-budget.cpp
    reasoning-budget.h
    regex-partial.h
    sampling.cpp
    sampling.h
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -732,23 +732,28 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-completion",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-debug",
        "llama-diffusion-cli",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
        "llama-finetune",
        "llama-fit-params",
        "llama-gemma3-cli",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
-        "llama-gritlm",
+        "llama-idle",
        "llama-imatrix",
-        "llama-infill",
+        "llama-llava-cli",
        "llama-mtmd-cli",
        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
        "llama-minicpmv-cli",
        "llama-mtmd-cli",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
@ -1279,13 +1284,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
-        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
+        {"-ctxcp", "--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)"
            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
        [](common_params & params, int value) {
            params.checkpoint_every_nt = value;
        }
    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-cram", "--cache-ram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
@ -2399,7 +2411,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.fit_params = false;
            } else {
                throw std::runtime_error(
-                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
            }
        }
    ).set_env("LLAMA_ARG_FIT"));
@ -2420,11 +2432,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                );
            }
            if (split_arg.size() == 1) {
-                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoull(split_arg[0]) * 1024*1024);
                return;
            }
            for (size_t i = 0; i < split_arg.size(); i++) {
-                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+                params.fit_params_target[i] = std::stoull(split_arg[i]) * 1024*1024;
            }
        }
    ).set_env("LLAMA_ARG_FIT_TARGET"));
@ -2520,11 +2532,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"-a", "--alias"}, "STRING",
-        "set alias for model name (to be used by REST API)",
+        "set model name aliases, comma-separated (to be used by API)",
        [](common_params & params, const std::string & value) {
-            params.model_alias = value;
+            for (auto & alias : string_split<std::string>(value, ',')) {
                alias = string_strip(alias);
                if (!alias.empty()) {
                    params.model_alias.insert(alias);
                }
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
    add_opt(common_arg(
        {"--tags"}, "STRING",
        "set model tags, comma-separated (informational, not used for routing)",
        [](common_params & params, const std::string & value) {
            for (auto & tag : string_split<std::string>(value, ',')) {
                tag = string_strip(tag);
                if (!tag.empty()) {
                    params.model_tags.insert(tag);
                }
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
    add_opt(common_arg(
        {"-m", "--model"}, "FNAME",
        ex == LLAMA_EXAMPLE_EXPORT_LORA
@ -2642,7 +2671,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE,
                    LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@ -2810,6 +2840,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.webui_config_json = read_file(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
    add_opt(common_arg(
        {"--webui-mcp-proxy"},
        {"--no-webui-mcp-proxy"},
        string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
    add_opt(common_arg(
        {"--webui"},
        {"--no-webui"},
@ -2881,6 +2919,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            auto parsed = json::parse(value);
            for (const auto & item : parsed.items()) {
                if (item.key() == "enable_thinking") {
                    LOG_WRN("Setting 'enable_thinking' via --chat-template-kwargs is deprecated. "
                            "Use --reasoning on / --reasoning off instead.\n");
                }
                params.default_template_kwargs[item.key()] = item.value().dump();
            }
        }
@ -3016,14 +3058,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.reasoning_format = common_reasoning_format_from_name(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"-rea", "--reasoning"}, "[on|off|auto]",
        "Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))",
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.enable_reasoning = 1;
                params.default_template_kwargs["enable_thinking"] = "true";
            } else if (is_falsey(value)) {
                params.enable_reasoning = 0;
                params.default_template_kwargs["enable_thinking"] = "false";
            } else if (is_autoy(value)) {
                params.enable_reasoning = -1;
            } else {
                throw std::invalid_argument(
                    string_format("error: unknown value for --reasoning: '%s'\n", value.c_str()));
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_REASONING"));
    add_opt(common_arg(
        {"--reasoning-budget"}, "N",
-        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
        [](common_params & params, int value) {
-            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            if (value < -1) { throw std::invalid_argument("invalid value"); }
            params.reasoning_budget = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
    add_opt(common_arg(
        {"--reasoning-budget-message"}, "MESSAGE",
        "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
        [](common_params & params, const std::string & value) {
            params.reasoning_budget_message = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@ -3575,6 +3642,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"--check"},
        string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
        [](common_params & params) {
            params.check = true;
        }
    ).set_examples({LLAMA_EXAMPLE_RESULTS}));
    add_opt(common_arg(
        {"--save-logits"},
        string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -0,0 +1,454 @@
 #include "chat-auto-parser.h"
 #include "chat-peg-parser.h"
 #include "chat.h"
 #include "common.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
 #include <stdexcept>
 #include <string>
 using json = nlohmann::ordered_json;
 // Helper to iterate over tools/functions
 // Invokes `fn` once for every entry of `tools` that is a function tool, i.e. an entry
 // that has "type" equal to "function" and also carries a "function" member.
 // Entries that do not match are silently skipped.
 static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
    for (const auto & entry : tools) {
        const bool is_function_tool =
            entry.contains("type") && entry.at("type") == "function" && entry.contains("function");
        if (is_function_tool) {
            fn(entry);
        }
    }
 }
 namespace autoparser {
 // Binds the PEG builder and the template inputs used while constructing one parser.
 // reasoning_parser is initialised to the builder's eps() parser as a placeholder.
 parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
    p(p),
    inputs(inputs),
    reasoning_parser(p.eps()) {}
 // Convenience overload: analyzes `tmpl` from scratch and then forwards to the
 // overload that takes an already-populated analysis object.
 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
                                                  const struct templates_params & inputs) {
    struct autoparser analysis;
    analysis.analyze_template(tmpl);  // differential analysis of the template structure
    return generate_parser(tmpl, inputs, analysis);
 }
 // Produces the chat parameters (prompt, serialized PEG parser and, when needed, a grammar)
 // for `tmpl` using a previously completed template analysis.
 //
 // A grammar is attached when a JSON response schema was requested, or when tools are usable
 // and tool_choice is REQUIRED, or AUTO with a non-empty trigger marker. The grammar is lazy
 // only for the AUTO case without a response schema; it is then triggered by the tool section
 // start marker, falling back to the per-call start marker.
 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
                                                  const struct templates_params & inputs,
                                                  const autoparser &              autoparser) {
    auto peg = autoparser.build_parser(inputs);

    common_chat_params result;
    result.prompt           = common_chat_template_direct_apply(tmpl, inputs);
    result.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
    result.preserved_tokens = autoparser.preserved_tokens;
    result.parser           = peg.save();

    const auto & fmt = autoparser.tools.format;

    const bool tools_available =
        fmt.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
    const std::string trigger = fmt.section_start.empty() ? fmt.per_call_start : fmt.section_start;

    const bool schema_requested = !inputs.json_schema.empty() && inputs.json_schema.is_object();
    const bool choice_auto      = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
    const bool choice_required  = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    const bool tools_need_grammar = tools_available && ((choice_auto && !trigger.empty()) || choice_required);
    if (!schema_requested && !tools_need_grammar) {
        // Nothing constrains the output: return without a grammar
        return result;
    }

    result.grammar_lazy = !schema_requested && choice_auto;
    result.grammar      = build_grammar([&](const common_grammar_builder & builder) {
        // Resolve $ref entries in every tool's parameter schema before emitting rules
        foreach_function(inputs.tools, [&](const json & tool) {
            const auto & fn_def = tool.at("function");
            auto         params = fn_def.at("parameters");
            builder.resolve_refs(params);
        });
        peg.build_grammar(builder, result.grammar_lazy);
    });

    if (result.grammar_lazy) {
        result.grammar_triggers = {
            { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger }
        };
    }
    return result;
 }
 // Builds the PEG parser for model output from the completed template analysis.
 //
 // Throws std::invalid_argument if analyze_template(...) has not been run yet.
 // Selection order inside the builder callback:
 //   1. a JSON response schema, if one was supplied (optionally fenced in ```json ... ```),
 //   2. the tool-call parser, if tools are present, tool_choice is not NONE and the
 //      template supports tool calls,
 //   3. the plain content parser otherwise.
 common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
    if (!analysis_complete) {
        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
    }
    return build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // If the template uses Python dict format (single-quoted strings in JSON structures),
        // pre-register a json-string rule that accepts both quote styles. This must happen
        // before any call to p.json() so that all JSON parsing inherits the flexible rule.
        if (tools.format.uses_python_dicts) {
            p.rule("json-string", p.quoted_string());
        }
        parser_build_context ctx(p, inputs);
        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
        bool                 enable_thinking   = inputs.enable_thinking;
        // Reasoning is only extracted when requested, enabled, and the template has a reasoning mode
        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
        ctx.content              = &content;
        // Build reasoning parser
        ctx.reasoning_parser = reasoning.build_parser(ctx);
        bool has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
        bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
        if (has_response_format) {
            // A response schema takes precedence over tool calls and free-form content
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
            return ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
                response_format
            }) + p.end();
        }
        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
            return tools.build_parser(ctx);
        }
        return content.build_parser(ctx);
    });
 }
 // Builds the parser fragment that captures the reasoning section of the output.
 // Returns p.eps() when reasoning is not being extracted, or when the detected mode
 // (or missing start/end markers in the tag-based modes) gives nothing to match.
 common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) const {
    auto & p = ctx.p;
    if (!ctx.extracting_reasoning) {
        return p.eps();
    }
    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
    if (thinking_forced_open || thinking_forced_closed) {
        // Thinking is forced open OR forced closed with enable_thinking=true
        // In both cases, expect only the closing tag (opening was in template)
        // However, since we might have incorrectly detected the open/close pattern,
        // we admit an optional starting marker
        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
    }
    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
        // Both use the same tag-based pattern if markers are available
        if (!start.empty() && !end.empty()) {
            // The whole start/reasoning/end block is optional here
            return p.optional(start + p.reasoning(p.until(end)) + end);
        }
    } else if (mode == reasoning_mode::DELIMITER) {
        // No opening marker: reasoning is whatever precedes `end`, and is optional
        return p.optional(p.reasoning(p.until(end)) + end);
    }
    return p.eps();
 }
 // Builds the parser for a plain (no tool call) response.
 // - Content always wrapped in start/end markers: the wrapped text is captured as content.
 //   When reasoning is extracted, the reasoning parser runs first; otherwise any text that
 //   precedes `start` is captured as content too.
 // - Otherwise: reasoning parser followed by the rest of the input as content.
 common_peg_parser analyze_content::build_parser(parser_build_context & ctx) const {
    auto & p = ctx.p;
    if (is_always_wrapped()) {
        if (ctx.extracting_reasoning) {
            return ctx.reasoning_parser + start + p.content(p.until(end)) + end + p.end();
        }
        return p.content(p.until(start)) + start + p.content(p.until(end)) + end + p.end();
    }
    return ctx.reasoning_parser + p.content(p.rest()) + p.end();
 }
 // Returns an optional parser for a start/content/end block when content is always wrapped
 // in markers; returns p.eps() when it is not. The tool parsers use this to accept wrapped
 // content ahead of the tool calls.
 common_peg_parser analyze_content::build_optional_wrapped(parser_build_context & ctx) const {
    auto & p = ctx.p;
    if (is_always_wrapped()) {
        return p.optional(start + p.content(p.until(end)) + end);
    }
    return p.eps();
 }
 // Dispatches to the tool-call parser builder that matches the detected tool format.
 // For any other format mode an error is logged and an eps() parser is returned.
 common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const {
    const auto mode = format.mode;
    if (mode == tool_format::JSON_NATIVE) {
        return build_tool_parser_json_native(ctx);
    }
    if (mode == tool_format::TAG_WITH_JSON) {
        return build_tool_parser_tag_json(ctx);
    }
    if (mode == tool_format::TAG_WITH_TAGGED) {
        return build_tool_parser_tag_tagged(ctx);
    }
    LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
        "Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
        "report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
    return ctx.p.eps();
 }
 // Builds the parser for templates whose tool calls are emitted as native JSON.
 // The JSON tool-call parsing itself is delegated to p.standard_json_tools(...); this
 // function only prepares the field names and decides what may precede the tool calls.
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    // Build effective field names with dot notation if function_field is set
    // (skipped when function_field is "function" or name_field already contains a dot)
    std::string name_field = format.name_field;
    std::string args_field = format.args_field;
    if (!format.function_field.empty() && format.function_field != "function" &&
        name_field.find('.') == std::string::npos) {
        name_field = format.function_field + "." + name_field;
        args_field = format.function_field + "." + args_field;
    }
    // The fifth argument is the same condition as force_tools above
    auto tools_parser = p.standard_json_tools(
        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
    // Handle content wrappers if present
    if (ctx.content && ctx.content->is_always_wrapped()) {
        auto wrapped_content = ctx.content->build_optional_wrapped(ctx);
        return ctx.reasoning_parser + wrapped_content + tools_parser + p.end();
    }
    // Free-form content runs until the first tool marker: section start, else per-call start,
    // else a bare "{"
    std::string tool_start = "{";
    if (!format.section_start.empty()) {
        tool_start = format.section_start;
    } else if (!format.per_call_start.empty()) {
        tool_start = format.per_call_start;
    }
    // When tools are required, no leading content is accepted
    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
           p.end();
 }
 // Builds the parser for templates where each tool call is a tagged function name followed
 // by a JSON arguments object. One named rule ("tool-<name>") is created per function tool
 // and the rules are combined into a choice; the choice is then wrapped in per-call and/or
 // section markers depending on which of them the template defines.
 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    common_peg_parser tool_choice = p.choice();
    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
        const auto & schema = func.at("parameters");
        // Build call_id parser based on position (if supported)
        common_peg_parser call_id_section = p.eps();
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
            !call_id.suffix.empty()) {
            // Only prefix + id are optional here; the suffix is always required.
            // NOTE(review): build_tool_parser_tag_tagged places the suffix inside the optional
            // instead - confirm the difference is intentional.
            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
        }
        auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
                           call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
        if (!function.close.empty()) {
            func_parser = func_parser + function.close;
        }
        tool_choice |= p.rule("tool-" + name, func_parser);
    });
    // Same condition as force_tools above
    auto require_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    common_peg_parser tool_calls = p.eps();
    if (!format.per_call_start.empty()) {
        // Every call carries its own start/end markers
        auto wrapped_call = format.per_call_start + tool_choice + format.per_call_end;
        if (inputs.parallel_tool_calls) {
            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
        } else {
            tool_calls = p.trigger_rule("tool-call", wrapped_call);
        }
        if (!format.section_start.empty()) {
            // Additionally wrap all calls in the section markers; without a section end marker
            // the section must run to the end of input
            tool_calls = p.trigger_rule("tool-calls",
                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
        }
    } else {
        // No per-call markers: calls sit directly between the section markers
        std::string separator = ", ";  // Default
        if (inputs.parallel_tool_calls) {
            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice +
                                                         p.zero_or_more(separator + tool_choice) + format.section_end);
        } else {
            tool_calls = p.trigger_rule("tool-call", format.section_start + tool_choice + format.section_end);
        }
    }
    if (!require_calls) {
        tool_calls = p.optional(tool_calls);
    }
    // Leading content runs up to the first tool marker; it is not accepted when tools are required
    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
           p.end();
 }
 // Builds the parser for templates where both the function name and every argument are
 // written with tags (no JSON object for the arguments as a whole).
 // For each function tool whose "parameters" has an object-valued "properties" member, a
 // "tool-<name>" rule is created; tools without such properties are skipped. The rules are
 // combined into a choice and wrapped in per-call and/or section markers.
 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    common_peg_parser tool_choice = p.choice();
    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
        const auto & params = func.at("parameters");
        if (!params.contains("properties") || !params.at("properties").is_object()) {
            return;
        }
        const auto &          properties = params.at("properties");
        std::set<std::string> required;
        if (params.contains("required") && params.at("required").is_array()) {
            params.at("required").get_to(required);
        }
        // Build parser for each argument, separating required and optional
        std::vector<common_peg_parser> required_parsers;
        std::vector<common_peg_parser> optional_parsers;
        for (const auto & [param_name, param_schema] : properties.items()) {
            bool        is_required = required.find(param_name) != required.end();
            // Type defaults to "object" unless the schema gives a string "type"
            // (either directly or nested one level inside an object-valued "type")
            std::string type        = "object";
            auto        type_obj    = param_schema.contains("type") ? param_schema.at("type") : json::object();
            if (type_obj.is_string()) {
                type_obj.get_to(type);
            } else if (type_obj.is_object()) {
                if (type_obj.contains("type") && type_obj.at("type").is_string()) {
                    type_obj.at("type").get_to(type);
                }
            }
            // String arguments are taken as raw text up to the value suffix;
            // every other type is parsed as JSON followed by optional whitespace
            auto arg = p.tool_arg(
                p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
                                arguments.name_suffix) +
                arguments.value_prefix +
                (type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
                                                                     "tool-" + name + "-arg-" + param_name + "-schema",
                                                                     param_schema, true)) :
                                    p.tool_arg_json_value(p.schema(
                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
                                        p.space()) +
                p.tool_arg_close(p.literal(arguments.value_suffix)));
            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
                required_parsers.push_back(named_arg);
            } else {
                optional_parsers.push_back(named_arg);
            }
        }
        // Build required arg sequence in definition order
        common_peg_parser args_seq = p.eps();
        for (size_t i = 0; i < required_parsers.size(); i++) {
            if (i > 0) {
                args_seq = args_seq + p.space();
            }
            args_seq = args_seq + required_parsers[i];
        }
        // Build optional args with flexible ordering
        // (any optional arg, repeated at most as many times as there are optional args)
        if (!optional_parsers.empty()) {
            common_peg_parser any_opt = p.choice();
            for (const auto & opt : optional_parsers) {
                any_opt |= opt;
            }
            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
        }
        // Build call_id parser based on position (if supported)
        common_peg_parser call_id_section = p.eps();
        bool have_call_id = false;
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
            !call_id.suffix.empty()) {
            have_call_id = true;
            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
        }
        // matched_atomic == true means the final p.atomic(...) wrap below is skipped
        bool matched_atomic = false;
        common_peg_parser func_parser = p.eps();
        if (!function.name_suffix.empty()) {
            // NOTE(review): this branch sets matched_atomic without applying p.atomic, so the
            // resulting parser is never made atomic - confirm that is intended
            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
                call_id_section + p.space() + args_seq;
            matched_atomic = true;
        } else if (have_call_id) {
            // Atomic over the tool open plus call id only
            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
                call_id_section) + p.space() + args_seq;
            matched_atomic = true;
        } else if (!arguments.name_prefix.empty() && properties.size() > 0) {
            // Atomic over the tool open up to (but not consuming) the first argument prefix
            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
                call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
            matched_atomic = true;
        } else {
            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
                call_id_section + p.space() + args_seq;
        }
        if (!function.close.empty()) {
            func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
        } else if (!format.per_call_end.empty()) {
            // When there's no func_close but there is a per_call_end marker, use peek() to ensure
            // we only emit tool_close when we can actually see the closing marker. This prevents
            // premature closing during partial parsing when we've seen e.g. "</" which could be
            // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
            func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
        } else {
            func_parser =
                func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
        }
        if (!matched_atomic) {
            func_parser = p.atomic(func_parser);
        }
        tool_choice |= p.rule("tool-" + name, func_parser);
    });
    // Same condition as force_tools above
    auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    common_peg_parser tool_calls = p.eps();
    if (!format.per_call_start.empty()) {
        // Every call carries its own start/end markers
        auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
        if (inputs.parallel_tool_calls) {
            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
        } else {
            tool_calls = p.trigger_rule("tool-call", wrapped_call);
        }
        if (!format.section_start.empty()) {
            // Additionally wrap all calls in the section markers; without a section end marker
            // the section must run to the end of input
            tool_calls = p.trigger_rule("tool-calls",
                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
        }
    } else {
        // No per-call markers: calls sit directly between the section markers
        std::string separator = ", ";  // Default
        if (inputs.parallel_tool_calls) {
            tool_calls = p.trigger_rule("tool-call", format.section_start + p.space() + tool_choice +
                                                         p.zero_or_more(separator + tool_choice) + p.space() +
                                                         format.section_end);
        } else {
            tool_calls = p.trigger_rule(
                "tool-call", format.section_start + p.space() + tool_choice + p.space() + format.section_end);
        }
    }
    if (!require_tools) {
        tool_calls = p.optional(tool_calls);
    }
    // Leading content runs up to the first tool marker; it is not accepted when tools are required
    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
           p.end();
 }
 }  // namespace autoparser
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@ -0,0 +1,347 @@
 #include "chat-auto-parser-helpers.h"
 #include "chat-auto-parser.h"
 #include "chat.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
 #include <cctype>
 #include <numeric>
 using json = nlohmann::ordered_json;
 // Returns a copy of `str` with leading and trailing whitespace (per std::isspace) removed.
 // An empty or all-whitespace input yields an empty string.
 std::string trim_whitespace(const std::string & str) {
    const auto is_ws = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };

    size_t first = 0;
    size_t stop  = str.length();  // one past the last kept character
    while (first < stop && is_ws(str[first])) {
        ++first;
    }
    while (stop > first && is_ws(str[stop - 1])) {
        --stop;
    }
    return str.substr(first, stop - first);
 }
 // Returns a copy of `str` without its leading whitespace (per std::isspace).
 std::string trim_leading_whitespace(const std::string & str) {
    size_t first = 0;
    for (; first < str.length(); ++first) {
        if (!std::isspace(static_cast<unsigned char>(str[first]))) {
            break;
        }
    }
    return str.substr(first);
 }
 // Returns a copy of `str` without its trailing whitespace (per std::isspace).
 // An empty or all-whitespace input yields an empty string.
 std::string trim_trailing_whitespace(const std::string & str) {
    size_t keep = str.length();  // number of leading characters to keep
    while (keep > 0 && std::isspace(static_cast<unsigned char>(str[keep - 1]))) {
        --keep;
    }
    return str.substr(0, keep);
 }
 // Returns a copy of `str` with every trailing '\n' removed; other whitespace is kept.
 std::string trim_trailing_newlines(const std::string & str) {
    const size_t last_kept = str.find_last_not_of('\n');
    if (last_kept == std::string::npos) {
        // empty input, or nothing but newlines
        return "";
    }
    return str.substr(0, last_kept + 1);
 }
 // Length of the longest common prefix of `left` and `right`, in characters.
 static size_t common_prefix_len(const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());
    for (size_t i = 0; i < limit; ++i) {
        if (left[i] != right[i]) {
            return i;
        }
    }
    return limit;
 }
 // Length of the longest common suffix of `left` and `right`, in characters.
 static size_t common_suffix_len(const std::string & left, const std::string & right) {
    const size_t limit = std::min(left.length(), right.length());
    size_t       count = 0;
    auto         l_it  = left.rbegin();
    auto         r_it  = right.rbegin();
    for (; count < limit && *l_it == *r_it; ++l_it, ++r_it) {
        ++count;
    }
    return count;
 }
 // Splits `left` and `right` into a shared prefix, a shared suffix and the two differing
 // middles (result.left / result.right).
 //
 // Both strings are first cut into TEXT/MARKER segments via segmentize_markers, and whole
 // segments are matched from both ends so that prefix and suffix end on segment boundaries.
 // Only if the segments at the remaining boundary are TEXT on both sides is a
 // character-level common prefix/suffix taken from the leftover text.
 // If one input yields no segments, the other is returned whole as its own side of the diff.
 // If the inputs turn out identical, the whole of `left` is reported as prefix.
 diff_split calculate_diff_split(const std::string & left, const std::string & right) {
    diff_split result;
    auto left_seg = segmentize_markers(left);
    auto right_seg = segmentize_markers(right);
    if (left_seg.empty()) {
        result.right = right;
        return result;
    }
    if (right_seg.empty()) {
        result.left = left;
        return result;
    }
    // *_start / *_end are inclusive iterators to the first and last unmatched segment
    auto left_start = left_seg.begin();
    auto left_end = --left_seg.end();
    auto right_start = right_seg.begin();
    auto right_end = --right_seg.end();
    // Keep going only while both sides still have more than one unmatched segment
    auto test = [&] () {
        return left_start != left_end && right_start != right_end;
    };
    // Set when the last remaining segment of a side has been absorbed into prefix/suffix
    bool left_fully_consumed = false;
    bool right_fully_consumed = false;
    while (test()) {
        bool advanced = false;
        if (*left_start == *right_start) {
            result.prefix.append(left_start->value);
            left_start++;
            right_start++;
            advanced = true;
        }
        if (*left_end == *right_end) {
            result.suffix = left_end->value + result.suffix;
            // The start iterators may have just caught up with the end iterators; in that
            // case the side is exhausted and the end iterator must not move before start
            if (left_start != left_end) {
                left_end--;
            } else {
                left_fully_consumed = true;
            }
            if (right_start != right_end) {
                right_end--;
            } else {
                right_fully_consumed = true;
            }
            advanced = true;
        }
        if (!advanced) {
            break;
        }
    }
    // One side is down to a single segment: try to match it against either end of the other side.
    // NOTE(review): these branches do not check left_fully_consumed / right_fully_consumed, so a
    // segment already absorbed into the suffix inside the loop can be compared (and counted)
    // again, e.g. left "X<e>" vs right "X<e>Y<e>" - confirm this is acceptable for callers.
    if (left_start == left_end && right_start != right_end) {
        if (*left_start == *right_end) {
            result.suffix = right_end->value + result.suffix;
            right_end--;
            left_fully_consumed = true;
        } else if (*left_start == *right_start) {
            result.prefix.append(right_start->value);
            right_start++;
            left_fully_consumed = true;
        }
    } else if (right_start == right_end && left_start != left_end) {
        if (*left_end == *right_start) {
            result.suffix = left_end->value + result.suffix;
            left_end--;
            right_fully_consumed = true;
        } else if (*left_start == *right_start) {
            result.prefix.append(left_start->value);
            left_start++;
            right_fully_consumed = true;
        }
    } else if (left_start == left_end && right_start == right_end && *left_start == *right_start && left_start->type == segment_type::MARKER) {
        // Both sides end on the same single marker: it belongs to the prefix
        result.prefix.append(right_start->value);
        left_fully_consumed = true;
        right_fully_consumed = true;
    }
    auto eat_segment = [](std::string str, const segment & seg) -> std::string { return std::move(str) + seg.value; };
    // Character-level matching is only allowed where both boundary segments are TEXT
    bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
    bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;
    // Concatenate the unmatched segments; the end iterator is inclusive unless the side was fully consumed
    std::string remainder_left = std::accumulate(left_start, left_fully_consumed ? left_end : ++left_end, std::string(), eat_segment);
    std::string remainder_right = std::accumulate(right_start, right_fully_consumed ? right_end : ++right_end, std::string(), eat_segment);
    size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
    // avoid overlaps between prefix and suffix
    size_t prefix_len = can_have_text_prefix ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
        remainder_right.substr(0, remainder_right.size() - suffix_len)) : 0;
    result.prefix.append(remainder_left.substr(0, prefix_len));
    result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
    result.left = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
    result.right = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);
    if (result.left == "" && result.right == "") {
        // degenerate case, no diff
        result.prefix = left;
        result.suffix = "";
        // pick prefix = all as representation
    }
    return result;
 }
 // Returns the part of `full` that precedes the first occurrence of the longest common
 // prefix of `left` and `right`. Yields an empty string when `left` and `right` share no
 // prefix, or when that shared prefix does not occur in `full`.
 std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
    // Measure the shared prefix of left/right
    const size_t limit  = std::min(left.length(), right.length());
    size_t       shared = 0;
    for (; shared < limit; ++shared) {
        if (left[shared] != right[shared]) {
            break;
        }
    }
    if (shared == 0) {
        return "";
    }
    // Locate that prefix inside `full` and keep everything in front of it
    const size_t hit = full.find(left.substr(0, shared));
    if (hit == std::string::npos) {
        return "";
    }
    return full.substr(0, hit);
 }
 // Returns the part of `full` that follows the last occurrence of the longest common
 // suffix of `left` and `right`. Yields an empty string when `left` and `right` share no
 // suffix, or when that shared suffix does not occur in `full`.
 std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
    // Measure the shared suffix of left/right by walking both strings backwards
    const size_t limit  = std::min(left.length(), right.length());
    size_t       shared = 0;
    for (; shared < limit; ++shared) {
        const char l = left[left.length() - 1 - shared];
        const char r = right[right.length() - 1 - shared];
        if (l != r) {
            break;
        }
    }
    if (shared == 0) {
        return "";
    }
    // Locate the last occurrence of that suffix in `full` and keep everything behind it
    const size_t hit = full.rfind(left.substr(left.length() - shared));
    if (hit == std::string::npos) {
        return "";
    }
    return full.substr(hit + shared);
 }
 // TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
 // not too worried about that because it hasn't turned out as a problem anywhere, but noting here in case it will
 // Might have to put some restrictions on tag contents as well (like "no { }")
 std::vector<segment> segmentize_markers(const std::string & text) {
    std::vector<segment> retval;
    bool in_marker = false;
    char marker_opener = '\0';
    auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
    auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
    size_t last_border = 0;
    for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
        if (!in_marker && is_marker_opener(text[cur_pos])) {
            if (last_border < cur_pos) {
                retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
            }
            last_border = cur_pos;
            in_marker = true;
            marker_opener = text[cur_pos];
        } else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
            // no need to check because last_border will always be smaller
                retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
            last_border = cur_pos + 1;
            in_marker = false;
            marker_opener = '\0';
        }
    }
    if (last_border < text.length()) {
            retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
    }
    return retval;
 }
 std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
    std::vector<segment> result;
    for (const auto & seg : segments) {
        if (!trim_whitespace(seg.value).empty()) {
            result.push_back(seg);
        }
    }
    return result;
 }
 namespace autoparser {
 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
    templates_params tmpl_params;
    tmpl_params.messages              = params.messages;
    tmpl_params.tools                 = params.tools;
    tmpl_params.add_generation_prompt = params.add_generation_prompt;
    tmpl_params.enable_thinking       = params.enable_thinking;
    if (params.extra_context) {
        tmpl_params.extra_context = *params.extra_context;
    }
    tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
    try {
        return common_chat_template_direct_apply(tmpl, tmpl_params);
    } catch (const std::exception & e) {
        LOG_DBG("Template application failed: %s\n", e.what());
        return "";
    }
 }
 std::optional<compare_variants_result> compare_variants(
    const common_chat_template &                   tmpl,
    const template_params &                        params_A,
    const std::function<void(template_params &)> & params_modifier) {
    // Create variant B by copying A
    template_params params_B = params_A;
    // Apply modifier to create variant B
    if (params_modifier) {
        params_modifier(params_B);
    }
    // Apply template to both variants
    std::string output_A = apply_template(tmpl, params_A);
    std::string output_B = apply_template(tmpl, params_B);
    // Check for template application failures
    if (output_A.empty() || output_B.empty()) {
        return std::nullopt;
    }
    // Calculate diff and return result with both outputs
    compare_variants_result result;
    result.diff     = calculate_diff_split(output_A, output_B);
    result.output_A = output_A;
    result.output_B = output_B;
    return result;
 }
 }  // namespace autoparser
--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@ -0,0 +1,73 @@
 #pragma once
 #include "chat-auto-parser.h"
 #include <functional>
 #include <optional>
 #include <string>
 std::string trim_whitespace(const std::string & str);
 std::string trim_leading_whitespace(const std::string & str);
 std::string trim_trailing_whitespace(const std::string & str);
 std::string trim_trailing_newlines(const std::string & str);
 // calculate a diff split (longest common prefix, longest common suffix excluding prefix,
 // mismatched part on the left, mismatched part on the right) between two strings
 // account for markers - align prefix and suffix endings so that they end on markers
 // * eg.:
 // calculate_diff_split("<html><body><div></div></body></html>", "<html><body><p>Something</p></body></html>") ->
 //  { "prefix": "<html><body>" (not: "<html><body><"), "suffix": "</body></html>", "left": "<div></div>", "right": "<p>Something</p>" }
 // calculate_diff_split("<html><body>Something</body></html>", "<html><body></body></html>") ->
 //  { "prefix": "<html><body>", "suffix": "</body></html>", "left": "Something", "right": "" }
 diff_split calculate_diff_split(const std::string & left, const std::string & right);
 // Returns the prefix of `full` up until the first occurrence of the common prefix of `left` and `right`
 // Returns empty string if there's no common prefix
 // * eg.:
 // until_common_prefix("really want a FUNCTION call", "FUNCTION alpha", "FUNCTION beta") -> "really want a "
 // until_common_prefix("<tool_call>", "<something>", "<something_else>") -> ""
 // until_common_prefix("some text", "1234", "abcd") -> ""
 // until_common_prefix("one arg two args three args four", "argument alpha", "argument beta") -> "one "
 std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right);
 // Returns the suffix of `full` after the last occurrence of the common suffix of `left` and `right`
 // Returns empty string if there's no common suffix
 // Mirror function of `until_common_prefix`
 // * eg.:
 // after_common_suffix("really want a FUNCTION call", "first FUNCTION", "second FUNCTION") -> " call"
 // after_common_suffix("one arg two-args three args four", "alpha-args", "beta-args") -> " three args four"
 std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right);
 // Segmentize text into markers and non-marker fragments
 // * eg.:
 // segmentize_markers("<html><head><title>The site title</title><body><div>Here's some <b>content</b></div></body></html>") ->
 //  [ (MARKER, "<html>"), (MARKER, "<head>"), (MARKER, "<title>"), (TEXT, "The site title"), (MARKER, "</title>"),
 //    (MARKER, "<body>"), (MARKER, "<div>"), (TEXT, "Here's some "), (MARKER, "<b>"), (TEXT, "content"), (MARKER, "</b>"),
 //    (MARKER, "</div>"), (MARKER, "</body>"), (MARKER, "</html>")
 //  ]
 // segmentize_markers("<|tool_call|>[args]{ are here }[/args]<|tool_call_end|>") ->
 //  [ (MARKER, "<|tool_call|>"), (MARKER, "[args]"), (TEXT, "{ are here }"), (MARKER, "[/args]"), (MARKER, "<|tool_call_end|>") ]
 std::vector<segment> segmentize_markers(const std::string & text);
 // Prune whitespace-only segments from a vector of segments
 // * eg.:
 // segmentize_markers("<tool_call>\n<function=foo>\n<arg=bar>\n   \n</arg>\n</function>\n</tool_call>") ->
 //  X = [ (MARKER, "<tool_call>"), (TEXT, "\n"), (MARKER, "<function=foo>"), (TEXT, "\n"), (MARKER, "<arg=bar>"), (TEXT, "\n   \n"),
 //        (MARKER, "</arg>"), (TEXT, "\n"), (MARKER, "</function>"), (TEXT, "\n"), (MARKER, "</tool_call>") ]
 // prune_whitespace_segments(X) -> [ (MARKER, "<tool_call>"), (MARKER, "<function=foo>"), (MARKER, "<arg=bar>"), (MARKER, "</arg>"),
 //                                   (MARKER, "</function>"), (MARKER, "</tool_call>") ]
 std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments);
 namespace autoparser {
 // Apply a template with the given parameters, returning the rendered string (empty on failure)
 std::string apply_template(const common_chat_template & tmpl, const template_params & params);
 // Factorized differential comparison function
 // Takes base params and a single modifier lambda to create variant B
 // Returns compare_variants_result containing diff and both outputs, or std::nullopt on failure
 std::optional<compare_variants_result> compare_variants(
    const common_chat_template &                   tmpl,
    const template_params &                        params_A,
    const std::function<void(template_params &)> & params_modifier);
 }  // namespace autoparser
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@ -0,0 +1,433 @@
 #pragma once
 #include "chat.h"
 #include "common.h"
 #include "jinja/caps.h"
 #include "peg-parser.h"
 #include <chrono>
 #include <optional>
 #include <string>
 #include <utility>
 #include <vector>
 using json = nlohmann::ordered_json;
 class common_chat_peg_builder;
 // ============================================================================
 // Parameters for template application (low-level, used by diff analysis)
 // ============================================================================
 // Low-level inputs for a single template rendering.
 // autoparser::apply_template copies these fields into a `templates_params` object before rendering.
 struct template_params {
    json                messages;                               // passed through to the renderer unchanged
    json                tools;                                  // passed through to the renderer unchanged
    bool                add_generation_prompt = false;
    bool                enable_thinking       = true;           // also exported as extra_context["enable_thinking"] by apply_template
    std::optional<json> extra_context         = std::nullopt;   // when set, used as the initial extra context of the rendering
 };
 struct diff_split {
    std::string prefix;
    std::string suffix;
    std::string left;
    std::string right;
    bool operator==(struct diff_split & other) const {
        return prefix == other.prefix && suffix == other.suffix && left == other.left && right == other.right;
    }
 };
 // Result of compare_variants containing diff and original outputs
 struct compare_variants_result {
    diff_split  diff;      // calculate_diff_split(output_A, output_B)
    std::string output_A;  // rendering of the unmodified (base) parameters
    std::string output_B;  // rendering of the parameters after the modifier was applied
 };
 namespace autoparser {
 // ============================================================================
 // High-level params for parser generation
 // ============================================================================
 // Full set of inputs used when rendering a template / generating a parser.
 // Not to be confused with the low-level `template_params` (no 's'): apply_template fills
 // messages, tools, add_generation_prompt, enable_thinking and extra_context of this
 // struct from a `template_params` and leaves every other field at its default.
 struct templates_params {
    json                                  messages;
    json                                  tools;
    common_chat_tool_choice               tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
    json                                  json_schema;
    bool                                  parallel_tool_calls = true;
    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
    bool                                  stream              = true;
    std::string                           grammar;
    bool                                  add_generation_prompt = false;
    bool                                  enable_thinking       = true;
    // Captured when the object is constructed
    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
    json                                  extra_context;
    bool                                  add_bos       = false;
    bool                                  add_eos       = false;
    bool                                  is_inference  = true;
    bool                                  add_inference = false;
    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
 };
 // ============================================================================
 // Analysis Result Enums
 // ============================================================================
 // Reasoning handling mode (derived from R1-R3 comparisons)
 enum class reasoning_mode {
    // No reasoning markers detected
    NONE,
    // Standard tag-based: <think>...</think>
    TAG_BASED,
    // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
    DELIMITER,
    // Template ends with open reasoning tag (empty start, non-empty end)
    FORCED_OPEN,
    // Template ends with open reasoning tag on enabled thinking but
    // with both opened and closed tag for disabled thinking
    FORCED_CLOSED,
    // Only reason on tool calls, not on normal content
    TOOLS_ONLY
 };
 // Stream the enumerator name of `mode`; values outside the enumeration print "UNKNOWN"
 inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode) {
    const char * label = "UNKNOWN";
    switch (mode) {
        case reasoning_mode::NONE:          label = "NONE";          break;
        case reasoning_mode::TAG_BASED:     label = "TAG_BASED";     break;
        case reasoning_mode::DELIMITER:     label = "DELIMITER";     break;
        case reasoning_mode::FORCED_OPEN:   label = "FORCED_OPEN";   break;
        case reasoning_mode::FORCED_CLOSED: label = "FORCED_CLOSED"; break;
        case reasoning_mode::TOOLS_ONLY:    label = "TOOLS_ONLY";    break;
    }
    return os << label;
 }
 // Content wrapping mode (derived from C1 comparison)
 enum class content_mode {
    // No content markers
    PLAIN,
    // Content always wrapped with markers
    ALWAYS_WRAPPED,
    // Content wrapped only when reasoning present
    WRAPPED_WITH_REASONING,
 };
 // Stream the enumerator name of `mode`; values outside the enumeration print "UNKNOWN"
 inline std::ostream & operator<<(std::ostream & os, const content_mode & mode) {
    const char * label = "UNKNOWN";
    switch (mode) {
        case content_mode::PLAIN:                  label = "PLAIN";                  break;
        case content_mode::ALWAYS_WRAPPED:         label = "ALWAYS_WRAPPED";         break;
        case content_mode::WRAPPED_WITH_REASONING: label = "WRAPPED_WITH_REASONING"; break;
    }
    return os << label;
 }
 // Call ID position in tool calls (for non-JSON formats)
 enum class call_id_position {
    // No call ID support detected
    NONE,
    // Call ID before function name: [CALL_ID]id[FUNC]name{args}
    PRE_FUNC_NAME,
    // Call ID between function and args: [FUNC]name[CALL_ID]id{args}
    BETWEEN_FUNC_AND_ARGS,
    // Call ID after arguments: [FUNC]name{args}[CALL_ID]id
    POST_ARGS,
 };
 // Stream the enumerator name of `pos`; values outside the enumeration print "UNKNOWN"
 inline std::ostream & operator<<(std::ostream & os, const call_id_position & pos) {
    const char * label = "UNKNOWN";
    switch (pos) {
        case call_id_position::NONE:                  label = "NONE";                  break;
        case call_id_position::PRE_FUNC_NAME:         label = "PRE_FUNC_NAME";         break;
        case call_id_position::BETWEEN_FUNC_AND_ARGS: label = "BETWEEN_FUNC_AND_ARGS"; break;
        case call_id_position::POST_ARGS:             label = "POST_ARGS";             break;
    }
    return os << label;
 }
 // Tool call format classification (derived from T1-T5, A1-A3 comparisons)
 enum class tool_format {
    // No tool support detected
    NONE,
    // Pure JSON: {"name": "X", "arguments": {...}}
    JSON_NATIVE,
    // Tag-based with JSON args: <function=X>{...}</function>
    TAG_WITH_JSON,
    // Tag-based with tagged args: <param=key>value</param>
    TAG_WITH_TAGGED,
 };
 // Stream the enumerator name of `format`; values outside the enumeration print "UNKNOWN"
 inline std::ostream & operator<<(std::ostream & os, const tool_format & format) {
    const char * label = "UNKNOWN";
    switch (format) {
        case tool_format::NONE:            label = "NONE";            break;
        case tool_format::JSON_NATIVE:     label = "JSON_NATIVE";     break;
        case tool_format::TAG_WITH_JSON:   label = "TAG_WITH_JSON";   break;
        case tool_format::TAG_WITH_TAGGED: label = "TAG_WITH_TAGGED"; break;
    }
    return os << label;
 }
 // ============================================================================
 // Sub-structs for tool analysis
 // ============================================================================
 // Overall shape of the tool-call section detected for a template:
 // the format class, the section / per-call markers and JSON-specific details.
 struct tool_format_analysis {
    tool_format mode = tool_format::NONE;
    std::string section_start;   // e.g., "<tool_call>", "[TOOL_CALLS]", ""
    std::string section_end;     // e.g., "</tool_call>", ""
    std::string per_call_start;  // e.g., "<|tool_call_begin|>", "" (for multi-call templates)
    std::string per_call_end;    // e.g., "<|tool_call_end|>", ""
    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
    bool uses_python_dicts = false;     // Tool call args use Python dict format (single-quoted strings)
    // NOTE(review): the *_field members look like the JSON key names used inside a tool-call
    // object (defaults: "function", "name", "arguments"); id_field / gen_id_field default to
    // empty - confirm their exact meaning against the analyzer implementation.
    std::string              function_field = "function";
    std::string              name_field     = "name";
    std::string              args_field     = "arguments";
    std::string              id_field;
    std::string              gen_id_field;
    std::vector<std::string> parameter_order;
 };
 // Markers surrounding the function name of a single tool call
 struct tool_function_analysis {
    std::string name_prefix;  // e.g., "<function=", "\"name\": \"", "functions."
    std::string name_suffix;  // e.g., ">", "\"", ":0"
    std::string close;        // e.g., "</function>", "" (for tag-based)
 };
 // Markers describing how the arguments of a tool call are laid out:
 // an optional wrapper around all arguments plus per-argument name / value markers.
 struct tool_arguments_analysis {
    std::string start;          // e.g., "<|tool_call_argument_begin|>", "<args>"
    std::string end;            // e.g., "<|tool_call_argument_end|>", "</args>"
    std::string name_prefix;   // e.g., "<param=", "<arg_key>", "\""
    std::string name_suffix;   // e.g., ">", "</arg_key>", "\":"
    std::string value_prefix;  // e.g., "", "<arg_value>", ""
    std::string value_suffix;  // e.g., "</param>", "</arg_value>", ""
    std::string separator;     // e.g., "", "\n", ","
 };
 // Where the call ID sits inside a tool call (if the template has one) and its markers
 struct tool_id_analysis {
    call_id_position pos = call_id_position::NONE;  // NONE means no call ID support was detected
    std::string prefix;  // e.g., "[CALL_ID]" (marker before call ID value)
    std::string suffix;  // e.g., "" (marker after call ID value, before next section)
 };
 // ============================================================================
 // Parser build context (shared interface for build_parser methods)
 // ============================================================================
 struct analyze_content;
 // State handed to every analyzer's build_parser() method.
 // Holds references to the PEG builder and to the generation inputs, so both must
 // outlive the context object.
 struct parser_build_context {
    common_chat_peg_builder & p;                                     // builder used to construct parsers
    const templates_params &          inputs;                        // inputs the parser is generated for
    common_peg_parser                 reasoning_parser;
    bool                              extracting_reasoning = false;
    const analyze_content *           content              = nullptr;  // non-owning; may be null
    // Defined out of line (not visible in this header)
    parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
 };
 // ============================================================================
 // Base class for analyzers with parser building
 // ============================================================================
 // Abstract base of all analyzers: each analyzer can build a PEG parser for the
 // part of the output it has analyzed.
 struct analyze_base {
    virtual ~analyze_base() = default;
    // Build the parser for this analyzer using the shared build context
    virtual common_peg_parser build_parser(parser_build_context & ctx) const = 0;
  protected:
    // Non-owning pointer to the analyzed template; null for default-constructed analyzers.
    // The template passed to the constructor must outlive the analyzer.
    const common_chat_template * tmpl = nullptr;
    analyze_base() = default;
    explicit analyze_base(const common_chat_template & tmpl) : tmpl(&tmpl) {}
 };
 // ============================================================================
 // Reasoning analyzer
 // ============================================================================
 // Detects how a template marks reasoning ("thinking") output: the mode and the
 // start / end markers. A default-constructed object describes "no reasoning".
 struct analyze_reasoning : analyze_base {
    reasoning_mode mode = reasoning_mode::NONE;
    std::string start;  // e.g., "<think>", "[THINK]", "<|START_THINKING|>", ""
    std::string end;    // e.g., "</think>", "[BEGIN FINAL RESPONSE]", "<|END_THINKING|>"
    analyze_reasoning() = default;
    // NOTE(review): presumably runs the private comparison steps below on `tmpl` - confirm in the .cpp
    analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
    common_peg_parser build_parser(parser_build_context & ctx) const override;
  private:
    // Look for reasoning markers in rendered content
    void compare_reasoning_presence();
    // Compare generation prompt with enable_thinking=true vs false
    void compare_thinking_enabled();
    // Check if reasoning is always possible or only in tool calls
    void compare_reasoning_scope();
 };
 // ============================================================================
 // Content analyzer
 // ============================================================================
 // Detects whether (and how) a template wraps regular response content in markers.
 // A default-constructed object describes plain, unwrapped content.
 struct analyze_content : analyze_base {
    content_mode mode = content_mode::PLAIN;
    std::string start;  // e.g., "<response>", ">>>all\n", ""
    std::string end;    // e.g., "</response>", ""
    // NOTE(review): name suggests the template needs a non-null "content" field - confirm in the .cpp
    bool requires_nonnull_content = false;
    analyze_content() = default;
    analyze_content(const common_chat_template & tmpl, const analyze_reasoning & reasoning);
    common_peg_parser build_parser(parser_build_context & ctx) const override;
    bool is_always_wrapped() const;
    common_peg_parser build_optional_wrapped(parser_build_context & ctx) const;
 };
 // ============================================================================
 // Tool analyzer
 // ============================================================================
 // Detects the tool-call syntax of a template. The results are grouped into four
 // sub-structs (overall format, function name markers, argument markers, call ID markers).
 // A default-constructed object describes "no tool support" (format.mode == tool_format::NONE).
 struct analyze_tools : analyze_base {
    tool_format_analysis    format;
    tool_function_analysis  function;
    tool_arguments_analysis arguments;
    tool_id_analysis        call_id;
    analyze_tools() = default;
    // NOTE(review): presumably drives the private analysis steps below - confirm in the .cpp
    analyze_tools(const common_chat_template & tmpl,
                  const jinja::caps &          caps,
                  const analyze_reasoning &    reasoning);
    common_peg_parser build_parser(parser_build_context & ctx) const override;
  private:
    // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
    void analyze_tool_calls(const analyze_reasoning & reasoning);
    // Analyze format based on position of function and argument name in needle
    void analyze_tool_call_format(const std::string &       haystack,
                                  const std::string &       fun_name_needle,
                                  const std::string &       arg_name_needle,
                                  const analyze_reasoning & reasoning);
    // Analyze specifics of JSON native format (entire tool call is a JSON object)
    void analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                              const std::string & fun_name_needle,
                                              const std::string & arg_name_needle);
    // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
    void analyze_tool_call_format_non_json(const std::string & clean_haystack,
                                           const std::string & fun_name_needle);
    // Check for and extract specific per-call markers for non-native-JSON templates with parallel call support
    void check_per_call_markers();
    // Extract function name markers
    void extract_function_markers();
    // Delegates to separate functions for: separator analysis, argument name analysis, argument value analysis
    void analyze_arguments();
    // Extract argument name markers
    void extract_argument_name_markers();
    // Extract argument value markers
    void extract_argument_value_markers();
    // Extract argument separator, if specified (eg. <arg=foo>...</arg><sep><arg=bar>...</arg>)
    void extract_argument_separator();
    // Extract argument wrapper markers, if present (eg. '<args><arg=foo>...</arg><arg=bar>...</arg></args>')
    void extract_args_markers();
    // Extract call ID markers, if present
    void extract_call_id_markers();
    // Per-format tool parser builders (one per non-NONE tool_format value)
    common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
 };
 // ============================================================================
 // Main autoparser class
 // ============================================================================
 // Aggregates the template capabilities and the three analyzers (reasoning, content, tools)
 // and builds a PEG parser from them.
 // Note: this struct shares its name with the enclosing namespace.
 struct autoparser {
    jinja::caps          jinja_caps;
    analyze_reasoning    reasoning;
    analyze_content      content;
    analyze_tools        tools;
    // NOTE(review): presumably set by analyze_template() once it has run - confirm in the .cpp
    bool                 analysis_complete = false;
    // Preserved tokens for tokenizer (union of all non-empty markers)
    std::vector<std::string> preserved_tokens;
    autoparser() = default;
    // Run full differential analysis on a template
    void analyze_template(const common_chat_template & tmpl);
    // Build the PEG parser for this template
    common_peg_arena build_parser(const templates_params & inputs) const;
  private:
    // Collect tokens from entire analysis to preserve
    void collect_preserved_tokens();
 };
 // ============================================================================
 // Parser generator
 // ============================================================================
 // Static entry points that produce `common_chat_params` for a template.
 class peg_generator {
  public:
    // Overload without a prepared analysis
    // NOTE(review): presumably analyzes `tmpl` itself and forwards to the overload below - confirm in the .cpp
    static common_chat_params generate_parser(const common_chat_template &    tmpl,
                                              const struct templates_params & inputs);
    // Overload taking an existing `autoparser` object for the template
    static common_chat_params generate_parser(const common_chat_template &    tmpl,
                                              const struct templates_params & inputs,
                                              const autoparser &              autoparser);
 };
 }  // namespace autoparser
 // Kind of a segment: a plain text fragment or a marker
 enum segment_type { TEXT, MARKER };
 // Stream the enumerator name of `type`; any other value prints "UNKNOWN"
 inline std::ostream & operator<<(std::ostream & os, const segment_type & type) {
    const char * label = "UNKNOWN";
    switch (type) {
        case segment_type::TEXT:   label = "TEXT";   break;
        case segment_type::MARKER: label = "MARKER"; break;
    }
    return os << label;
 }
 // A typed fragment of text: either a marker or the plain text between markers
 struct segment {
    segment_type type;
    std::string  value;
    // `value` is taken by value and moved into the member
    segment(segment_type type, std::string value) : type(type), value(std::move(value)) {}
    // Two segments are equal when both their type and their text match
    bool operator==(const segment & other) const {
        if (type != other.type) {
            return false;
        }
        return value == other.value;
    }
    bool operator!=(const segment & other) const {
        return !operator==(other);
    }
 };
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@ -1,879 +0,0 @@
 #include "chat.h"
 #include "chat-parser.h"
 #include "common.h"
 #include "json-partial.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "regex-partial.h"
 using json = nlohmann::ordered_json;
 // Exception type for syntax errors in XML-style tool calls.
 // Behaves like std::runtime_error carrying the given message.
 class xml_toolcall_syntax_exception : public std::runtime_error {
  public:
    xml_toolcall_syntax_exception(const std::string & message)
        : std::runtime_error(message) {
    }
 };
 // Sort `vec` in ascending order and drop duplicate elements, in place
 template<typename T>
 inline void sort_uniq(std::vector<T> &vec) {
    std::sort(vec.begin(), vec.end());
    // std::unique compacts the distinct elements to the front; erase the leftover tail
    const auto first_duplicate = std::unique(vec.begin(), vec.end());
    vec.erase(first_duplicate, vec.end());
 }
 // True when every character of `str` is whitespace according to std::isspace
 // (an empty `str` therefore yields true)
 template<typename T>
 inline bool all_space(const T &str) {
    for (auto it = str.begin(); it != str.end(); ++it) {
        // Convert to unsigned char first: std::isspace must not receive negative values
        const unsigned char ch = *it;
        if (!std::isspace(ch)) {
            return false;
        }
    }
    return true;
 }
 // Compute how many leading bytes of `s` can be kept so that the result does not end
 // in the middle of a multi-byte UTF-8 sequence.
 // At most the last four bytes are inspected, scanning backwards:
 //  * an ASCII byte means nothing is cut off -> keep everything
 //  * a lead byte decides: keep everything if the sequence it starts is complete,
 //    otherwise cut right before that lead byte (also for invalid lead bytes)
 //  * continuation bytes are skipped
 // If only continuation bytes were seen, up to three trailing bytes are dropped.
 static size_t utf8_truncate_safe(const std::string_view s) {
    const size_t total = s.size();
    if (total == 0) {
        return 0;
    }
    const size_t max_steps = std::min(total, size_t(4));
    for (size_t step = 1; step <= max_steps; ++step) {
        const size_t        pos  = total - step;
        const unsigned char byte = s[pos];
        if ((byte & 0x80) == 0) {
            // plain ASCII byte
            return total;
        }
        if ((byte & 0xC0) != 0xC0) {
            // continuation byte (10xxxxxx): keep looking for the lead byte
            continue;
        }
        // lead byte: derive the length of the sequence it announces
        size_t expected_len;
        if ((byte & 0xE0) == 0xC0) {
            expected_len = 2;
        } else if ((byte & 0xF0) == 0xE0) {
            expected_len = 3;
        } else if ((byte & 0xF8) == 0xF0) {
            expected_len = 4;
        } else {
            // not a valid lead byte: cut in front of it
            return pos;
        }
        const size_t available = total - pos;
        return available >= expected_len ? total : pos;
    }
    return total - std::min(total, size_t(3));
 }
 // Shrink `s` in place to the length reported by utf8_truncate_safe
 inline void utf8_truncate_safe_resize(std::string &s) {
    const size_t safe_len = utf8_truncate_safe(s);
    s.resize(safe_len);
 }
 // Return the leading part of `s` with the length reported by utf8_truncate_safe (no copy is made)
 inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
    const size_t safe_len = utf8_truncate_safe(s);
    return s.substr(0, safe_len);
 }
 // Search the builder's input for `literal1`, followed by optional whitespace, followed by
 // `literal2`, and report the two as one combined match.
 //  * If `literal1` is empty, this is just builder.try_find_literal(literal2).
 //  * `literal2` may be matched only partially when the input ends early: only as many
 //    characters as remain in the input are compared.
 //  * On success, groups[0] spans from the start of `literal1` to the end of the matched part
 //    of `literal2`, prelude covers the input between the starting position and that match,
 //    and the builder is left positioned right after the match.
 //  * On failure, the builder position is restored and std::nullopt is returned.
 static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
    if (literal1.size() == 0) return builder.try_find_literal(literal2);
    // Remember where the search started so that it can be undone / the prelude recomputed
    const auto saved_pos = builder.pos();
    while (auto res = builder.try_find_literal(literal1)) {
        builder.consume_spaces();
        // Compare at most the number of characters that are still available in the input
        const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
        if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
            // After a retry the prelude reported by try_find_literal no longer starts at
            // saved_pos, so rebuild it to cover everything from the original start
            if (res->prelude.size() != res->groups[0].begin - saved_pos) {
                res->prelude = builder.str({saved_pos, res->groups[0].begin});
            }
            builder.move_to(builder.pos() + match_len);
            res->groups[0].end = builder.pos();
            GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
            return res;
        }
        // literal2 did not follow: retry one character after this occurrence of literal1
        builder.move_to(res->groups[0].begin + 1);
    }
    builder.move_to(saved_pos);
    return std::nullopt;
 }
 /**
 * Make a GBNF expression that accepts any string except those containing any of the forbidden strings.
 */
 std::string make_gbnf_excluding(std::vector<std::string> forbids) {
    // Render one byte so that it can be placed inside a GBNF character class ([...])
    constexpr auto charclass_escape = [](unsigned char c) -> std::string {
        const bool needs_backslash = (c == '\\' || c == ']' || c == '^' || c == '-');
        if (needs_backslash) {
            return std::string("\\") + (char)c;
        }
        if (!isprint(c)) {
            char hex[16];
            snprintf(hex, 15, "\\x%02X", c);
            return std::string(hex);
        }
        return std::string(1, (char)c);
    };
    // Recursively build the expression for forbids[l, r), all of which share their first
    // `depth` characters. The range must be sorted so that equal characters are adjacent.
    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string {
        // Group the range by the character found at position `depth`:
        // (character, [first, last) index range)
        std::vector<std::pair<unsigned char, std::pair<int,int>>> groups;
        for (int pos = l; pos < r;) {
            const std::string & word = forbids[pos];
            if ((int)word.size() == depth) {
                // this string ends here and contributes no further character
                ++pos;
                continue;
            }
            const unsigned char lead = (unsigned char)word[depth];
            int stop = pos;
            while (stop < r && (int)forbids[stop].size() > depth &&
                   (unsigned char)forbids[stop][depth] == lead) {
                ++stop;
            }
            groups.push_back({lead, {pos, stop}});
            pos = stop;
        }
        if (groups.empty()) {
            return "";
        }
        // First alternative: any character that does not continue a forbidden string
        std::string combined = "( [^";
        for (const auto & group : groups) {
            combined += charclass_escape(group.first);
        }
        combined += "]";
        // Further alternatives: a continuing character followed by the expression of its sub-range
        for (const auto & group : groups) {
            const std::string tail = self(self, forbids, group.second.first, group.second.second, depth+1);
            if (tail.empty()) {
                continue;
            }
            const unsigned char byte = group.first;
            std::string literal = "\"";
            if (byte == '\\') {
                literal += "\\\\";
            } else if (byte == '"') {
                literal += "\\\"";
            } else if (isprint(byte)) {
                literal.push_back(byte);
            } else {
                char hex[16];
                snprintf(hex, 15, "\\x%02X", byte);
                literal += hex;
            }
            literal += "\"";
            combined += " | " + literal + " " + tail;
        }
        combined += " )";
        return combined;
    };
    // Nothing forbidden: accept anything
    if (forbids.empty()) {
        return "( . )*";
    }
    sort(forbids.begin(), forbids.end());
    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
    if (expr.empty()) {
        // Fallback: a negated class of the first characters of all non-empty forbidden strings
        std::string first_chars;
        for (const auto & word : forbids) {
            if (!word.empty()) {
                first_chars += charclass_escape((unsigned char)word[0]);
            }
        }
        expr = std::string("( [^") + first_chars + "] )";
    }
    // Repeat the expression; with several forbidden strings it gets an extra pair of parentheses
    const bool single = forbids.size() == 1;
    return single ? expr + "*" : std::string("( ") + expr + " )*";
 }
 /**
 * Build grammar for xml-style tool call
 * form.scope_start and form.scope_end can be empty.
 * Requires data.format for model-specific hacks.
 */
 void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
    GGML_ASSERT(!form.tool_start.empty());
    GGML_ASSERT(!form.tool_sep.empty());
    GGML_ASSERT(!form.key_start.empty());
    GGML_ASSERT(!form.val_end.empty());
    GGML_ASSERT(!form.tool_end.empty());
    std::string key_val_sep = form.key_val_sep;
    if (form.key_val_sep2) {
        key_val_sep += "\n";
        key_val_sep += *form.key_val_sep2;
    }
    GGML_ASSERT(!key_val_sep.empty());
    if (tools.is_array() && !tools.empty()) {
        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
            auto string_arg_val = form.last_val_end ?
                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
            std::vector<std::string> tool_rules;
            for (const auto & tool : tools) {
                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
                    continue;
                }
                const auto & function = tool.at("function");
                if (!function.contains("name") || !function.at("name").is_string()) {
                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
                    continue;
                }
                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
                    continue;
                }
                std::string name = function.at("name");
                auto parameters = function.at("parameters");
                builder.resolve_refs(parameters);
                struct parameter_rule {
                    std::string symbol_name;
                    bool is_required;
                };
                std::vector<parameter_rule> arg_rules;
                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
                    continue;
                } else {
                    std::vector<std::string> requiredParameters;
                    if (parameters.contains("required")) {
                        try { parameters.at("required").get_to(requiredParameters); }
                        catch (const std::runtime_error&) {
                            LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
                        }
                    }
                    sort_uniq(requiredParameters);
                    for (const auto & [key, value] : parameters.at("properties").items()) {
                        std::string quoted_key = key;
                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
                            quoted_key = gbnf_format_literal(key);
                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
                        }
                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
                            gbnf_format_literal(form.key_start) + " " +
                            gbnf_format_literal(quoted_key) + " " +
                            gbnf_format_literal(key_val_sep) + " " +
                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
                                    (form.raw_argval ?
                                            string_arg_val :
                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
                                    ) :
                                    builder.add_schema(name + "-arg-" + key, value)
                            )
                        ), required});
                    }
                }
                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
                decltype(next_arg_with_sep) next_arg = "\"\"";
                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
                    );
                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
                    );
                }
                std::string quoted_name = name;
                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
                    quoted_name = gbnf_format_literal(name);
                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
                }
                quoted_name = gbnf_format_literal(quoted_name);
                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
                }
                tool_rules.push_back(builder.add_rule(name + "-call",
                        gbnf_format_literal(form.tool_start) + " " +
                        quoted_name + " " +
                        gbnf_format_literal(form.tool_sep) + " " +
                        next_arg
                ));
            }
            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
            builder.add_rule("root",
                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
                tool_call_multiple_with_end  + "?" +
                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
            );
        });
        // grammar trigger for tool call
        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
    }
 }
 /**
 * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
 * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
 * Throws common_chat_msg_partial_exception when the input ends in the middle of a tool call; in most of those paths
 * a tool call with truncated argument JSON is added to the builder before throwing.
 * form.scope_start, form.tool_sep and form.scope_end can be empty.
 * form.tool_start, form.key_start, form.key_val_sep, form.val_end and form.tool_end must be non-empty (asserted).
 *
 * Throughout this function a literal match whose matched length is shorter than the literal itself is
 * treated as a partial literal (the input was cut in the middle of the keyword).
 */
 inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
    GGML_ASSERT(!form.tool_start.empty());
    GGML_ASSERT(!form.key_start.empty());
    GGML_ASSERT(!form.key_val_sep.empty());
    GGML_ASSERT(!form.val_end.empty());
    GGML_ASSERT(!form.tool_end.empty());
    // Helper to choose return false or throw error:
    // with recovery == true the position is restored to start_pos and false is returned,
    // otherwise xml_toolcall_syntax_exception is thrown.
    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
        if (recovery) {
            builder.move_to(start_pos);
            return false;
        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
    };
    // Drop substring from needle to end from a JSON.
    // Returns false (json_str untouched) when the needle is missing or when anything other than
    // quotes, '}', ':' or whitespace follows its last occurrence. On success json_str is cut at the
    // needle, also dropping a '"' that directly precedes it.
    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
        auto pos = json_str.rfind(needle);
        if (pos == std::string::npos) {
            return false;
        }
        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
            unsigned char ch = static_cast<unsigned char>(json_str[i]);
            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
                return false;
            }
        }
        if (pos != 0 && json_str[pos - 1] == '"') {
            --pos;
        }
        json_str.resize(pos);
        return true;
    };
    // Helper to generate a partial argument JSON:
    // consumes the rest of the input, lets set_partial_arg(rest, flag) store the flag inside `arguments`,
    // dumps the arguments, cuts the dump at the flag and adds the result as a tool call.
    // Only logs when the cut or add_tool_call fails.
    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
        auto rest = builder.consume_rest();
        // presumably trims an incomplete trailing UTF-8 sequence -- helper is defined outside this view
        utf8_truncate_safe_resize(rest);
        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
        auto tool_str = arguments.dump();
        if (partial_json(tool_str)) {
            if (builder.add_tool_call(function_name, "", tool_str)) {
                return;
            }
        }
        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
    };
    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end).
    // Searches for `end`, and when alt_end is set also for alt_end followed by end_next (and by
    // alt_end_next, if given). The candidate with the shortest prelude wins, `end` winning ties.
    // Returns {size of the literal that was chosen, match}; the builder is left after the chosen match.
    constexpr auto try_find_close = [](
            common_chat_msg_parser & builder,
            const std::string & end,
            const std::optional<std::string> & alt_end,
            const std::string & end_next,
            const std::optional<std::string> & alt_end_next
    ) {
        auto saved_pos = builder.pos();
        auto tc = builder.try_find_literal(end);
        auto val_end_size = end.size();
        if (alt_end) {
            // position reached by the plain `end` search, restored below if `end` stays the winner
            auto pos_1 = builder.pos();
            builder.move_to(saved_pos);
            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
            if (alt_end_next) {
                builder.move_to(saved_pos);
                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
                    tc2 = tc3;
                }
            }
            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
                tc = tc2;
                // keep only the alt_end part of the two-literal match so the following literal is not consumed
                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
                builder.move_to(tc->groups[0].end);
                val_end_size = alt_end->size();
            } else {
                builder.move_to(pos_1);
            }
        }
        return std::make_pair(val_end_size, tc);
    };
    // Helper to find a val_end or last_val_end, returns matched pattern size
    const auto try_find_val_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
    };
    // Helper to find a tool_end or last_tool_end, returns matched pattern size
    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
    };
    // While true, a syntax error rewinds to start_pos and returns false; it is cleared once
    // an argument key or a complete tool call has been parsed.
    bool recovery = true;
    const auto start_pos = builder.pos();
    // Opening scope: skipped when scope_start is empty / whitespace-only, otherwise it must be
    // the next thing in the input, preceded only by whitespace.
    if (!all_space(form.scope_start)) {
        if (auto tc = builder.try_find_literal(form.scope_start)) {
            if (all_space(tc->prelude)) {
                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
            } else {
                builder.move_to(start_pos);
                return false;
            }
        } else return false;
    }
    // One iteration per tool call
    while (auto tc = builder.try_find_literal(form.tool_start)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                    gbnf_format_literal(form.tool_start).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
            );
            // rewind to before the unexpected text and leave it to the scope_end handling below
            builder.move_to(tc->groups[0].begin - tc->prelude.size());
            break;
        }
        // Find tool name: it ends at tool_sep, or at key_start when tool_sep is empty / whitespace-only,
        // or at the tool end when neither is found (call without arguments)
        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
        if (!func_name) {
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }
        if (!func_name) {
            // Partial tool name not supported
            throw common_chat_msg_partial_exception("incomplete tool_call");
        }
        // If the model generate multiple tool call and the first tool call has no argument
        // (the separator found above then belongs to a later call): search again for the tool end
        // from the start of the prelude
        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
            auto [sz, tc] = try_find_tool_end();
            func_name = tc;
        }
        // Parse tool name
        // With a whitespace-only tool_sep the match is left unconsumed so it can be matched again below
        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
        std::string function_name = string_strip(func_name->prelude);
        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
            if (string_starts_with(function_name, "functions.")) {
                static const std::regex re(":\\d+$");
                if (std::regex_search(function_name, re)) {
                    // 10 == length of "functions."; keep what lies between the prefix and the last ':'
                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
                }
            }
        }
        // Argument JSON
        json arguments = json::object();
        // Helper to generate a partial argument JSON
        const auto gen_partial_args = [&](auto set_partial_arg) {
            gen_partial_json(set_partial_arg, arguments, builder, function_name);
        };
        // Parse all arg_key/arg_value pairs
        while (auto tc = builder.try_find_literal(form.key_start)) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
                        gbnf_format_literal(form.key_start).c_str(),
                        gbnf_format_literal(tc->prelude).c_str()
                );
                builder.move_to(tc->groups[0].begin - tc->prelude.size());
                break;
            }
            // key_start was cut off: emit the arguments parsed so far without the closing '}'
            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
                auto tool_call_arg = arguments.dump();
                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
                    tool_call_arg.resize(tool_call_arg.size() - 1);
                }
                builder.add_tool_call(function_name, "", tool_call_arg);
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
            }
            // Parse arg_key
            auto key_res = builder.try_find_literal(form.key_val_sep);
            if (!key_res) {
                // key itself is still incomplete: the rest of the input becomes a partial key
                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
            }
            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
            }
            auto &key = key_res->prelude;
            // A complete key was read: syntax errors from here on no longer rewind silently
            recovery = false;
            // Parse arg_value
            // Optional second half of the key/value separator; only whitespace may precede it
            if (form.key_val_sep2) {
                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
                    if (!all_space(tc->prelude)) {
                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
                                gbnf_format_literal(tc->prelude).c_str(),
                                gbnf_format_literal(form.key_val_sep).c_str(),
                                gbnf_format_literal(*form.key_val_sep2).c_str()
                        );
                        return return_error(builder, start_pos, false);
                    }
                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
                    }
                } else {
                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
                }
            }
            auto val_start = builder.pos();
            // Test if arg_val is a partial JSON
            // (skipped when form.raw_argval is set to true, i.e. values are raw strings only)
            std::optional<common_json> value_json = std::nullopt;
            if (!form.raw_argval || !*form.raw_argval) {
                try { value_json = builder.try_consume_json(); }
                catch (const std::runtime_error&) { builder.move_to(val_start); }
                // TODO: Delete this when json_partial adds top-level support for null/true/false
                // Fallback when nothing was consumed: the remaining input may be a truncated
                // null/true/false or a number that is not terminated yet
                if (builder.pos() == val_start) {
                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
                    builder.consume_spaces();
                    std::string_view sv = utf8_truncate_safe_view(builder.input());
                    sv.remove_prefix(builder.pos());
                    // "a" matches none of the keywords; it is only replaced when fewer than 6 characters remain
                    std::string rest = "a";
                    if (sv.size() < 6) rest = sv;
                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
                        // Dummy value: the rest of the input is consumed here, so the end-of-input
                        // branch below is expected to throw a partial exception before it is stored
                        value_json = {123, {"123", "123"}};
                        builder.consume_rest();
                    } else {
                        builder.move_to(val_start);
                    }
                }
            }
            // If it is a JSON and followed by </arg_value>, parse as json
            // cannot support streaming because it may be a plain text starting with JSON
            if (value_json) {
                auto json_end = builder.pos();
                builder.consume_spaces();
                // Input ends right after the JSON: emit a partial tool call and wait for more tokens
                if (builder.pos() == builder.input().size()) {
                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
                        // JSON-only values: stream the value, cutting the dump at the healing marker
                        // or, without a marker, before the closing '}'
                        arguments[key] = value_json->json;
                        auto json_str = arguments.dump();
                        if (!value_json->healing_marker.json_dump_marker.empty()) {
                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
                        } else {
                            GGML_ASSERT(json_str.back() == '}');
                            json_str.resize(json_str.size() - 1);
                        }
                        builder.add_tool_call(function_name, "", json_str);
                    } else {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                    }
                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
                }
                builder.move_to(json_end);
                auto [val_end_size, tc] = try_find_val_end();
                // Accept the JSON only if it was complete (no healing marker) and directly followed by the value end
                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
                    } else arguments[key] = value_json->json;
                } else builder.move_to(val_start);
            }
            // If not, parse as plain text
            // (the position is back at val_start whenever the JSON attempt was skipped or rejected)
            if (val_start == builder.pos()) {
                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
                    auto &value_str = value_plain->prelude;
                    if (form.trim_raw_argval) value_str = string_strip(value_str);
                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
                        throw common_chat_msg_partial_exception(
                                "Expected " + gbnf_format_literal(form.val_end) +
                                " after " + gbnf_format_literal(form.key_val_sep) +
                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                        );
                    }
                    arguments[key] = value_str;
                } else {
                    // No value end yet: everything up to the end of the input is a partial value
                    if (form.trim_raw_argval) {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
                    } else {
                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
                    }
                    throw common_chat_msg_partial_exception(
                            "Expected " + gbnf_format_literal(form.val_end) +
                            " after " + gbnf_format_literal(form.key_val_sep) +
                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
                    );
                }
            }
        }
        // Consume closing tag
        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
            if (!all_space(tc->prelude)) {
                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                        gbnf_format_literal(form.tool_end).c_str(),
                        gbnf_format_literal(tc->prelude).c_str()
                );
                return return_error(builder, start_pos, recovery);
            }
            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
                // Add the parsed tool call
                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
                }
                recovery = false;
                continue;
            }
        }
        // Closing tag missing or cut off: emit the arguments without the closing '}' as a partial tool call
        auto tool_call_arg = arguments.dump();
        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
            tool_call_arg.resize(tool_call_arg.size() - 1);
        }
        builder.add_tool_call(function_name, "", tool_call_arg);
        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
    }
    // Closing scope: only whitespace may precede scope_end
    if (auto tc = builder.try_find_literal(form.scope_end)) {
        if (!all_space(tc->prelude)) {
            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                    gbnf_format_literal(form.scope_end).c_str(),
                    gbnf_format_literal(tc->prelude).c_str()
            );
            return return_error(builder, start_pos, recovery);
        }
    } else {
        // scope_end not found: fine when the format has none, partial when the input simply ended,
        // a syntax error otherwise
        if (all_space(form.scope_end)) return true;
        builder.consume_spaces();
        if (builder.pos() == builder.input().size())
            throw common_chat_msg_partial_exception("incomplete tool calls");
        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
                gbnf_format_literal(form.scope_end).c_str(),
                gbnf_format_literal(builder.consume_rest()).c_str()
        );
        return return_error(builder, start_pos, recovery);
    }
    return true;
 }
 /**
 * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
 * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
 * form.scope_start, form.tool_sep and form.scope_end can be empty.
 */
 bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
    auto pos = pos_;
    auto tsize = result_.tool_calls.size();
    try { return parse_xml_tool_calls(*this, form); }
    catch (const xml_toolcall_syntax_exception&) {}
    move_to(pos);
    result_.tool_calls.resize(tsize);
    return false;
 }
 /**
 * Parse content uses reasoning and XML-Style tool call
 * Walks the whole remaining input: text before each tool call is split into reasoning
 * (between start_think and end_think) and regular content, then the tool call itself is
 * handed to builder.try_consume_xml_tool_calls(form).
 * When builder.syntax().thinking_forced_open is set, the input is treated as starting inside a think block.
 * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
 */
 inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
    // Remove trailing whitespace in place
    constexpr auto rstrip = [](std::string &s) {
        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
    };
    // Erase substring from l to r, along with additional spaces nearby
    // (r is inclusive). Up to two '\n' are left in place of the erased span unless it starts at
    // index 0. Returns the index at which the erase started.
    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
        // --l on 0 wraps around to a huge value, which ends the loop through the size check
        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
        ++l;
        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
        if (l < r) str[l] = '\n';
        if (l + 1 < r) str[l + 1] = '\n';
        if (l != 0) l += 2;
        str.erase(l, r - l);
        return l;
    };
    // Cut `content` at the earliest position from which its tail equals a prefix of one of the
    // patterns (a whole pattern at the end counts too). Empty patterns are ignored.
    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
        auto best_match = content.size();
        for (auto pattern: list) {
            if (pattern.size() == 0) continue;
            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
                auto match_len = content.size() - match_idx;
                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
                    best_match = match_idx;
                }
            }
        }
        if (content.size() > best_match) {
            content.erase(best_match);
        }
    };
    // trim_suffix against every keyword of the format and both think tags;
    // unset optional keywords are passed as "" and therefore ignored
    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
        return trim_suffix(content, {
            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
            form.scope_end
        });
    };
    // Trim leading spaces without affecting keyword matching
    // (whitespace that could be the beginning of a keyword is handed back to the input)
    static const common_regex spaces_regex("\\s*");
    {
        auto tc = builder.consume_regex(spaces_regex);
        auto spaces = builder.str(tc.groups[0]);
        auto s1 = spaces.size();
        trim_potential_partial_word(spaces);
        auto s2 = spaces.size();
        builder.move_to(builder.pos() - (s1 - s2));
    }
    // Parse content
    // reasoning_unclosed: currently inside a think block whose end_think has not been seen yet
    // unclosed_reasoning_content: reasoning text collected for that block so far
    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
    std::string unclosed_reasoning_content("");
    for (;;) {
        // Next tool call start (scope_start followed by tool_start); `content` is the text in front of
        // it, or the rest of the input when there is no further tool call
        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
        std::string content;
        std::string tool_call_start;
        if (tc) {
            content = std::move(tc->prelude);
            tool_call_start = builder.str(tc->groups[0]);
            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
        } else {
            content = builder.consume_rest();
            utf8_truncate_safe_resize(content);
        }
        // Handle unclosed think block
        if (reasoning_unclosed) {
            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
                // Still inside the think block and more input follows: keep collecting
                unclosed_reasoning_content += content;
                if (!(form.allow_toolcall_in_think && tc)) {
                    // the matched tool call start is plain reasoning text here
                    unclosed_reasoning_content += tool_call_start;
                    continue;
                }
            } else {
                // end_think found, or the input ended: close the think block
                reasoning_unclosed = false;
                std::string reasoning_content;
                if (pos == std::string::npos) {
                    reasoning_content = std::move(content);
                } else {
                    reasoning_content = content.substr(0, pos);
                    content.erase(0, pos + end_think.size());
                }
                if (builder.pos() == builder.input().size() && all_space(content)) {
                    // Reasoning reaches the end of the input: drop a possibly truncated keyword at its end
                    rstrip(reasoning_content);
                    trim_potential_partial_word(reasoning_content);
                    rstrip(reasoning_content);
                    if (reasoning_content.empty()) {
                        rstrip(unclosed_reasoning_content);
                        trim_potential_partial_word(unclosed_reasoning_content);
                        rstrip(unclosed_reasoning_content);
                        if (unclosed_reasoning_content.empty()) continue;
                    }
                }
                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
                    // Reasoning stays inline in the content, wrapped in the think tags again;
                    // end_think is only added when something follows the reasoning
                    builder.add_content(start_think);
                    builder.add_content(unclosed_reasoning_content);
                    builder.add_content(reasoning_content);
                    if (builder.pos() != builder.input().size() || !all_space(content))
                        builder.add_content(end_think);
                } else {
                    builder.add_reasoning_content(unclosed_reasoning_content);
                    builder.add_reasoning_content(reasoning_content);
                }
                unclosed_reasoning_content.clear();
            }
        }
        // Handle multiple think block
        bool toolcall_in_think = false;
        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
                // Complete think block inside the content
                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
                    builder.add_reasoning_content(reasoning_content);
                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
                } else {
                    think_start = think_end + end_think.size() - 1;
                }
            } else {
                // This <tool_call> start is in thinking block: without allow_toolcall_in_think
                // its text is kept as reasoning and the tool call is skipped further below
                if (form.allow_toolcall_in_think) {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size());
                } else {
                    unclosed_reasoning_content = content.substr(think_start + start_think.size()) + tool_call_start;
                }
                reasoning_unclosed = true;
                content.resize(think_start);
                toolcall_in_think = true;
            }
        }
        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
            rstrip(content);
            // Handle unclosed </think> token from content: delete all </think> token
            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
                while (pos != std::string::npos) {
                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
                    pos = content.rfind(end_think, pos);
                }
            }
            // Strip if needed
            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
                content = string_strip(content);
            }
        }
        // remove potential partial suffix
        // (only for partial input that has been read to its end)
        if (builder.pos() == builder.input().size() && builder.is_partial()) {
            if (unclosed_reasoning_content.empty()) {
                rstrip(content);
                trim_potential_partial_word(content);
                rstrip(content);
            } else {
                rstrip(unclosed_reasoning_content);
                trim_potential_partial_word(unclosed_reasoning_content);
                rstrip(unclosed_reasoning_content);
            }
        }
        // consume unclosed_reasoning_content if allow_toolcall_in_think is set
        if (form.allow_toolcall_in_think && !unclosed_reasoning_content.empty()) {
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
                builder.add_reasoning_content(unclosed_reasoning_content);
            } else {
                if (content.empty()) {
                    content = start_think + unclosed_reasoning_content;
                } else {
                    content += "\n\n" + start_think;
                    content += unclosed_reasoning_content;
                }
            }
            unclosed_reasoning_content.clear();
        }
        // Add content
        if (!content.empty()) {
            // If there are multiple content blocks
            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
                builder.add_content("\n\n");
            }
            builder.add_content(content);
        }
        // This <tool_call> start is in thinking block and toolcall_in_think not set, skip this tool call
        if (toolcall_in_think && !form.allow_toolcall_in_think) {
            continue;
        }
        // There is no tool call and all content is parsed
        if (!tc) {
            GGML_ASSERT(builder.pos() == builder.input().size());
            GGML_ASSERT(unclosed_reasoning_content.empty());
            if (!form.allow_toolcall_in_think) GGML_ASSERT(!reasoning_unclosed);
            break;
        }
        // Rewind to the start of the matched tool call and parse it
        builder.move_to(tc->groups[0].begin);
        if (builder.try_consume_xml_tool_calls(form)) {
            auto end_of_tool = builder.pos();
            builder.consume_spaces();
            // More text follows the tool call: separate it from the content emitted so far
            if (builder.pos() != builder.input().size()) {
                builder.move_to(end_of_tool);
                if (!builder.result().content.empty()) {
                    builder.add_content("\n\n");
                }
            }
        } else {
            // Not a valid tool call: emit one character as content (dropped if it is whitespace)
            // and search again from the next one
            static const common_regex next_char_regex(".");
            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
            rstrip(c);
            builder.add_content(c);
        }
    }
 }
 /**
 * Parse content uses reasoning and XML-Style tool call
 * Member entry point: forwards this parser, `form` and both think tags unchanged to
 * parse_msg_with_xml_tool_calls().
 */
 void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
 }
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@ -1,45 +0,0 @@
 #pragma once
 #include "chat.h"
 #include <nlohmann/json.hpp>
 #include <optional>
 #include <string>
 #include <vector>
 // Sample config:
 // MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
 // GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
 // Literal markers that delimit an XML-style tool call, so that one parser/grammar builder can be
 // configured for several model dialects. In the trailing comments below, the first column is the
 // MiniMax-M2 value and the second column is the GLM 4.5 value.
 struct xml_tool_call_format {
    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
    std::string tool_start;  // <invoke name=\"        // <tool_call>
    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
    std::string key_start;   // <parameter name=\"     // <arg_key>
    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
    std::string val_end;     // </parameter>\n         // </arg_value>\n
    std::string tool_end;    // </invoke>\n            // </tool_call>\n
    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
    // Set this if there can be dynamic spaces inside key_val_sep.
    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
    std::optional<std::string> key_val_sep2 = std::nullopt;
    // Set true if argval should only be raw string. e.g. Hello "world" hi
    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
    // Defaults to std::nullopt, both will be allowed.
    std::optional<bool> raw_argval = std::nullopt;
    // NOTE(review): presumably alternative terminators used for the final value / final tool of a
    // call when they differ from val_end / tool_end - not visible here, confirm against the parser.
    std::optional<std::string> last_val_end = std::nullopt;
    std::optional<std::string> last_tool_end = std::nullopt;
    // NOTE(review): presumably trims surrounding whitespace of raw (non-JSON) argument values - confirm.
    bool trim_raw_argval = false;
    // When false, a tool call that starts inside a thinking block is skipped by the message parser.
    bool allow_toolcall_in_think = false;
 };
 // Make a GBNF grammar that accepts any string except those containing any of the forbidden strings.
 std::string make_gbnf_excluding(std::vector<std::string> forbids);
 /**
 * Build grammar for xml-style tool call
 * form.scope_start and form.scope_end can be empty.
 * Requires data.format for model-specific hacks.
 */
 void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@ -1,133 +0,0 @@
 #pragma once
 #include "chat.h"
 #include "chat-parser-xml-toolcall.h"
 #include "json-partial.h"
 #include "regex-partial.h"
 #include <nlohmann/json_fwd.hpp>
 #include <optional>
 #include <string>
 #include <vector>
 // Dedicated exception type (a std::runtime_error) so callers can catch it separately from other
 // runtime errors. NOTE(review): presumably raised when the message being parsed is incomplete - confirm at throw sites.
 class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    // Forwards the message unchanged to std::runtime_error.
    common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
 };
 // Cursor-based parser over a single chat message string. It keeps the input text, a current
 // position into it, and a common_chat_msg that the add_* / consume_* members fill in.
 class common_chat_msg_parser {
    std::string input_;                // text being parsed
    bool is_partial_;                  // flag supplied at construction; exposed through is_partial()
    common_chat_parser_params syntax_; // TODO: rename to params
    std::string healing_marker_;       // exposed through healing_marker(); NOTE(review): presumably used when healing truncated JSON - confirm
    size_t pos_ = 0;                   // current offset into input_
    common_chat_msg result_;           // message assembled so far
  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
    // Read-only accessors for the parser state.
    const std::string & input() const { return input_; }
    size_t pos() const { return pos_; }
    const std::string & healing_marker() const { return healing_marker_; }
    const bool & is_partial() const { return is_partial_; }
    const common_chat_msg & result() const { return result_; }
    const common_chat_parser_params & syntax() const { return syntax_; }
    // Set the cursor to an absolute offset. Offsets up to and including input().size() are accepted;
    // anything larger throws std::runtime_error.
    void move_to(size_t pos) {
        if (pos > input_.size()) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }
    // Move the cursor n characters backwards; throws std::runtime_error if that would go before offset 0.
    void move_back(size_t n) {
        if (pos_ < n) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ -= n;
    }
    // Get the substring of the input at the given range
    std::string str(const common_string_range & rng) const;
    // Appends to the result.content field
    void add_content(const std::string & content);
    // Appends to the result.reasoning_content field
    void add_reasoning_content(const std::string & reasoning_content);
    // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
    // Adds a tool call using the "name", "id" and "arguments" fields of the json object
    bool add_tool_call(const nlohmann::ordered_json & tool_call);
    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);
    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
    void finish();
    bool consume_spaces();
    void consume_literal(const std::string & literal);
    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
    std::string consume_rest();
    // Result of a regex/literal search: the text preceding the match plus the matched group ranges.
    struct find_regex_result {
        std::string prelude;
        std::vector<common_string_range> groups;
    };
    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
    bool try_consume_literal(const std::string & literal);
    std::optional<find_regex_result> try_find_literal(const std::string & literal);
    find_regex_result consume_regex(const common_regex & regex);
    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
    std::optional<common_json> try_consume_json();
    common_json consume_json();
    // JSON value returned by the *_with_dumped_args functions, plus whether it was truncated.
    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;
    };
    /*
        Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
        By default, object keys can't be truncated, nor can string values (their corresponding key is removed),
        e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
        But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
        - with `content_paths={{"foo"}}` -> `{"foo": "b` -> `{"foo": "b"}`
        - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    /**
     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
     * form.scope_start, form.tool_sep and form.scope_end can be empty.
     */
    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
    // Parse content that uses reasoning blocks and XML-style tool calls; the reasoning delimiters default to <think> / </think>.
    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
    void clear_tools();
 };
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -1,13 +1,17 @@
 #include "chat-peg-parser.h"
 #include "chat-auto-parser.h"
 #include "ggml.h"
 #include "peg-parser.h"
 #include <nlohmann/json.hpp>
-using json = nlohmann::json;
+using ordered_json = nlohmann::ordered_json;
 static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    int count = 0;
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
-        if (max != -1 && count <= max) {
+        if (max != -1 && count >= max) {
            break;
        }
        sv.remove_suffix(1);
@ -16,109 +20,820 @@ static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
    return sv;
 }
-void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+static std::string_view trim_leading_space(std::string_view sv, int max = -1) {
    // Drop whitespace from the front of sv and return the shortened view.
    // max == -1 means "no limit"; otherwise at most max characters are removed.
    int count = 0;
    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.front()))) {
        // count is the number of characters removed so far, so this enforces the cap.
        if (max != -1 && count >= max) {
            break;
        }
        sv.remove_prefix(1);
        count++;
    }
    return sv;
 }
 // Asymmetric trim: removes at most one leading whitespace character, then all trailing whitespace.
 static std::string_view trim(std::string_view sv) {
    const std::string_view without_lead = trim_leading_space(sv, 1);
    return trim_trailing_space(without_lead);
 }
 // Returns the net '{' minus '}' balance of a JSON-like string.
 // Braces that appear inside double-quoted strings are ignored, and a backslash
 // inside a string protects the character that follows it (so \" does not end the string).
 // The result is negative when there are more closing than opening braces.
 static int json_brace_depth(const std::string & s) {
    int  open_braces  = 0;
    bool inside_quote = false;
    bool skip_next    = false;
    for (size_t idx = 0; idx < s.size(); ++idx) {
        const char ch = s[idx];
        if (skip_next) {
            // Character following a backslash inside a string: never significant.
            skip_next = false;
            continue;
        }
        if (inside_quote) {
            if (ch == '\\') {
                skip_next = true;
            } else if (ch == '"') {
                inside_quote = false;
            }
            continue;
        }
        switch (ch) {
            case '"':
                inside_quote = true;
                break;
            case '{':
                ++open_braces;
                break;
            case '}':
                --open_braces;
                break;
            default:
                break;
        }
    }
    return open_braces;
 }
 // Serialises s as a JSON string and returns the escaped text between the
 // surrounding double quotes. If the dump is not wrapped in quotes it is returned unchanged.
 static std::string escape_json_string_inner(const std::string & s) {
    const std::string dumped = ordered_json(s).dump();
    const bool quoted = dumped.size() >= 2 && dumped.front() == '"' && dumped.back() == '"';
    if (!quoted) {
        return dumped;
    }
    return dumped.substr(1, dumped.size() - 2);
 }
 // Rewrites Python-style single-quoted string literals as JSON double-quoted ones.
 // Only the string delimiters change; the text in between is adjusted just enough to stay valid:
 // - {'key': 'value'} -> {"key": "value"}
 // - {'code': 'print(\'hello\')'} -> {"code": "print('hello')"}
 // - {'msg': 'He said "hi"'} -> {"msg": "He said \"hi\""}
 // Text already inside double quotes is copied through untouched.
 static std::string normalize_quotes_to_json(const std::string & input) {
    // Which kind of string literal the scanner is currently inside.
    enum class quote_state { none, single_q, double_q };

    std::string out;
    out.reserve(input.size() + 16);  // headroom for the escapes that may be inserted

    quote_state  state = quote_state::none;
    const size_t total = input.size();
    size_t       idx   = 0;
    while (idx < total) {
        const char ch = input[idx];

        // A backslash that has a following character is handled as a pair (inside strings).
        if (ch == '\\' && idx + 1 < total) {
            const char following = input[idx + 1];
            switch (state) {
                case quote_state::single_q:
                    if (following == '\'') {
                        // \' needs no escaping once the delimiter is a double quote
                        out += '\'';
                    } else if (following == '"') {
                        // \" is already valid inside a double-quoted string
                        out += "\\\"";
                    } else {
                        // any other escape (\n, \\, ...) is kept verbatim
                        out += ch;
                        out += following;
                    }
                    idx += 2;
                    break;
                case quote_state::double_q:
                    // escapes inside an existing double-quoted string are kept verbatim
                    out += ch;
                    out += following;
                    idx += 2;
                    break;
                case quote_state::none:
                    // outside any string only the backslash itself is copied;
                    // the next character is examined normally on the next iteration
                    out += ch;
                    idx += 1;
                    break;
            }
            continue;
        }

        switch (ch) {
            case '"':
                if (state == quote_state::single_q) {
                    // bare double quote inside a single-quoted string must be escaped for JSON
                    out += "\\\"";
                } else {
                    // opens or closes a double-quoted string
                    state = (state == quote_state::double_q) ? quote_state::none : quote_state::double_q;
                    out += ch;
                }
                break;
            case '\'':
                if (state == quote_state::double_q) {
                    // apostrophe inside a double-quoted string is ordinary text
                    out += ch;
                } else {
                    // opens or closes a single-quoted string: emit a double quote instead
                    state = (state == quote_state::single_q) ? quote_state::none : quote_state::single_q;
                    out += '"';
                }
                break;
            default:
                out += ch;
                break;
        }
        ++idx;
    }
    return out;
 }
 void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
    arena.visit(result, [this](const common_peg_ast_node & node) {
-        map(node);
+        if (!node.tag.empty()) {
            tags[node.tag] = std::string(node.text);
        }
    });
 }
 // Runs the parser over `input` (with this parser's flags OR-ed with extra_flags), then walks the
 // resulting AST to collect the text of every tagged node. Returns the parse result together with the tag map.
 tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const {
    common_peg_parse_context parse_ctx(input, flags | extra_flags);
    auto outcome = arena.parse(parse_ctx);

    tag_based_peg_mapper collector;
    collector.from_ast(parse_ctx.ast, outcome);

    return { std::move(outcome), std::move(collector.tags) };
 }
 // Tries the parser at successive start offsets of `input` and returns the first successful
 // attempt with its extracted tags. If no offset succeeds, the (failed) attempt made at the
 // last offset is returned instead. Empty input is delegated to parse_and_extract().
 tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const {
    if (input.empty()) {
        return parse_and_extract(input);
    }
    const size_t last_offset = input.size() - 1;
    for (size_t offset = 0; offset <= last_offset; ++offset) {
        // A fresh context per attempt so that earlier failures leave nothing behind.
        common_peg_parse_context attempt_ctx(input, flags);
        auto attempt = arena.parse(attempt_ctx, offset);
        if (!attempt.success() && offset != last_offset) {
            continue;
        }
        tag_based_peg_mapper collector;
        collector.from_ast(attempt_ctx.ast, attempt);
        return { std::move(attempt), std::move(collector.tags) };
    }
    // The loop always returns on its final iteration.
    GGML_ABORT("Should not happen");
 }
 // Creates a fresh parser builder, lets `fn` construct the root rule with it,
 // and wraps the built result in a tagged_peg_parser.
 tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
    common_peg_parser_builder peg_builder;
    peg_builder.set_root(fn(peg_builder));
    return { peg_builder.build() };
 }
 // Builds "zero or more of: p, or a chunk of content tagged as tag_name".
 // With a non-empty marker, a content chunk must not begin at the marker and extends
 // up to the next occurrence of it; with an empty marker, a chunk is any single character.
 common_peg_parser common_chat_peg_builder::tag_with_safe_content(const std::string &       tag_name,
                                                                  const std::string &       marker,
                                                                  const common_peg_parser & p) {
    const bool has_marker = !marker.empty();
    if (has_marker) {
        auto guarded_chunk = rule(tag_name, content(negate(literal(marker)) + any() + until(marker)));
        return zero_or_more(choice({ p, guarded_chunk }));
    }
    return zero_or_more(choice({ p, rule(tag_name, content(any())) }));
 }
 // Selects where argument text is written: the current tool's arguments once that
 // tool has a name, otherwise the temporary args_buffer.
 std::string & common_chat_peg_mapper::args_target() {
    if (current_tool && !current_tool->name.empty()) {
        return current_tool->arguments;
    }
    return args_buffer;
 }
 // Maps every AST node of the parse result via map(), then flushes a still-pending tool call.
 void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
                                       const common_peg_parse_result & parse_result_arg) {
    arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });

    // Only a pending tool call that carries a name is flushed; anything else is left as is.
    // (Can occur during partial parsing when the tool call is incomplete.)
    if (!pending_tool_call.has_value() || pending_tool_call->name.empty()) {
        return;
    }

    auto & pending = pending_tool_call.value();
    if (!args_buffer.empty()) {
        // Buffered argument text takes precedence over whatever was stored on the call.
        pending.arguments = args_buffer;
    }
    if (closing_quote_pending && !pending.arguments.empty()) {
        // Terminate a string value whose closing quote had been deferred.
        pending.arguments += "\"";
    }
    result.tool_calls.push_back(pending);
    pending_tool_call.reset();
 }
 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
    // Handle reasoning/content tags
    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
    bool is_content   = node.tag == common_chat_peg_builder::CONTENT;
-    if (is_reasoning) {
+    if (is_reasoning) { // GPT OSS can have more than 1 reasoning block, so concatenate here
-        result.reasoning_content = std::string(trim_trailing_space(node.text));
+        result.reasoning_content += std::string(node.text);
    }
    if (is_content) {
-        result.content = std::string(trim_trailing_space(node.text));
+        // Concatenate content from multiple content nodes (e.g., when reasoning markers
-    }
+        // are preserved before content markers in reasoning_format=NONE mode)
        result.content += std::string(node.text);
    }
-void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
+    // Handle tool-related tags (supporting both JSON and tagged formats)
-    common_chat_peg_mapper::map(node);
+    bool is_tool_open  = node.tag == common_chat_peg_builder::TOOL_OPEN;
-
+    bool is_tool_close = node.tag == common_chat_peg_builder::TOOL_CLOSE;
-    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+    bool is_tool_name  = node.tag == common_chat_peg_builder::TOOL_NAME;
-    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+    bool is_tool_id    = node.tag == common_chat_peg_builder::TOOL_ID;
-    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+    bool is_tool_args  = node.tag == common_chat_peg_builder::TOOL_ARGS;
-    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+    bool is_arg_open   = node.tag == common_chat_peg_builder::TOOL_ARG_OPEN;
    bool is_arg_close  = node.tag == common_chat_peg_builder::TOOL_ARG_CLOSE;
    bool is_arg_name         = node.tag == common_chat_peg_builder::TOOL_ARG_NAME;
    bool is_arg_value        = node.tag == common_chat_peg_builder::TOOL_ARG_VALUE;
    bool is_arg_string_value = node.tag == common_chat_peg_builder::TOOL_ARG_STRING_VALUE;
    if (is_tool_open) {
-        result.tool_calls.emplace_back();
+        pending_tool_call     = common_chat_tool_call();
-        current_tool = &result.tool_calls.back();
+        current_tool          = &pending_tool_call.value();
        arg_count             = 0;
        args_buffer.clear();
        closing_quote_pending = false;
    }
    if (is_tool_id && current_tool) {
-        current_tool->id = std::string(trim_trailing_space(node.text));
+        auto text = trim_trailing_space(node.text);
        if (text.size() >= 2 && text.front() == '"' && text.back() == '"') {
            text = text.substr(1, text.size() - 2);
        }
        current_tool->id = std::string(text);
    }
    if (is_tool_name && current_tool) {
        current_tool->name = std::string(trim_trailing_space(node.text));
        // Now that we have the name, populate the arguments from the buffer
        if (!args_buffer.empty()) {
            current_tool->arguments = args_buffer;
            args_buffer.clear();
        } else if (current_tool->arguments.empty()) {
            current_tool->arguments = "{";
        }
        // Add the tool call to results so streaming can see it
        if (pending_tool_call.has_value()) {
            result.tool_calls.push_back(pending_tool_call.value());
            pending_tool_call.reset();
            current_tool = &result.tool_calls.back();
        }
    }
    if (is_tool_args && current_tool) {
-        current_tool->arguments = std::string(trim_trailing_space(node.text));
+        // For JSON format: arguments come as a complete JSON object
        // For tagged format: built up from individual arg_name/arg_value nodes
        auto text = trim_trailing_space(node.text);
        if (!text.empty() && text.front() == '{') {
            args_target() = std::string(text);
        }
    }
 void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
    common_chat_peg_mapper::map(node);
    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
    if (is_tool_open) {
        result.tool_calls.emplace_back();
        current_tool = &result.tool_calls.back();
        arg_count = 0;
    }
    if (is_tool_name) {
        current_tool->name = std::string(node.text);
        current_tool->arguments = "{";
    }
    if (is_arg_open) {
-        needs_closing_quote = false;
+        closing_quote_pending = false;
    }
    if (is_arg_name && current_tool) {
        std::string arg_entry;
        if (arg_count > 0) {
-            current_tool->arguments += ",";
+            arg_entry = ",";
        }
-        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+        arg_entry += ordered_json(trim(node.text)).dump() + ":";
        ++arg_count;
        auto & target = args_target();
        if (target.empty()) {
            target = "{";
        }
        target += arg_entry;
    }
-    if (is_arg_string && current_tool) {
+    if ((is_arg_value || is_arg_string_value) && current_tool) {
-        // Serialize to JSON, but exclude the end quote
+        std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1));
-        std::string dumped = json(trim_trailing_space(node.text)).dump();
+
-        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+        std::string value_to_add;
-        needs_closing_quote = true;
+        if (value_content.empty() && is_arg_string_value) {
            // Empty string value - arg_close will add the closing quote
            value_to_add          = "\"";
            closing_quote_pending = true;
        } else if (!value_content.empty() && is_arg_string_value) {
            // Schema declares this as string type - always treat as literal string value
            if (!closing_quote_pending) {
                value_to_add          = "\"";
                closing_quote_pending = true;
            }
            value_to_add += escape_json_string_inner(value_content);
        } else if (!value_content.empty()) {
            // For potential containers, normalize Python-style single quotes to JSON double quotes
            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
            if (is_potential_container) {
                value_content = normalize_quotes_to_json(value_content);
            }
            // Try to parse as JSON value (number, bool, null, object, array)
            try {
                ordered_json parsed = ordered_json::parse(value_content);
                if (parsed.is_string()) {
                    // Don't add closing quote yet (added by arg_close) for monotonic streaming
                    std::string escaped = parsed.dump();
                    if (!escaped.empty() && escaped.back() == '"') {
                        escaped.pop_back();
                    }
                    value_to_add          = escaped;
                    closing_quote_pending = true;
                } else {
                    // Non-string values: use raw content to preserve whitespace for monotonicity
                    value_to_add = value_content;
                }
            } catch (...) {
                if (node.is_partial && is_potential_container) {
                    // Partial container: pass through the already-normalized content
                    value_to_add = value_content;
                } else {
                    // Not valid JSON - treat as string value
                    if (!closing_quote_pending) {
                        value_to_add          = "\"";
                        closing_quote_pending = true;
                    }
                    value_to_add += escape_json_string_inner(value_content);
                }
            }
        }
        args_target() += value_to_add;
    }
    if (is_arg_close && current_tool) {
-        if (needs_closing_quote) {
+        if (closing_quote_pending) {
-            current_tool->arguments += "\"";
+            args_target() += "\"";
-            needs_closing_quote = false;
+            closing_quote_pending = false;
        }
    }
    if (is_arg_json && current_tool) {
        current_tool->arguments += std::string(trim_trailing_space(node.text));
    }
    if (is_tool_close && current_tool) {
-        if (needs_closing_quote) {
+        // Flush buffer to arguments if tool name was never seen
-            current_tool->arguments += "\"";
+        if (current_tool->name.empty() && !args_buffer.empty()) {
-            needs_closing_quote = false;
+            current_tool->arguments = args_buffer;
            args_buffer.clear();
        }
        // Close any pending string quote
        if (closing_quote_pending) {
            current_tool->arguments += "\"";
            closing_quote_pending = false;
        }
        // Close any unclosed braces (accounts for nested objects)
        for (int d = json_brace_depth(current_tool->arguments); d > 0; d--) {
            current_tool->arguments += "}";
        }
        // Add tool call to results if named; otherwise discard
        if (pending_tool_call.has_value()) {
            if (!current_tool->name.empty()) {
                result.tool_calls.push_back(pending_tool_call.value());
            }
            pending_tool_call.reset();
        }
    }
 }
 // Builds a parser for tag-delimited ("constructed") tool calls of the general shape
 //   <section_start> <func_opener>NAME<func_name_suffix> <param_key_prefix>KEY<param_key_suffix>VALUE<param_closer> ... <func_closer> <section_end>
 //
 // markers             - optional overrides for the delimiter strings; missing keys fall back to the defaults below
 // tools               - array of tool definitions; entries without a "function" member are skipped
 // parallel_tool_calls - when true the section may contain one or more tool calls, otherwise exactly one
 // force_tool_calls    - when true the section is mandatory, otherwise it is wrapped in optional()
 //
 // Returns eps() when `tools` is not an array or is empty.
 common_peg_parser common_chat_peg_builder::standard_constructed_tools(
    const std::map<std::string, std::string> & markers,
    const ordered_json &                       tools,
    bool                                       parallel_tool_calls,
    bool                                       force_tool_calls) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
    // Extract markers with defaults
    auto get_marker = [&markers](const std::string & key, const std::string & default_val = "") -> std::string {
        auto it = markers.find(key);
        return it != markers.end() ? it->second : default_val;
    };
    std::string section_start    = get_marker("tool_call_start_marker", "<tool_call>");
    std::string section_end      = get_marker("tool_call_end_marker", "</tool_call>");
    std::string func_opener      = get_marker("function_opener", "<function=");
    std::string func_name_suffix = get_marker("function_name_suffix", ">");
    std::string func_closer      = get_marker("function_closer", "</function>");
    std::string param_key_prefix = get_marker("parameter_key_prefix", "<param=");
    std::string param_key_suffix = get_marker("parameter_key_suffix", ">");
    std::string param_closer     = get_marker("parameter_closer", "</param>");
    // Build tool choices for tagged format
    auto tool_choices = choice();
    for (const auto & tool_def : tools) {
        if (!tool_def.contains("function")) {
            continue;
        }
        const auto &   function = tool_def.at("function");
        std::string    name     = function.at("name");
        // A tool without "parameters" is treated as having an empty parameter object.
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
        // Build argument parsers
        auto args = eps();
        if (params.contains("properties") && !params["properties"].empty()) {
            auto arg_choice = choice();
            for (const auto & el : params["properties"].items()) {
                const std::string & prop_name = el.key();
                // The parameter name is accepted bare, double-quoted or single-quoted.
                auto arg_name_parser =
                    choice({ literal(prop_name), literal("\"" + prop_name + "\""), literal("'" + prop_name + "'") });
                // The value is everything up to the parameter closer.
                auto arg_rule = tool_arg(tool_arg_open(literal(param_key_prefix)) + tool_arg_name(arg_name_parser) +
                                         literal(param_key_suffix) + tool_arg_value(until(param_closer)) +
                                         tool_arg_close(literal(param_closer)));
                arg_choice |= arg_rule;
            }
            // Any number of the declared parameters, each optionally followed by whitespace.
            args = zero_or_more(arg_choice + space());
        }
        // Build function parser: <function=name>args</function>
        auto tool_parser = tool(tool_open(literal(func_opener) + tool_name(literal(name)) + literal(func_name_suffix)) +
                                space() + tool_args(args) + space() + tool_close(literal(func_closer)));
        tool_choices |= rule("tool-" + name, tool_parser);
    }
    // Build the section with markers
    auto section =
        parallel_tool_calls ?
            trigger_rule("tool-call", literal(section_start) + space() + one_or_more(tool_choices + space()) +
                                          literal(section_end)) :
            trigger_rule("tool-call", literal(section_start) + space() + tool_choices + space() + literal(section_end));
    return force_tool_calls ? section : optional(section);
 }
 // Python-style tool calls: name(arg1="value1", arg2=123)
 // Used only by LFM2 for now, so we don't merge it into autoparser
 //
 // The calls are wrapped in square brackets: "[" call "]", or with parallel_tool_calls
 // "[" call ("," call)* "]". Entries of `tools` without a "function" member are skipped.
 // Returns eps() when `tools` is not an array or is empty.
 common_peg_parser common_chat_peg_builder::python_style_tool_calls(
    const ordered_json & tools,
    bool                 parallel_tool_calls) {
    if (!tools.is_array() || tools.empty()) {
        return eps();
    }
    auto tool_choices = choice();
    for (const auto & tool_def : tools) {
        if (!tool_def.contains("function")) {
            continue;
        }
        const auto &   function = tool_def.at("function");
        std::string    name     = function.at("name");
        // A tool without "parameters" is treated as having an empty parameter object.
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
        auto args = eps();
        if (params.contains("properties") && !params["properties"].empty()) {
            auto arg_choice = choice();
            for (const auto & el : params["properties"].items()) {
                const std::string & prop_name = el.key();
                const auto & prop_def = el.value();
                // Only properties whose schema "type" is exactly "string" use the quoted-string parser.
                bool is_string_type = (prop_def.contains("type") && prop_def["type"] == "string");
                auto arg_name_parser = literal(prop_name);
                common_peg_parser arg_value_parser = eps();
                // String literal in either double or single quotes; only the inner text is tagged as the value.
                auto string_value_parser = choice({
                    literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""),
                    literal("'") + tool_arg_string_value(string_content('\'')) + literal("'")
                });
                if (is_string_type) {
                    arg_value_parser = string_value_parser;
                } else {
                    arg_value_parser = tool_arg_value(python_value());
                }
                // Full argument: name="value" or name=value
                // The open/close tags wrap eps() because this syntax has no explicit argument delimiters.
                auto arg_rule = tool_arg(
                    tool_arg_open(eps()) +
                    tool_arg_name(arg_name_parser) +
                    literal("=") +
                    arg_value_parser +
                    tool_arg_close(eps())
                );
                arg_choice |= arg_rule;
            }
            // One argument followed by any number of ", argument" repetitions.
            args = arg_choice + zero_or_more("," + space() + arg_choice);
        }
        // Call syntax: name( args )
        auto tool_parser = tool(tool_open(tool_name(literal(name)) + literal("(")) +
            space() + tool_args(args) + space() + tool_close(literal(")"))
        );
        tool_choices |= rule("tool-" + name, tool_parser);
    }
    if (parallel_tool_calls) {
        return "[" + space() + tool_choices + zero_or_more("," + space() + tool_choices) + space() + "]";
    }
    return "[" + space() + tool_choices + space() + "]";
 }
 // Helper: splits a dot-notation key at its FIRST '.' into {prefix, field}.
 // A key without a dot is a top-level field and yields an empty prefix.
 static std::pair<std::string, std::string> parse_key_spec(const std::string & key) {
    const std::string::size_type split_at = key.find('.');
    const bool nested = split_at != std::string::npos;

    std::string prefix = nested ? key.substr(0, split_at) : std::string();
    std::string field  = nested ? key.substr(split_at + 1) : key;
    return { std::move(prefix), std::move(field) };
 }
 // Mode 1: function_is_key — parse {"function_name": {...}}
 //
 // For each entry of `tools` that has a "function" member, builds one alternative of the form
 //     { "<tool name>": <value> }
 // where <value> is either the schema-constrained arguments object itself (when `args_key` is
 // empty and no ID keys are configured) or a wrapping object that holds the optional ID fields
 // followed by the arguments (keyed by `effective_args_key` unless `args_key` is empty).
 //
 // tools              - array of tool definitions; entries without "function" are skipped
 // args_key           - caller-supplied arguments key; empty means the arguments are not keyed
 // effective_args_key - key literal used for the arguments when `args_key` is not empty
 // call_id_key        - key of an optional string call ID (empty = no such field)
 // gen_call_id_key    - key of an optional string-or-number call ID (empty = no such field)
 //
 // Returns a choice over one "tool-<name>" rule per tool.
 common_peg_parser common_chat_peg_builder::build_json_tools_function_is_key(
    const ordered_json & tools,
    const std::string &  args_key,
    const std::string &  effective_args_key,
    const std::string &  call_id_key,
    const std::string &  gen_call_id_key) {
    // Literal matching a double-quoted JSON key
    const auto quoted_key = [this](const std::string & key) { return literal("\"" + key + "\""); };

    const bool args_are_bare = args_key.empty();

    auto alternatives = choice();
    for (const auto & entry : tools) {
        if (!entry.contains("function")) {
            continue;
        }
        const auto & fn     = entry.at("function");
        std::string  name   = fn.at("name");
        ordered_json params = fn.contains("parameters") ? fn.at("parameters") : ordered_json::object();

        // Members of the value object, in emission order: [id] [generated id] arguments
        std::vector<common_peg_parser> members;
        if (!call_id_key.empty()) {
            auto string_id = atomic(
                quoted_key(call_id_key) + space() + literal(":") + space() +
                literal("\"") + tool_id(string_content('"')) + literal("\"")
            );
            // The whole ID field is optional, and so is the comma that follows it
            members.push_back(optional(string_id + space() + optional(literal(",") + space())));
        }
        if (!gen_call_id_key.empty()) {
            auto generated_id = atomic(
                quoted_key(gen_call_id_key) + space() + literal(":") + space() +
                choice({
                    literal("\"") + tool_id(string_content('"')) + literal("\""),
                    tool_id(json_number())
                })
            );
            members.push_back(optional(generated_id + space() + optional(literal(",") + space())));
        }

        // Arguments — keyed by effective_args_key, or bare when no args_key was given
        common_peg_parser args_member = eps();
        if (args_are_bare) {
            args_member = tool_args(schema(json(), "tool-" + name + "-schema", params));
        } else {
            args_member = quoted_key(effective_args_key) + space() + literal(":") + space() +
                          tool_args(schema(json(), "tool-" + name + "-schema", params));
        }
        members.push_back(args_member);

        // Value following the function-name key
        common_peg_parser value = eps();
        const bool unwrapped = args_are_bare && members.size() == 1;
        if (unwrapped) {
            // Only the bare arguments: they are the value, no wrapping braces
            value = members.front();
        } else {
            value = literal("{") + space();
            const size_t last = members.size() - 1;
            for (size_t idx = 0; idx <= last; ++idx) {
                value = value + members[idx];
                if (idx != last) {
                    value = value + space();
                }
            }
            value = value + space() + literal("}");
        }

        auto call = tool(
            tool_open(literal("{")) + space() +
            literal("\"") + tool_name(literal(name)) + literal("\"") +
            space() + literal(":") + space() +
            value +
            space() + tool_close(literal("}"))
        );
        alternatives |= rule("tool-" + name, call);
    }
    return alternatives;
 }
 // Mode 2: Nested keys (dot notation like "function.name")
 //
 // For each entry of `tools` that has a "function" member, builds one alternative of the form
 //     { [id ,] [generated id ,] "<prefix>": { "<name field>": "<tool name>", "<args field>": <args> } }
 // The prefix is taken from the name key when that key is dotted, otherwise from the args key.
 // A key that carries no prefix of its own is used unchanged inside the nested object.
 // ID fields are only emitted when their key has no dot; each is optional together with its
 // trailing comma.
 //
 // Returns a choice over one "tool-<name>" rule per tool.
 common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
    const ordered_json & tools,
    const std::string &  effective_name_key,
    const std::string &  effective_args_key,
    const std::string &  call_id_key,
    const std::string &  gen_call_id_key) {
    const auto name_spec = parse_key_spec(effective_name_key);
    const auto args_spec = parse_key_spec(effective_args_key);

    const bool name_is_dotted = !name_spec.first.empty();
    const bool args_is_dotted = !args_spec.first.empty();

    const std::string wrapper_key    = name_is_dotted ? name_spec.first  : args_spec.first;
    const std::string inner_name_key = name_is_dotted ? name_spec.second : effective_name_key;
    const std::string inner_args_key = args_is_dotted ? args_spec.second : effective_args_key;

    // ID fields are placed at the outer level only when their key is not dotted
    const bool outer_call_id     = !call_id_key.empty()     && parse_key_spec(call_id_key).first.empty();
    const bool outer_gen_call_id = !gen_call_id_key.empty() && parse_key_spec(gen_call_id_key).first.empty();

    auto alternatives = choice();
    for (const auto & entry : tools) {
        if (!entry.contains("function")) {
            continue;
        }
        const auto & fn     = entry.at("function");
        std::string  name   = fn.at("name");
        ordered_json params = fn.contains("parameters") ? fn.at("parameters") : ordered_json::object();

        auto name_member = literal("\"" + inner_name_key + "\"") + space() + literal(":") + space() +
                           literal("\"") + tool_name(literal(name)) + literal("\"");
        auto args_member = literal("\"" + inner_args_key + "\"") + space() + literal(":") + space() +
                           tool_args(schema(json(), "tool-" + name + "-schema", params));
        auto inner_object = literal("{") + space() +
                            name_member + space() + literal(",") + space() +
                            args_member +
                            space() + literal("}");

        // Format: { id?, "function": {...} }
        auto body = tool_open(literal("{")) + space();
        if (outer_call_id) {
            auto string_id = atomic(
                literal("\"" + call_id_key + "\"") + space() + literal(":") + space() +
                literal("\"") + tool_id(string_content('"')) + literal("\"")
            );
            body = body + optional(string_id + space() + literal(",") + space());
        }
        if (outer_gen_call_id) {
            auto generated_id = atomic(
                literal("\"" + gen_call_id_key + "\"") + space() + literal(":") + space() +
                choice({
                    literal("\"") + tool_id(string_content('"')) + literal("\""),
                    tool_id(json_number())
                })
            );
            body = body + optional(generated_id + space() + literal(",") + space());
        }

        auto wrapper_member = literal("\"" + wrapper_key + "\"") + space() + literal(":") + space() + inner_object;
        body = body + wrapper_member + space() + tool_close(literal("}"));
        alternatives |= rule("tool-" + name, tool(body));
    }
    return alternatives;
 }
 // Mode 3: Flat keys with optional ID fields and parameter ordering
 //
 // For each entry of `tools` that has a "function" member, builds one alternative matching a
 // flat JSON object:
 //     { "<name key>": "<tool name>", "<args key>": <args> [, "<id key>": <id>] [, "<gen id key>": <id>] }
 // The name and arguments fields are required. An ID field exists only when its key is
 // non-empty, is optional, and accepts either a JSON string or a JSON number.
 //
 // Field order follows `parameters_order`; keys not listed there come after all listed keys
 // and keep their default relative order (name, arguments, call id, generated call id).
 //
 // tools              - array of tool definitions; entries without "function" are skipped
 // effective_name_key - JSON key holding the tool name
 // effective_args_key - JSON key holding the arguments
 // call_id_key        - JSON key of the optional call ID (empty = no such field)
 // gen_call_id_key    - JSON key of the optional generated call ID (empty = no such field)
 // parameters_order   - desired order of the JSON keys
 //
 // Returns a choice over one "tool-<name>" rule per tool.
 common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
    const ordered_json &             tools,
    const std::string &              effective_name_key,
    const std::string &              effective_args_key,
    const std::string &              call_id_key,
    const std::string &              gen_call_id_key,
    const std::vector<std::string> & parameters_order) {
    // One member of the flat object
    struct json_field {
        common_peg_parser parser;
        std::string       key;
        bool              required;
    };

    // Rank of a key in parameters_order; unlisted keys all rank parameters_order.size()
    const auto rank_of = [&parameters_order](const std::string & key) -> size_t {
        auto pos = std::find(parameters_order.begin(), parameters_order.end(), key);
        return static_cast<size_t>(std::distance(parameters_order.begin(), pos));
    };

    // Key literal followed by a value that is either a quoted string or a JSON number
    const auto id_field = [this](const std::string & key) {
        return atomic(
            literal("\"" + key + "\"") + space() + literal(":") + space() +
            choice({
                literal("\"") + tool_id(string_content('"')) + literal("\""),
                tool_id(json_number())
            })
        );
    };

    auto tool_choices    = choice();
    auto name_key_parser = literal("\"" + effective_name_key + "\"");
    auto args_key_parser = literal("\"" + effective_args_key + "\"");
    for (const auto & tool_def : tools) {
        if (!tool_def.contains("function")) {
            continue;
        }
        const auto &   function = tool_def.at("function");
        std::string    name     = function.at("name");
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();
        auto tool_name_ = name_key_parser + space() + literal(":") + space() +
                         literal("\"") + tool_name(literal(name)) + literal("\"");
        auto tool_args_ = args_key_parser + space() + literal(":") + space() +
                         tool_args(schema(json(), "tool-" + name + "-schema", params));

        // Collect all fields in default order, then order them by parameters_order
        std::vector<json_field> fields;
        fields.push_back({ tool_name_, effective_name_key, true });
        fields.push_back({ tool_args_, effective_args_key, true });
        if (!call_id_key.empty()) {
            fields.push_back({ id_field(call_id_key), call_id_key, false });
        }
        if (!gen_call_id_key.empty()) {
            fields.push_back({ id_field(gen_call_id_key), gen_call_id_key, false });
        }
        // stable_sort: fields with equal rank (e.g. all keys unlisted) must keep the default
        // order above; std::sort gives no such guarantee.
        std::stable_sort(fields.begin(), fields.end(),
            [&rank_of](const json_field & a, const json_field & b) {
                return rank_of(a.key) < rank_of(b.key);
            });

        // The name and arguments fields are always present, so this loop terminates
        size_t first_required = 0;
        while (!fields[first_required].required) {
            first_required++;
        }

        // Each optional field carries its own comma, so omitting the field never leaves a
        // dangling separator behind:
        //   - optional fields before the first required field own the comma that follows them
        //   - every field after the first required field owns the comma that precedes it
        auto ordered_body = tool_open(literal("{")) + space();
        for (size_t i = 0; i < fields.size(); i++) {
            const json_field & field = fields[i];
            if (i < first_required) {
                ordered_body = ordered_body + optional(field.parser + space() + literal(",") + space());
            } else if (i == first_required) {
                ordered_body = ordered_body + field.parser;
            } else if (field.required) {
                ordered_body = ordered_body + space() + literal(",") + space() + field.parser;
            } else {
                ordered_body = ordered_body + optional(space() + literal(",") + space() + field.parser);
            }
        }
        ordered_body = ordered_body + space() + tool_close(literal("}"));
        tool_choices |= rule("tool-" + name, tool(ordered_body));
    }
    return tool_choices;
 }
 // Builds the parser for a JSON tool-call section delimited by `section_start` / `section_end`.
 //
 // The per-call layout is chosen as follows:
 //   - function_is_key             -> {"<tool name>": {...}}
 //   - name or args key has a '.'  -> nested object (e.g. "function.name")
 //   - otherwise                   -> flat object, ordered by `parameters_order`
 // Empty `name_key` / `args_key` fall back to "name" / "arguments".
 // With `parallel_tool_calls`, further comma-separated calls may follow the first one.
 // With `array_wrapped`, the calls are enclosed in [ ... ].
 //
 // Returns eps() when `tools` is not a non-empty array; otherwise the "tool-call" trigger rule,
 // wrapped in optional() unless `force_tool_calls` is set.
 common_peg_parser common_chat_peg_builder::standard_json_tools(
    const std::string &              section_start,
    const std::string &              section_end,
    const ordered_json &             tools,
    bool                             parallel_tool_calls,
    bool                             force_tool_calls,
    const std::string &              name_key,
    const std::string &              args_key,
    bool                             array_wrapped,
    bool                             function_is_key,
    const std::string &              call_id_key,
    const std::string &              gen_call_id_key,
    const std::vector<std::string> & parameters_order) {
    const bool have_tools = tools.is_array() && !tools.empty();
    if (!have_tools) {
        return eps();
    }

    const std::string effective_name_key = name_key.empty() ? "name" : name_key;
    const std::string effective_args_key = args_key.empty() ? "arguments" : args_key;

    // Pick the builder that matches the JSON layout mode
    common_peg_parser tool_choices = eps();
    if (function_is_key) {
        tool_choices = build_json_tools_function_is_key(tools, args_key, effective_args_key, call_id_key, gen_call_id_key);
    } else {
        const bool uses_dotted_keys = !parse_key_spec(effective_name_key).first.empty() ||
                                      !parse_key_spec(effective_args_key).first.empty();
        if (uses_dotted_keys) {
            tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key);
        } else {
            tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order);
        }
    }

    // One call, optionally followed by more comma-separated calls
    auto tool_calls = tool_choices;
    if (parallel_tool_calls) {
        tool_calls = tool_calls + zero_or_more(space() + literal(",") + space() + tool_choices);
    }
    if (array_wrapped) {
        tool_calls = literal("[") + space() + tool_calls + space() + literal("]");
    }

    // Surround the calls with the section markers
    auto section =
        trigger_rule("tool-call", literal(section_start) + space() + tool_calls + space() + literal(section_end));
    if (force_tool_calls) {
        return section;
    }
    return optional(section);
 }
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@ -3,22 +3,9 @@
 #include "chat.h"
 #include "peg-parser.h"
-class common_chat_peg_builder : public common_peg_parser_builder {
+#include <map>
-  public:
+#include <optional>
-    static constexpr const char * REASONING_BLOCK = "reasoning-block";
+#include <vector>
    static constexpr const char * REASONING = "reasoning";
    static constexpr const char * CONTENT = "content";
    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
 };
 inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
    common_chat_peg_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
 }
 class common_chat_peg_mapper {
  public:
@ -26,80 +13,169 @@ class common_chat_peg_mapper {
    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
    virtual ~common_chat_peg_mapper() = default;
    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
    virtual void map(const common_peg_ast_node & node);
    private:
      // Tool call handling state
      std::optional<common_chat_tool_call> pending_tool_call;  // Tool call waiting for name
      common_chat_tool_call *              current_tool          = nullptr;
      int                                  arg_count             = 0;
      bool                                 closing_quote_pending = false;
      std::string                          args_buffer;  // Buffer to delay arguments until tool name is known
      // Returns a reference to the active argument destination string.
      // Before tool_name is known, writes go to args_buffer; after, to current_tool->arguments.
      std::string & args_target();
 };
-class common_chat_peg_native_builder : public common_chat_peg_builder {
+struct content_structure;
 struct tool_call_structure;
 class common_chat_peg_builder : public common_peg_parser_builder {
  public:
    // Tag constants (from former common_chat_peg_base_builder)
    static constexpr const char * REASONING_BLOCK = "reasoning-block";
    static constexpr const char * REASONING       = "reasoning";
    static constexpr const char * CONTENT         = "content";
    // Tag constants
    static constexpr const char * TOOL           = "tool";
    static constexpr const char * TOOL_OPEN      = "tool-open";
    static constexpr const char * TOOL_CLOSE     = "tool-close";
    static constexpr const char * TOOL_ID        = "tool-id";
    static constexpr const char * TOOL_NAME      = "tool-name";
    static constexpr const char * TOOL_ARGS      = "tool-args";
    static constexpr const char * TOOL_ARG       = "tool-arg";
    static constexpr const char * TOOL_ARG_OPEN  = "tool-arg-open";
    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
    static constexpr const char * TOOL_ARG_NAME         = "tool-arg-name";
    static constexpr const char * TOOL_ARG_VALUE        = "tool-arg-value";
    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";  // For schema-declared string types
    // Low-level tag methods (from former common_chat_peg_base_builder)
    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
    common_peg_parser tag_with_safe_content(const std::string &       tag_name,
                        const std::string &       marker,
                        const common_peg_parser & p);
    // Low-level tag methods
    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
 };
 class common_chat_peg_native_mapper : public common_chat_peg_mapper {
    common_chat_tool_call * current_tool;
  public:
    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
    void map(const common_peg_ast_node & node) override;
 };
 inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
    common_chat_peg_native_builder builder;
    builder.set_root(fn(builder));
    return builder.build();
 }
 class common_chat_peg_constructed_builder : public common_chat_peg_builder {
  public:
    static constexpr const char * TOOL = "tool";
    static constexpr const char * TOOL_OPEN = "tool-open";
    static constexpr const char * TOOL_CLOSE = "tool-close";
    static constexpr const char * TOOL_NAME = "tool-name";
    static constexpr const char * TOOL_ARG = "tool-arg";
    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
    common_peg_parser tool_arg_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
    // Use for schema-declared string types - won't be treated as potential JSON container
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
    // name_key/args_key: JSON key names for function name and arguments
    //   Empty or "name"/"arguments" will accept both common variations
    //   Supports dot notation for nested objects (e.g., "function.name")
    // array_wrapped: if true, tool calls are wrapped in JSON array [...]
    // function_is_key: if true, function name is the JSON key (e.g., {"func_name": {...}})
    // call_id_key: JSON key for string call ID (e.g., "id")
    // gen_call_id_key: JSON key for generated integer call ID (e.g., "tool_call_id")
    // parameters_order: order in which JSON fields should be parsed
    common_peg_parser standard_json_tools(const std::string &              section_start,
                                          const std::string &              section_end,
                                          const nlohmann::ordered_json &   tools,
                                          bool                             parallel_tool_calls,
                                          bool                             force_tool_calls,
                                          const std::string &              name_key = "",
                                          const std::string &              args_key = "",
                                          bool                             array_wrapped = false,
                                          bool                             function_is_key = false,
                                          const std::string &              call_id_key = "",
                                          const std::string &              gen_call_id_key = "",
                                          const std::vector<std::string> & parameters_order = {});
    // Legacy-compatible helper for building XML/tagged style tool calls
    // Used by tests and manual parsers
    common_peg_parser standard_constructed_tools(const std::map<std::string, std::string> & markers,
                                                 const nlohmann::ordered_json &             tools,
                                                 bool                                       parallel_tool_calls,
                                                 bool                                       force_tool_calls);
    // Helper for Python-style function call format: name(arg1="value1", arg2=123)
    // Used by LFM2 and similar templates
    common_peg_parser python_style_tool_calls(const nlohmann::ordered_json & tools,
                                              bool                           parallel_tool_calls);
  private:
    // Implementation helpers for standard_json_tools — one per JSON tool call layout mode
    common_peg_parser build_json_tools_function_is_key(const nlohmann::ordered_json & tools,
                                                       const std::string &            args_key,
                                                       const std::string &            effective_args_key,
                                                       const std::string &            call_id_key,
                                                       const std::string &            gen_call_id_key);
    common_peg_parser build_json_tools_nested_keys(const nlohmann::ordered_json & tools,
                                                   const std::string &            effective_name_key,
                                                   const std::string &            effective_args_key,
                                                   const std::string &            call_id_key,
                                                   const std::string &            gen_call_id_key);
    common_peg_parser build_json_tools_flat_keys(const nlohmann::ordered_json &   tools,
                                                 const std::string &              effective_name_key,
                                                 const std::string &              effective_args_key,
                                                 const std::string &              call_id_key,
                                                 const std::string &              gen_call_id_key,
                                                 const std::vector<std::string> & parameters_order);
 };
-class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+inline common_peg_arena build_chat_peg_parser(
-    common_chat_tool_call * current_tool;
+  const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
-    int arg_count = 0;
+  common_chat_peg_builder builder;
    bool needs_closing_quote = false;
  public:
    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
    void map(const common_peg_ast_node & node) override;
 };
 inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
    common_chat_peg_constructed_builder builder;
  builder.set_root(fn(builder));
  return builder.build();
 }
 // Collects string values keyed by tag name from a PEG parse.
 // NOTE(review): from_ast() is only declared here; presumably it fills `tags` with the text of
 // each tagged AST node — confirm against its definition.
 class tag_based_peg_mapper {
  public:
    // Tag name -> string value
    std::map<std::string, std::string> tags;
    // Processes the AST in `arena` for the given parse `result` (defined elsewhere)
    void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
 };
 // A parse result bundled with a map of tag name -> string value.
 struct tagged_parse_result {
    // Result object of the parse
    common_peg_parse_result              result;
    // Tag name -> string value
    std::map<std::string, std::string> tags;
 };
 // A parser arena bundled with the parse flags to use, plus fluent setters for the debug flag.
 struct tagged_peg_parser {
    // Arena holding the parser
    common_peg_arena arena;
    // Parse flags; no flags are set by default
    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
    // Sets COMMON_PEG_PARSE_FLAG_DEBUG and returns *this so calls can be chained
    tagged_peg_parser & withDebug() {
      flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
      return *this;
    }
    // Clears COMMON_PEG_PARSE_FLAG_DEBUG (other flags are kept) and returns *this
    tagged_peg_parser & withoutDebug() {
      flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
      return *this;
    }
    // Declared here, defined elsewhere.
    // NOTE(review): presumably `extra_flags` is combined with `flags` for this one call — confirm
    tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
    // Declared here, defined elsewhere.
    // NOTE(review): the name suggests the match may start at any offset of `input` — confirm
    tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
 };
 tagged_peg_parser build_tagged_peg_parser(
    const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@ -3,17 +3,30 @@
 #pragma once
 #include "common.h"
 #include "jinja/parser.h"
 #include "nlohmann/json_fwd.hpp"
 #include "peg-parser.h"
-#include <functional>
+#include "jinja/runtime.h"
 #include "jinja/caps.h"
 #include "nlohmann/json.hpp"
 #include <chrono>
 #include <functional>
 #include <map>
 #include <string>
 #include <vector>
-#include <map>
+
 using chat_template_caps = jinja::caps;
 using json = nlohmann::ordered_json;
 #include <nlohmann/json_fwd.hpp>
 struct common_chat_templates;
 namespace autoparser {
 struct templates_params;
 }  // namespace autoparser
 struct common_chat_tool_call {
    std::string name;
    std::string arguments;
@ -38,6 +51,67 @@ struct common_chat_msg_content_part {
    }
 };
 // A parsed Jinja chat template together with its BOS/EOS token strings and detected
 // capabilities.
 struct common_chat_template {
    jinja::program prog;     // program parsed from the template source
    std::string bos_tok;     // BOS token string given at construction
    std::string eos_tok;     // EOS token string given at construction
    std::string src;         // template source as reported by the lexer result
    chat_template_caps caps; // capabilities computed from `prog`
    // Tokenizes `src`, parses the tokens into `prog`, stores the token strings and computes
    // `caps` from the parsed program.
    // NOTE(review): the stored source is `lexer_res.source`, not the `src` argument; presumably
    // the lexer may normalize the text — confirm.
    common_chat_template(const std::string & src, const std::string & bos_token, const std::string & eos_token) {
        jinja::lexer lexer;
        auto lexer_res = lexer.tokenize(src);
        this->prog = jinja::parse_from_tokens(lexer_res);
        this->src = lexer_res.source;
        this->bos_tok = bos_token;
        this->eos_tok = eos_token;
        this->caps = jinja::caps_get(prog);
        // LOG_INF("%s: caps:\n%s\n", __func__, this->caps.to_string().c_str());
    }
    // Accessors for the stored template source and token strings
    const std::string & source() const { return src; }
    const std::string & bos_token() const { return bos_tok; }
    const std::string & eos_token() const { return eos_tok; }
    // Returns a copy of `messages` (which must be a JSON array) with `system_prompt` applied.
    //
    // Template without system-role support:
    //   - empty list     -> a single "user" message carrying the system prompt
    //   - non-empty list -> the prompt plus a blank line is prepended to the first message's
    //                       content (a missing "content" is treated as "")
    // Template with system-role support:
    //   - empty list, or first message is not "system" -> a "system" message is inserted in front
    //   - first message is "system"                    -> its content is replaced by the prompt
    //
    // TODO: this is ugly, refactor it somehow
    json add_system(const json & messages, const std::string & system_prompt) const {
        GGML_ASSERT(messages.is_array());
        auto msgs_copy = messages;
        if (!caps.supports_system_role) {
            if (msgs_copy.empty()) {
                msgs_copy.insert(msgs_copy.begin(), json{
                    {"role", "user"},
                    {"content", system_prompt}
                });
            } else {
                auto & first_msg = msgs_copy[0];
                if (!first_msg.contains("content")) {
                    first_msg["content"] = "";
                }
                // NOTE(review): get<std::string>() presumably throws when "content" is not a
                // JSON string (e.g. null or an array of typed parts) — confirm callers never
                // pass such messages here
                first_msg["content"] = system_prompt + "\n\n"
                    + first_msg["content"].get<std::string>();
            }
        } else {
            if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
                msgs_copy.insert(msgs_copy.begin(), json{
                    {"role", "system"},
                    {"content", system_prompt}
                });
            } else if (msgs_copy[0].at("role") == "system") {
                // Existing system message: its content is overwritten, not appended to
                msgs_copy[0]["content"] = system_prompt;
            }
        }
        return msgs_copy;
    }
    // Returns a copy of the capabilities computed at construction
    chat_template_caps original_caps() const {
        return caps;
    }
 };
 struct common_chat_msg {
    std::string                               role;
    std::string                               content;
@ -50,9 +124,12 @@ struct common_chat_msg {
    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;
    bool empty() const {
-        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
               tool_name.empty() && tool_call_id.empty();
    }
-    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
@ -64,18 +141,14 @@ struct common_chat_msg {
            tool_calls[i].id = ids_cache[i];
        }
    }
    bool operator==(const common_chat_msg & other) const {
-        return role == other.role
+        return role == other.role && content == other.content && content_parts == other.content_parts &&
-            && content == other.content
+               tool_calls == other.tool_calls && reasoning_content == other.reasoning_content &&
-            && content_parts == other.content_parts
+               tool_name == other.tool_name && tool_call_id == other.tool_call_id;
            && tool_calls == other.tool_calls
            && reasoning_content == other.reasoning_content
            && tool_name == other.tool_name
            && tool_call_id == other.tool_call_id;
    }
    bool operator!=(const common_chat_msg & other) const {
        return !(*this == other);
    }
    bool operator!=(const common_chat_msg & other) const { return !(*this == other); }
 };
 struct common_chat_msg_diff {
@ -84,12 +157,12 @@ struct common_chat_msg_diff {
    size_t                tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;
-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv,
                                                           const common_chat_msg & msg_new);
    bool operator==(const common_chat_msg_diff & other) const {
-        return content_delta == other.content_delta
+        return content_delta == other.content_delta && tool_call_index == other.tool_call_index &&
-        && tool_call_index == other.tool_call_index
+               tool_call_delta == other.tool_call_delta;
        && tool_call_delta == other.tool_call_delta;
    }
 };
@ -107,36 +180,10 @@ enum common_chat_tool_choice {
 enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
    COMMON_CHAT_FORMAT_HERMES_2_PRO,
    COMMON_CHAT_FORMAT_COMMAND_R7B,
    COMMON_CHAT_FORMAT_GRANITE,
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
    COMMON_CHAT_FORMAT_GLM_4_5,
    COMMON_CHAT_FORMAT_MINIMAX_M2,
    COMMON_CHAT_FORMAT_KIMI_K2,
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,
    COMMON_CHAT_FORMAT_SOLAR_OPEN,
    COMMON_CHAT_FORMAT_EXAONE_MOE,
    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
    COMMON_CHAT_FORMAT_PEG_NATIVE,
    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
    COMMON_CHAT_FORMAT_COUNT,  // Not a format, just the # formats
 };
@ -165,6 +212,9 @@ struct common_chat_params {
    std::string                         grammar;
    bool                                grammar_lazy         = false;
    bool                                thinking_forced_open = false;
    bool                                supports_thinking    = false;
    std::string                         thinking_start_tag;  // e.g., "<think>"
    std::string                         thinking_end_tag;    // e.g., "</think>"
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
@ -180,6 +230,7 @@ struct common_chat_parser_params {
    bool                    reasoning_in_content = false;
    bool                    thinking_forced_open = false;
    bool                    parse_tool_calls     = true;
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
    common_chat_parser_params(const common_chat_params & chat_params) {
@ -193,12 +244,13 @@ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
 void common_chat_templates_free(struct common_chat_templates * tmpls);
-struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
+struct common_chat_templates_deleter {
    void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); }
 };
 typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
-common_chat_templates_ptr common_chat_templates_init(
+common_chat_templates_ptr common_chat_templates_init(const struct llama_model * model,
                                    const struct llama_model * model,
                                                     const std::string &        chat_template_override,
                                                     const std::string &        bos_token_override = "",
                                                     const std::string &        eos_token_override = "");
@ -206,28 +258,24 @@ common_chat_templates_ptr common_chat_templates_init(
 bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
 std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
-
+struct common_chat_params common_chat_templates_apply(const struct common_chat_templates *        tmpls,
 struct common_chat_params      common_chat_templates_apply(
    const struct common_chat_templates * tmpls,
                                                      const struct common_chat_templates_inputs & inputs);
 // Format single message, while taking into account the position of that message in chat history
-std::string common_chat_format_single(
+std::string common_chat_format_single(const struct common_chat_templates * tmpls,
        const struct common_chat_templates * tmpls,
                                      const std::vector<common_chat_msg> & past_msg,
                                      const common_chat_msg &              new_msg,
                                      bool                                 add_ass,
                                      bool                                 use_jinja);
 // Returns an example of formatted chat
-std::string common_chat_format_example(
+std::string common_chat_format_example(const struct common_chat_templates *       tmpls,
    const struct common_chat_templates * tmpls,
                                       bool                                       use_jinja,
                                       const std::map<std::string, std::string> & chat_template_kwargs);
 const char *            common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
-common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
 // used by arg and server
 const char *            common_reasoning_format_name(common_reasoning_format format);
@ -250,3 +298,10 @@ nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
 std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
    const autoparser::templates_params & inputs,
    const std::optional<json> & messages_override = std::nullopt,
    const std::optional<json> & tools_override = std::nullopt,
    const std::optional<json> & additional_context = std::nullopt);
--- a/common/common.cpp
+++ b/common/common.cpp
@ -676,7 +676,7 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    size_t offset = 0;
    while (offset < filename.size()) {
-        utf8_parse_result result = parse_utf8_codepoint(filename, offset);
+        utf8_parse_result result = common_parse_utf8_codepoint(filename, offset);
        if (result.status != utf8_parse_result::SUCCESS) {
            return false;
--- a/common/common.h
+++ b/common/common.h
@ -104,6 +104,8 @@ enum llama_example {
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,
    LLAMA_EXAMPLE_RESULTS,
    LLAMA_EXAMPLE_EXPORT_GRAPH_OPS,
    LLAMA_EXAMPLE_COUNT,
 };
@ -234,6 +236,14 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
    // reasoning budget sampler parameters
    // these are populated by the server/CLI based on chat template params
    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
    bool                     reasoning_budget_activate_immediately = false;
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
    bool backend_sampling = false;
    bool has_logit_bias() const {
@ -410,7 +420,8 @@ struct common_params {
    struct common_params_model model;
-    std::string model_alias          = ""; // model alias                                                   // NOLINT
+    std::set<std::string> model_alias;     // model aliases                                                 // NOLINT
    std::set<std::string> model_tags;      // model tags (informational, not used for routing)              // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string system_prompt        = "";                                                                  // NOLINT
@ -455,6 +466,8 @@ struct common_params {
    bool   kl_divergence    = false; // compute KL divergence
    bool check             = false; // check rather than generate results for llama-results
    bool usage             = false; // print usage
    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
@ -521,7 +534,8 @@ struct common_params {
    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
    bool    cache_prompt        = true;  // whether to enable prompt caching
-    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
+    int32_t n_ctx_checkpoints   = 32;     // max number of context checkpoints per slot
    int32_t checkpoint_every_nt = 8192;   // make a checkpoint every n tokens during prefill
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
    std::string hostname      = "127.0.0.1";
@ -531,7 +545,9 @@ struct common_params {
    bool use_jinja = true;                                                                                  // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
    int reasoning_budget = -1;
    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
@ -544,6 +560,7 @@ struct common_params {
    // webui configs
    bool webui = true;
    bool webui_mcp_proxy = false;
    std::string webui_config_json;
    // "advanced" endpoints are disabled by default for better security
@ -868,7 +885,7 @@ std::string common_detokenize(
 // Embedding utils
 //
-// TODO: repace embd_norm with an enum
+// TODO: replace embd_norm with an enum
 void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
@ -910,7 +927,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate|gate_up)_(ch|)exps";
 inline std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
--- a/common/console.cpp
+++ b/common/console.cpp
@ -80,6 +80,8 @@ namespace console {
    static termios      initial_state;
 #endif
    static completion_callback completion_cb = nullptr;
    //
    // Init and cleanup
    //
@ -493,7 +495,7 @@ namespace console {
    }
    static void set_line_contents(std::string new_line, std::string & line, std::vector<int> & widths, size_t & char_pos,
-                                  size_t & byte_pos) {
+                                  size_t & byte_pos, int cursor_byte_pos = -1) {
        move_to_line_start(char_pos, byte_pos, widths);
        clear_current_line(widths);
@ -503,6 +505,7 @@ namespace console {
        char_pos = 0;
        size_t idx = 0;
        int back_width = 0;
        while (idx < line.size()) {
            size_t advance = 0;
            char32_t cp = decode_utf8(line, idx, advance);
@ -511,10 +514,17 @@ namespace console {
            if (real_width < 0) real_width = 0;
            widths.push_back(real_width);
            idx += advance;
            if (cursor_byte_pos >= 0 && static_cast<size_t>(cursor_byte_pos) < idx) {
                back_width += real_width;
            } else {
                ++char_pos;
                byte_pos = idx;
            }
        }
        if (cursor_byte_pos >= 0) {
            move_cursor(-back_width);
        }
    }
    static void move_to_line_start(size_t & char_pos, size_t & byte_pos, const std::vector<int> & widths) {
        int back_width = 0;
@ -784,6 +794,20 @@ namespace console {
                break;
            }
            if (completion_cb && input_char == '\t') {
                auto candidates = completion_cb(line, byte_pos);
                if (!candidates.empty()) {
                    if (candidates.size() > 1 || candidates[0].first != line) {
                        // TODO?: Display all candidates
                        set_line_contents(candidates[0].first, line, widths, char_pos, byte_pos, candidates[0].second);
                    } else {
                        // TODO: Move cursor to new byte_pos
                    }
                    continue;
                }
            }
            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D */) {
                end_of_stream = true;
                break;
@ -1062,6 +1086,10 @@ namespace console {
        return readline_advanced(line, multiline_input);
    }
    // Installs the callback that the line editor consults for completion candidates
    // when Tab is pressed. The callback is stored in the file-level `completion_cb`,
    // replacing whatever was registered before. The Tab handler tests `completion_cb`
    // for truthiness first, so installing an empty callback turns completion off.
    void set_completion_callback(completion_callback cb) {
        completion_cb = cb;
    }
    namespace spinner {
        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
        static std::condition_variable cv_stop;
--- a/common/console.h
+++ b/common/console.h
@ -4,7 +4,9 @@
 #include "common.h"
 #include <functional>
 #include <string>
 #include <vector>
 enum display_type {
    DISPLAY_TYPE_RESET = 0,
@ -21,6 +23,9 @@ namespace console {
    void set_display(display_type display);
    bool readline(std::string & line, bool multiline_input);
    // Completion hook for the interactive line editor.
    //
    // Arguments (as passed by the Tab handler in console.cpp): the current line contents
    // and the cursor position within that line, expressed as a byte offset.
    //
    // Returns a list of candidates. Each pair holds the full replacement text for the line
    // and the byte offset inside that text where the cursor should be placed afterwards.
    // An empty list means "no completion", and the key press is then handled as usual.
    // Only the first candidate is applied at present.
    using completion_callback = std::function<std::vector<std::pair<std::string, size_t>>(std::string_view, size_t)>;
    // Registers `cb` as the completion hook, replacing any previously registered one.
    void set_completion_callback(completion_callback cb);
    namespace spinner {
        void start();
        void stop();
--- a/common/debug.h
+++ b/common/debug.h
@ -18,7 +18,7 @@ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml
 // prints tensors that are processed in the computation graph
 // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
 // non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
-// The template parameter determins whether an error should be thrown whenever a NaN is encountered
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
 template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
--- a/common/http.h
+++ b/common/http.h
@ -7,6 +7,7 @@ struct common_http_url {
    std::string user;
    std::string password;
    std::string host;
    int port;
    std::string path;
 };
@ -47,6 +48,20 @@ static common_http_url common_http_parse_url(const std::string & url) {
        parts.host = rest;
        parts.path = "/";
    }
    auto colon_pos = parts.host.find(':');
    if (colon_pos != std::string::npos) {
        parts.port = std::stoi(parts.host.substr(colon_pos + 1));
        parts.host = parts.host.substr(0, colon_pos);
    } else if (parts.scheme == "http") {
        parts.port = 80;
    } else if (parts.scheme == "https") {
        parts.port = 443;
    } else {
        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
    }
    return parts;
 }
@ -68,7 +83,7 @@ static std::pair<httplib::Client, common_http_url> common_http_client(const std:
    }
 #endif
-    httplib::Client cli(parts.scheme + "://" + parts.host);
+    httplib::Client cli(parts.scheme + "://" + parts.host + ":" + std::to_string(parts.port));
    if (!parts.user.empty()) {
        cli.set_basic_auth(parts.user, parts.password);
--- a/common/jinja/README.md
+++ b/common/jinja/README.md
@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh
  - **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input`
  - **Many-to-one** (e.g., join): same as one-to-many
-For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag.
+For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.
 **Enabling Input Marking:**
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@ -1,3 +1,4 @@
 #include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
@ -36,12 +37,16 @@ static void caps_try_execute(jinja::program & prog,
    auto tools = ctx.get_val("tools");
    bool success = false;
    std::string result;
    try {
        jinja::runtime runtime(ctx);
-        runtime.execute(prog);
+        auto results = runtime.execute(prog);
        auto parts = jinja::runtime::gather_string_parts(results);
        result = parts->as_string().str();
        success = true;
    } catch (const std::exception & e) {
        JJ_DEBUG("Exception during execution: %s", e.what());
        result = "";
        // ignore exceptions during capability analysis
    }
@ -90,6 +95,8 @@ caps caps_get(jinja::program & prog) {
        return v->stats.ops.find(op_name) != v->stats.ops.end();
    };
    JJ_DEBUG("%s\n", ">>> Running capability check: typed content");
    // case: typed content support
    caps_try_execute(
        prog,
@ -120,6 +127,7 @@ caps caps_get(jinja::program & prog) {
        }
    );
    JJ_DEBUG("%s\n", ">>> Running capability check: system prompt");
    // case: system prompt support
    caps_try_execute(
@ -150,7 +158,9 @@ caps caps_get(jinja::program & prog) {
        }
    );
-    // case: tools support
+    JJ_DEBUG("%s\n", ">>> Running capability check: single tool support");
    // case: tools support: single call
    caps_try_execute(
        prog,
        [&]() {
@ -162,10 +172,10 @@ caps caps_get(jinja::program & prog) {
                },
                {
                    {"role", "assistant"},
-                    {"content", "Assistant message"},
+                    {"content", ""}, // Some templates expect content to be empty with tool calls
                    {"tool_calls", json::array({
                        {
-                            {"id", "call1"},
+                            {"id", "call00001"},
                            {"type", "function"},
                            {"function", {
                                {"name", "tool1"},
@ -173,19 +183,18 @@ caps caps_get(jinja::program & prog) {
                                    {"arg", "value"}
                                }}
                            }}
                        },
                        {
                            {"id", "call2"},
                            {"type", "function"},
                            {"function", {
                                {"name", "tool2"},
                                {"arguments", {
                                    {"arg", "value"}
                                }}
                            }}
                        }
                    })}
                },
                {
                    {"role", "tool"},
                    {"content", "Tool response"},
                    {"tool_call_id", "call00001"}
                },
                {
                    {"role", "assistant"},
                    {"content", "The tool response was 'tool response'"}
                },
                {
                    {"role", "user"},
                    {"content", "User message"},
@ -199,7 +208,7 @@ caps caps_get(jinja::program & prog) {
                    {"name", "tool"},
                    {"type", "function"},
                    {"function", {
-                        {"name", "tool"},
+                        {"name", "tool1"},
                        {"description", "Tool description"},
                        {"parameters", {
                            {"type", "object"},
@ -224,6 +233,7 @@ caps caps_get(jinja::program & prog) {
            auto & tool_name = tools->at(0)->at("function")->at("name");
            caps_print_stats(tool_name, "tools[0].function.name");
            caps_print_stats(tools, "tools");
            if (!tool_name->stats.used) {
                result.supports_tools = false;
            }
@ -233,6 +243,93 @@ caps caps_get(jinja::program & prog) {
            if (!tool_calls->stats.used) {
                result.supports_tool_calls = false;
            }
        }
    );
    JJ_DEBUG("%s\n", ">>> Running capability check: parallel tool support");
    // case: tools support: parallel calls
    caps_try_execute(
        prog,
        [&]() {
            // messages
            return json::array({
                {
                    {"role", "user"},
                    {"content", "User message"},
                },
                {
                    {"role", "assistant"},
                    {"content", ""}, // Some templates expect content to be empty with tool calls
                    {"tool_calls", json::array({
                        {
                            {"id", "call00001"},
                            {"type", "function"},
                            {"function", {
                                {"name", "tool1"},
                                {"arguments", {
                                    {"arg", "value"}
                                }}
                            }}
                        },
                        {
                            {"id", "call00002"},
                            {"type", "function"},
                            {"function", {
                                {"name", "tool1"},
                                {"arguments", {
                                    {"arg", "value"}
                                }}
                            }}
                        }
                    })}
                },
                {
                    {"role", "tool"},
                    {"content", "Tool response"},
                    {"tool_call_id", "call00001"}
                },
                {
                    {"role", "assistant"},
                    {"content", "The tool response was 'tool response'"}
                },
                {
                    {"role", "user"},
                    {"content", "User message"},
                },
            });
        },
        [&]() {
            // tools
            return json::array({
                {
                    {"name", "tool"},
                    {"type", "function"},
                    {"function", {
                        {"name", "tool1"},
                        {"description", "Tool description"},
                        {"parameters", {
                            {"type", "object"},
                            {"properties", {
                                {"arg", {
                                    {"type", "string"},
                                    {"description", "Arg description"},
                                }},
                            }},
                            {"required", json::array({ "arg" })},
                        }},
                    }},
                },
            });
        },
        [&](bool success, value & messages, value & /*tools*/) {
            if (!success) {
                result.supports_parallel_tool_calls = false;
                return;
            }
            auto & tool_calls = messages->at(1)->at("tool_calls");;
            caps_print_stats(tool_calls, "messages[1].tool_calls");
            // check for second tool call usage
            auto & tool_call_1 = tool_calls->at(1)->at("function");
@ -243,6 +340,8 @@ caps caps_get(jinja::program & prog) {
        }
    );
    JJ_DEBUG("%s\n", ">>> Running capability check: preserve reasoning");
    // case: preserve reasoning content in chat history
    caps_try_execute(
        prog,
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@ -114,8 +114,10 @@ value binary_expression::execute_impl(context & ctx) {
    // Logical operators
    if (op.value == "and") {
        JJ_DEBUG("Executing logical test: %s AND %s", left->type().c_str(), right->type().c_str());
        return left_val->as_bool() ? right->execute(ctx) : std::move(left_val);
    } else if (op.value == "or") {
        JJ_DEBUG("Executing logical test: %s OR %s", left->type().c_str(), right->type().c_str());
        return left_val->as_bool() ? std::move(left_val) : right->execute(ctx);
    }
@ -838,7 +840,7 @@ value call_expression::execute_impl(context & ctx) {
    for (auto & arg_stmt : this->args) {
        auto arg_val = arg_stmt->execute(ctx);
        JJ_DEBUG("  Argument type: %s", arg_val->type().c_str());
-        args.push_back(std::move(arg_val));
+        args.push_back(arg_val);
    }
    // execute callee
    value callee_val = callee->execute(ctx);
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@ -12,8 +12,8 @@
 #include <set>
 #include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <unordered_map>
 namespace jinja {
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -27,11 +27,11 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    if (separator_rule.empty()) {
        if (min_items == 1 && !has_max) {
            return item_rule + "+";
        } else if (min_items == 0 && !has_max) {
            return item_rule + "*";
        } else {
            return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
        }
        if (min_items == 0 && !has_max) {
            return item_rule + "*";
        }
        return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
    }
    auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
@ -41,7 +41,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }
-static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+static void build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
    auto has_min = min_value != std::numeric_limits<int64_t>::min();
    auto has_max = max_value != std::numeric_limits<int64_t>::max();
@ -128,14 +128,14 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
    if (has_min && has_max) {
        if (min_value < 0 && max_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
+            build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
            out << ")";
            return;
        }
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
+            build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
            out << ") | ";
            min_value = 0;
        }
@ -159,7 +159,7 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@ -194,7 +194,7 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
+            build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@ -213,10 +213,10 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
                more_digits(0, less_decimals);
                out << " | ";
            }
-            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
+            build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
+            build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@ -232,7 +232,7 @@ struct BuiltinRule {
    std::vector<std::string> deps;
 };
-std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"boolean", {"(\"true\" | \"false\") space", {}}},
    {"decimal-part", {"[0-9]{1,16}", {}}},
    {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
@ -247,7 +247,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
    {"null", {"\"null\" space", {}}},
 };
-std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
    {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
    {"date-time", {"date \"T\" time", {"date", "time"}}},
@ -260,22 +260,26 @@ static bool is_reserved_name(const std::string & name) {
    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
        std::unordered_set<std::string> s;
        s.insert("root");
-        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
+        for (const auto & p : PRIMITIVE_RULES) {
-        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
+            s.insert(p.first);
        }
        for (const auto & p : STRING_FORMAT_RULES) {
            s.insert(p.first);
        }
        return s;
    }();
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
-std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
+static std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
+static std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
-std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
+static std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
-std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
+static std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };
-std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
+static std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+static std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
 static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
    std::smatch match;
@ -322,7 +326,7 @@ private:
        if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
            _rules[esc_name] = rule;
            return esc_name;
-        } else {
+        }
        int i = 0;
        while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
            i++;
@ -331,10 +335,10 @@ private:
        _rules[key] = rule;
        return key;
    }
    }
    std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
        std::vector<std::string> rules;
        rules.reserve(alt_schemas.size());
        for (size_t i = 0; i < alt_schemas.size(); i++) {
            rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
        }
@ -398,6 +402,7 @@ private:
                flush_literal();
                std::vector<std::string> results;
                results.reserve(ret.size());
                for (const auto & item : ret) {
                    results.push_back(to_rule(item));
                }
@ -551,7 +556,7 @@ private:
            TrieNode() : is_end_of_string(false) {}
            void insert(const std::string & string) {
-                auto node = this;
+                auto *node = this;
                for (char c : string) {
                    node = &node->children[c];
                }
@ -676,7 +681,7 @@ private:
                if (ks.empty()) {
                    return res;
                }
-                std::string k = ks[0];
+                const std::string& k = ks[0];
                std::string kv_rule_name = prop_kv_rule_names[k];
                std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
                if (first_is_optional) {
@ -779,13 +784,13 @@ public:
                        std::string pointer = ref.substr(ref.find('#') + 1);
                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
-                            std::string sel = tokens[i];
+                            const std::string& sel = tokens[i];
                            if (target.is_object() && target.contains(sel)) {
                                target = target[sel];
                            } else if (target.is_array()) {
                                size_t sel_index;
                                try {
-                                    sel_index = std::stoul(sel);
+                                    sel_index = std::stoull(sel);
                                } catch (const std::invalid_argument & e) {
                                    sel_index = target.size();
                                }
@ -802,7 +807,7 @@ public:
                        _refs[ref] = target;
                    }
                } else {
-                    for (auto & kv : n.items()) {
+                    for (const auto & kv : n.items()) {
                        visit_refs(kv.value());
                    }
                }
@ -812,7 +817,7 @@ public:
        visit_refs(schema);
    }
-    std::string _generate_constant_rule(const json & value) {
+    static std::string _generate_constant_rule(const json & value) {
        return format_literal(value.dump());
    }
@ -823,10 +828,12 @@ public:
        if (schema.contains("$ref")) {
            return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
-        } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
+        }
        if (schema.contains("oneOf") || schema.contains("anyOf")) {
            std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
-        } else if (schema_type.is_array()) {
+        }
        if (schema_type.is_array()) {
            std::vector<json> schema_types;
            for (const auto & t : schema_type) {
                json schema_copy(schema);
@ -834,15 +841,18 @@ public:
                schema_types.push_back(schema_copy);
            }
            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
-        } else if (schema.contains("const")) {
+        }
        if (schema.contains("const")) {
            return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
-        } else if (schema.contains("enum")) {
+        }
        if (schema.contains("enum")) {
            std::vector<std::string> enum_values;
            for (const auto & v : schema["enum"]) {
                enum_values.push_back(_generate_constant_rule(v));
            }
            return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
-        } else if ((schema_type.is_null() || schema_type == "object")
+        }
        if ((schema_type.is_null() || schema_type == "object")
                && (schema.contains("properties") ||
                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
            std::unordered_set<std::string> required;
@ -863,11 +873,12 @@ public:
                _build_object_rule(
                    properties, required, name,
                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
+        }
        if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
            std::unordered_set<std::string> required;
            std::vector<std::pair<std::string, json>> properties;
            std::map<std::string, size_t> enum_values;
-            std::string hybrid_name = name;
+            const std::string& hybrid_name = name;
            std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                if (comp_schema.contains("$ref")) {
                    add_component(_refs[comp_schema["$ref"]], is_required);
@ -890,9 +901,9 @@ public:
                  // todo warning
                }
            };
-            for (auto & t : schema["allOf"]) {
+            for (const auto & t : schema["allOf"]) {
                if (t.contains("anyOf")) {
-                    for (auto & tt : t["anyOf"]) {
+                    for (const auto & tt : t["anyOf"]) {
                        add_component(tt, false);
                    }
                } else {
@ -911,7 +922,8 @@ public:
                }
            }
            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
-        } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
+        }
        if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
            json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
            if (items.is_array()) {
                std::string rule = "\"[\" space ";
@ -923,7 +935,7 @@ public:
                }
                rule += " \"]\" space";
                return _add_rule(rule_name, rule);
-            } else {
+            }
            std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
            int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
            json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
@ -931,19 +943,23 @@ public:
            return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
        }
-        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
+        if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
-        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
+        }
        if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+        }
        if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
            auto prim_name = schema_format + "-string";
            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
-        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+        }
        if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
-        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
+        }
        if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
            int64_t min_value = std::numeric_limits<int64_t>::min();
            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
@ -958,12 +974,18 @@ public:
            }
            std::stringstream out;
            out << "(";
-            _build_min_max_int(min_value, max_value, out);
+            build_min_max_int(min_value, max_value, out);
            out << ") space";
            return _add_rule(rule_name, out.str());
-        } else if (schema.empty() || schema_type == "object") {
+        }
        if (schema.empty() || schema_type == "object") {
            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
-        } else {
+        }
        if (schema_type.is_null() && schema.is_object()) {
            // No type constraint and no recognized structural keywords (e.g. {"description": "..."}).
            // Per JSON Schema semantics this is equivalent to {} and accepts any value.
            return _add_rule(rule_name, _add_primitive("value", PRIMITIVE_RULES.at("value")));
        }
        if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
            _errors.push_back("Unrecognized schema: " + schema.dump());
            return "";
@ -971,7 +993,6 @@ public:
        // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
        return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
    }
    }
    void check_errors() {
        if (!_errors.empty()) {
@ -985,7 +1006,7 @@ public:
    std::string format_grammar() {
        std::stringstream ss;
        for (const auto & kv : _rules) {
-            ss << kv.first << " ::= " << kv.second << std::endl;
+            ss << kv.first << " ::= " << kv.second << '\n';
        }
        return ss.str();
    }
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
@ -1,14 +1,15 @@
 #include "common.h"
 #include "peg-parser.h"
 #include "json-schema-to-grammar.h"
 #include "unicode.h"
-#include <nlohmann/json.hpp>
+#include "common.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "unicode.h"
 #include <algorithm>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <nlohmann/json.hpp>
 #include <regex>
 #include <stdexcept>
 #include <unordered_set>
@ -34,8 +35,7 @@ static bool is_hex_digit(const char c) {
 // This is used in common_peg_until_parser and to build a GBNF exclusion grammar
 struct trie {
    struct node {
-        size_t depth = 0;
+        std::map<uint32_t, size_t> children;  // Use uint32_t to store Unicode codepoints
        std::map<unsigned char, size_t> children;
        bool is_word;
    };
@ -55,15 +55,22 @@ struct trie {
        size_t current = 0; // Start at root
        size_t pos = start_pos;
        // LOG_DBG("%s: checking at pos %zu, sv='%s'\n", __func__, start_pos, std::string(sv).c_str());
        while (pos < sv.size()) {
-            auto it = nodes[current].children.find(sv[pos]);
+            auto result = common_parse_utf8_codepoint(sv, pos);
            if (result.status != utf8_parse_result::SUCCESS) {
                break;
            }
            auto it = nodes[current].children.find(result.codepoint);
            if (it == nodes[current].children.end()) {
                // Can't continue matching
                return match_result{match_result::NO_MATCH};
            }
            current = it->second;
-            pos++;
+            pos += result.bytes_consumed;
            // Check if we've matched a complete word
            if (nodes[current].is_word) {
@ -82,22 +89,22 @@ struct trie {
    }
    struct prefix_and_next {
-        std::string prefix;
+        std::vector<uint32_t> prefix;
-        std::string next_chars;
+        std::vector<uint32_t> next_chars;
    };
    std::vector<prefix_and_next> collect_prefix_and_next() {
-        std::string prefix;
+        std::vector<uint32_t>        prefix;
        std::vector<prefix_and_next> result;
        collect_prefix_and_next(0, prefix, result);
        return result;
    }
  private:
-    void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
+    void collect_prefix_and_next(size_t index, std::vector<uint32_t> & prefix, std::vector<prefix_and_next> & out) {
        if (!nodes[index].is_word) {
            if (!nodes[index].children.empty()) {
-                std::string chars;
+                std::vector<uint32_t> chars;
                chars.reserve(nodes[index].children.size());
                for (const auto & p : nodes[index].children) {
                    chars.push_back(p.first);
@ -107,7 +114,7 @@ struct trie {
        }
        for (const auto & p : nodes[index].children) {
-            unsigned char ch = p.first;
+            uint32_t ch = p.first;
            auto child = p.second;
            prefix.push_back(ch);
            collect_prefix_and_next(child, prefix, out);
@ -123,11 +130,19 @@ struct trie {
    void insert(const std::string & word) {
        size_t current = 0;
-        for (unsigned char ch : word) {
+        size_t pos     = 0;
        while (pos < word.length()) {
            auto result = common_parse_utf8_codepoint(word, pos);
            if (result.status != utf8_parse_result::SUCCESS) {
                break;
            }
            uint32_t ch = result.codepoint;
            pos += result.bytes_consumed;
            auto it = nodes[current].children.find(ch);
            if (it == nodes[current].children.end()) {
                size_t child = create_node();
                nodes[child].depth = nodes[current].depth + 1;
                nodes[current].children[ch] = child;
                current = child;
            } else {
@ -286,6 +301,32 @@ struct parser_executor {
    // Constructs an executor for one parse attempt: stores the parser arena, the parse
    // context, and the input offset (`start`) at which the attempt begins (kept as start_pos).
    parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
        : arena(arena), ctx(ctx), start_pos(start) {}
    // Indentation prefix for debug trace lines: two spaces per level of ctx.parse_depth.
    std::string debug_indent() const {
        return std::string(ctx.parse_depth * 2, ' ');
    }
    // Returns a single-line preview of the input for debug traces.
    //
    //   pos - offset into ctx.input where the preview starts
    //   len - maximum number of bytes of input to include (default 60)
    //
    // Yields "<EOF>" when `pos` is at or past the end of the input. Newline, carriage
    // return and tab are rendered as the two-character sequences \n, \r and \t so the
    // preview never breaks the trace line. "..." is appended when input remains beyond
    // the previewed range.
    std::string debug_input_snippet(size_t pos, size_t len = 60) const {
        if (pos >= ctx.input.size()) {
            return "<EOF>";
        }
        // pos < size is guaranteed here, so this subtraction cannot wrap. Comparing `len`
        // against the remaining byte count avoids the overflow that `pos + len` would hit
        // for very large `len` values (e.g. npos).
        const size_t remaining = ctx.input.size() - pos;
        const bool   truncated = len < remaining;
        const auto   snippet   = ctx.input.substr(pos, len);

        std::string result;
        result.reserve(snippet.size() + (truncated ? 3 : 0));
        // Escape newlines (and tabs) for display
        for (char c : snippet) {
            switch (c) {
                case '\n':
                    result += "\\n";
                    break;
                case '\r':
                    result += "\\r";
                    break;
                case '\t':
                    result += "\\t";
                    break;
                default:
                    result += c;
                    break;
            }
        }
        if (truncated) {
            result += "...";
        }
        return result;
    }
    // Epsilon parser: unconditionally reports success at start_pos; the input is never inspected.
    common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
    }
@ -308,7 +349,7 @@ struct parser_executor {
        auto pos = start_pos;
        for (auto i = 0u; i < p.literal.size(); ++i) {
            if (pos >= ctx.input.size()) {
-                if (!ctx.is_partial) {
+                if (!ctx.is_lenient()) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -323,12 +364,32 @@ struct parser_executor {
    }
    common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
        if (ctx.is_debug()) {
            LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos,
                    debug_input_snippet(start_pos).c_str(), p.children.size());
        }
        ctx.parse_depth++;
        auto pos = start_pos;
        std::vector<common_peg_ast_id> nodes;
-        for (const auto & child_id : p.children) {
+        for (size_t i = 0; i < p.children.size(); i++) {
            const auto & child_id = p.children[i];
            if (ctx.is_debug()) {
                fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
            }
            auto result = arena.parse(child_id, ctx, pos);
            if (ctx.is_debug()) {
                fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i,
                        common_peg_parse_result_type_name(result.type), result.start, result.end);
            }
            if (result.fail()) {
                ctx.parse_depth--;
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str());
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
            }
@ -337,28 +398,65 @@ struct parser_executor {
            }
            if (result.need_more_input()) {
                ctx.parse_depth--;
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str());
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
            }
            pos = result.end;
        }
        ctx.parse_depth--;
        if (ctx.is_debug()) {
            fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos);
        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
    }
    // Ordered choice: tries each alternative in p.children, in order, at start_pos and
    // returns the result of the first alternative that does not fail. If every alternative
    // fails, returns FAIL at start_pos.
    //
    // ctx.parse_depth is incremented while the alternatives are evaluated and decremented
    // again on every exit path. When ctx.is_debug() is set, a trace of each attempt and of
    // the final outcome is written to stderr.
    common_peg_parse_result operator()(const common_peg_choice_parser & p) {
        if (ctx.is_debug()) {
            fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos,
                    debug_input_snippet(start_pos).c_str(), p.children.size());
        }
        ctx.parse_depth++;
        // Every alternative is attempted from the same position; pos is never advanced here.
        auto pos = start_pos;
-        for (const auto & child_id : p.children) {
+        for (size_t i = 0; i < p.children.size(); i++) {
            // The index i is only needed to label the option in the debug trace.
            const auto & child_id = p.children[i];
            if (ctx.is_debug()) {
                fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
            }
            auto result = arena.parse(child_id, ctx, pos);
            if (ctx.is_debug()) {
                fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i,
                        common_peg_parse_result_type_name(result.type));
            }
            // Any non-failing result ends the search; later alternatives are not tried.
            if (!result.fail()) {
                // Restore the depth before tracing so the outcome line aligns with the start line.
                ctx.parse_depth--;
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(),
                            common_peg_parse_result_type_name(result.type), i);
                }
                return result;
            }
        }
        // All alternatives failed.
        ctx.parse_depth--;
        if (ctx.is_debug()) {
            fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str());
        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
    }
    common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
        if (ctx.is_debug()) {
            fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos,
                    debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count);
        }
        ctx.parse_depth++;
        auto pos = start_pos;
        int match_count = 0;
        std::vector<common_peg_ast_id> nodes;
@ -366,14 +464,26 @@ struct parser_executor {
        // Try to match up to max_count times (or unlimited if max_count is -1)
        while (p.max_count == -1 || match_count < p.max_count) {
            if (pos >= ctx.input.size()) {
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count);
                }
                break;
            }
            auto result = arena.parse(p.child, ctx, pos);
            if (ctx.is_debug()) {
                fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count,
                        common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size());
                fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str());
            }
            if (result.success()) {
                // Prevent infinite loop on empty matches
                if (result.end == pos) {
                    if (ctx.is_debug()) {
                        fprintf(stderr, "%s  REPEAT: empty match, stopping\n", debug_indent().c_str());
                    }
                    break;
                }
@ -391,21 +501,43 @@ struct parser_executor {
                    nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
                }
                ctx.parse_depth--;
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(),
                            match_count, nodes.size());
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
            }
            // Child failed - stop trying
            if (ctx.is_debug()) {
                fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str());
            }
            break;
        }
        // Check if we got enough matches
        if (p.min_count > 0 && match_count < p.min_count) {
-            if (pos >= ctx.input.size() && ctx.is_partial) {
+            ctx.parse_depth--;
            if (pos >= ctx.input.size() && ctx.is_lenient()) {
                if (ctx.is_debug()) {
                    fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(),
                            match_count, p.min_count);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
            }
            if (ctx.is_debug()) {
                fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count,
                        p.min_count);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
        }
        ctx.parse_depth--;
        if (ctx.is_debug()) {
            fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count,
                    nodes.size());
        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
    }
@ -434,10 +566,10 @@ struct parser_executor {
    common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
        // Parse a single UTF-8 codepoint (not just a single byte)
-        auto result = parse_utf8_codepoint(ctx.input, start_pos);
+        auto result = common_parse_utf8_codepoint(ctx.input, start_pos);
        if (result.status == utf8_parse_result::INCOMPLETE) {
-            if (!ctx.is_partial) {
+            if (!ctx.is_lenient()) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
@ -468,7 +600,7 @@ struct parser_executor {
        // Try to match up to max_count times (or unlimited if max_count is -1)
        while (p.max_count == -1 || match_count < p.max_count) {
-            auto result = parse_utf8_codepoint(ctx.input, pos);
+            auto result = common_parse_utf8_codepoint(ctx.input, pos);
            if (result.status == utf8_parse_result::INCOMPLETE) {
                if (match_count >= p.min_count) {
@ -476,7 +608,7 @@ struct parser_executor {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
                }
                // Not enough matches yet
-                if (!ctx.is_partial) {
+                if (!ctx.is_lenient()) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -517,7 +649,7 @@ struct parser_executor {
        // Check if we got enough matches
        if (match_count < p.min_count) {
-            if (pos >= ctx.input.size() && ctx.is_partial) {
+            if (pos >= ctx.input.size() && ctx.is_lenient()) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
@ -526,30 +658,22 @@ struct parser_executor {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
    }
-    static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
+    static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos, const char delimiter) {
        ++pos; // consume '\'
        if (pos >= ctx.input.size()) {
-            if (!ctx.is_partial) {
+            if (!ctx.is_lenient()) {
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
            }
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
        }
-        switch (ctx.input[pos]) {
+        char c = ctx.input[pos];
-            case '"':
+        if (c == delimiter || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || c == 't') {
            case '\\':
            case '/':
            case 'b':
            case 'f':
            case 'n':
            case 'r':
            case 't':
            ++pos;
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
-            case 'u':
+        } else if (c == 'u') {
            return handle_unicode_escape(ctx, start, pos);
-            default:
+        } else {
                // Invalid escape sequence
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
        }
    }
@ -558,7 +682,7 @@ struct parser_executor {
        ++pos; // consume 'u'
        for (int i = 0; i < 4; ++i) {
            if (pos >= ctx.input.size()) {
-                if (!ctx.is_partial) {
+                if (!ctx.is_lenient()) {
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
                }
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
@ -571,28 +695,28 @@ struct parser_executor {
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
    }
-    common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
+    common_peg_parse_result operator()(const common_peg_string_parser & p) {
        auto pos = start_pos;
        // Parse string content (without quotes)
        while (pos < ctx.input.size()) {
            char c = ctx.input[pos];
-            if (c == '"') {
+            if (c == p.delimiter) {
-                // Found closing quote - success (don't consume it)
+                // Found closing delimiter - success (don't consume it)
                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
            }
            if (c == '\\') {
-                auto result = handle_escape_sequence(ctx, start_pos, pos);
+                auto result = handle_escape_sequence(ctx, start_pos, pos, p.delimiter);
                if (!result.success()) {
                    return result;
                }
            } else {
-                auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
+                auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
                if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
-                    if (!ctx.is_partial) {
+                    if (!ctx.is_lenient()) {
                        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                    }
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -607,7 +731,7 @@ struct parser_executor {
        }
        // Reached end without finding closing quote
-        if (!ctx.is_partial) {
+        if (!ctx.is_lenient()) {
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
        }
        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@ -621,11 +745,11 @@ struct parser_executor {
        size_t last_valid_pos = start_pos;
        while (pos < ctx.input.size()) {
-            auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
+            auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
            if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
                // Incomplete UTF-8 sequence
-                if (!ctx.is_partial) {
+                if (!ctx.is_lenient()) {
                    // Input is complete but UTF-8 is incomplete = malformed
                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
                }
@ -655,7 +779,7 @@ struct parser_executor {
            last_valid_pos = pos;
        }
-        if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
+        if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) {
            // Reached the end of a partial stream, there might still be more input that we need to consume.
            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
        }
@ -694,6 +818,9 @@ struct parser_executor {
    common_peg_parse_result operator()(const common_peg_tag_parser & p) {
        // Parse the child
        if (ctx.is_debug()) {
            fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str());
        }
        auto result = arena.parse(p.child, ctx, start_pos);
        if (!result.fail()) {
@ -755,6 +882,31 @@ common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
    return id;
 }
 // Writes one line describing `node` (id, optional rule name, optional tag, matched text)
 // to `oss`, prefixed by two spaces per `indent` level, then recurses into each child with
 // indent + 1. Despite the name, the traversal is depth-first (pre-order).
 static void bfs_node(common_peg_ast_arena &arena, std::ostringstream & oss, const common_peg_ast_node & node, int indent) {
    // indentation prefix
    for (int level = indent; level > 0; --level) {
        oss << "  ";
    }

    // header: id plus the optional annotations
    oss << "NODE " << node.id;
    const bool has_rule = !node.rule.empty();
    if (has_rule) {
        oss << " (rule " << node.rule << ")";
    }
    const bool has_tag = !node.tag.empty();
    if (has_tag) {
        oss << " (tag " << node.tag << ")";
    }
    oss << " ['" << node.text << "']\n";

    // children, one level deeper
    const int child_indent = indent + 1;
    for (const auto child_id : node.children) {
        const auto & child_node = arena.get(child_id);
        bfs_node(arena, oss, child_node, child_indent);
    }
 }
 std::string common_peg_ast_arena::dump() {
    std::ostringstream oss;
    for (auto & node : nodes_) {
        bfs_node(*this, oss, node, 0);
    }
    return oss.str();
 }
 void common_peg_arena::resolve_refs() {
    // Walk through all parsers and replace refs with their corresponding rule IDs
    for (auto & parser : parsers_) {
@ -785,7 +937,7 @@ void common_peg_arena::resolve_refs() {
                                 std::is_same_v<T, common_peg_ref_parser> ||
                                 std::is_same_v<T, common_peg_until_parser> ||
                                 std::is_same_v<T, common_peg_literal_parser> ||
-                                 std::is_same_v<T, common_peg_json_string_parser> ||
+                                 std::is_same_v<T, common_peg_string_parser> ||
                                 std::is_same_v<T, common_peg_chars_parser> ||
                                 std::is_same_v<T, common_peg_any_parser> ||
                                 std::is_same_v<T, common_peg_space_parser>) {
@ -803,9 +955,21 @@ void common_peg_arena::resolve_refs() {
 }
 // Public entry point for dumping a parser: starts a traversal with an empty set of
 // already-visited parser ids and delegates to dump_impl.
 std::string common_peg_arena::dump(common_peg_parser_id id) const {
    std::unordered_set<common_peg_parser_id> seen_ids;
    std::string rendered = dump_impl(id, seen_ids);
    return rendered;
 }
 std::string common_peg_arena::dump_impl(common_peg_parser_id                       id,
                                        std::unordered_set<common_peg_parser_id> & visited) const {
    // Check for cycles
    if (visited.count(id)) {
        return "[cycle]";
    }
    visited.insert(id);
    const auto & parser = parsers_.at(id);
-    return std::visit([this](const auto & p) -> std::string {
+    return std::visit([this, &visited](const auto & p) -> std::string {
        using T = std::decay_t<decltype(p)>;
        if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
@ -819,24 +983,27 @@ std::string common_peg_arena::dump(common_peg_parser_id id) const {
        } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
            std::vector<std::string> parts;
            for (const auto & child : p.children) {
-                parts.push_back(dump(child));
+                parts.push_back(dump_impl(child, visited));
            }
            return "Sequence(" + string_join(parts, ", ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
            std::vector<std::string> parts;
            for (const auto & child : p.children) {
-                parts.push_back(dump(child));
+                parts.push_back(dump_impl(child, visited));
            }
            return "Choice(" + string_join(parts, ", ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
            if (p.max_count == -1) {
-                return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
+                return "Repetition(" + dump_impl(p.child, visited) + ", " + std::to_string(p.min_count) +
                        ", unbounded)";
            }
-            return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
+            return "Repetition(" + dump_impl(p.child, visited) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
-            return "And(" + dump(p.child) + ")";
+            return "And(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
-            return "Not(" + dump(p.child) + ")";
+            return "Not(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
            return "Atomic(" + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
            return "Any";
        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
@ -846,16 +1013,20 @@ std::string common_peg_arena::dump(common_peg_parser_id id) const {
                return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
            }
            return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
-        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+        } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-            return "JsonString()";
+            return "String(" + std::string(1, p.delimiter) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
            return "Until(" + string_join(p.delimiters, " | ") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
-            return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
+            return "Schema(" + dump_impl(p.child, visited) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
        } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
-            return "Rule(" + p.name + ", " + dump(p.child) + ")";
+            return "Rule(" + p.name + ", " + dump_impl(p.child, visited) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
            return "Ref(" + p.name + ")";
        } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
            return "Tag(" + p.tag + ", " + dump(p.child) + ")";
        } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
            return "Atomic(" + dump(p.child) + ")";
        } else {
            return "Unknown";
        }
@ -1054,7 +1225,32 @@ common_peg_arena common_peg_parser_builder::build() {
    return std::move(arena_);
 }
 // String primitives
 // Adds a string-content parser terminated by `delimiter` to the arena and returns it
 // wrapped as a builder-level parser handle.
 common_peg_parser common_peg_parser_builder::string_content(char delimiter) {
    auto node_id = arena_.add_parser(common_peg_string_parser{delimiter});
    return wrap(node_id);
 }
 // Rule "double-quoted-string": '"' content '"' followed by space().
 common_peg_parser common_peg_parser_builder::double_quoted_string() {
    return rule("double-quoted-string", [this]() {
        // sub-parsers are created in the same left-to-right order as they appear in the sequence
        auto opening  = literal("\"");
        auto content  = string_content('"');
        auto closing  = literal("\"");
        auto trailing = space();
        return sequence({opening, content, closing, trailing});
    });
 }
 // Rule "single-quoted-string": "'" content "'" followed by space().
 common_peg_parser common_peg_parser_builder::single_quoted_string() {
    return rule("single-quoted-string", [this]() {
        // sub-parsers are created in the same left-to-right order as they appear in the sequence
        auto opening  = literal("'");
        auto content  = string_content('\'');
        auto closing  = literal("'");
        auto trailing = space();
        return sequence({opening, content, closing, trailing});
    });
 }
 // Rule "quoted-string": a double-quoted string, or failing that a single-quoted one.
 common_peg_parser common_peg_parser_builder::quoted_string() {
    return rule("quoted-string", [this]() {
        auto with_double = double_quoted_string();
        auto with_single = single_quoted_string();
        return choice({with_double, with_single});
    });
 }
 // JSON parsers
 common_peg_parser common_peg_parser_builder::json_number() {
   return rule("json-number", [this]() {
        auto digit1_9 = chars("[1-9]", 1, 1);
@ -1062,13 +1258,17 @@ common_peg_parser common_peg_parser_builder::json_number() {
        auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
        auto frac = sequence({literal("."), digits});
        auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
-        return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
+        // Negative lookahead: only commit the number when the next character can't extend it.
        // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed.
        // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming).
        auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1));
        return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() });
    });
 }
 common_peg_parser common_peg_parser_builder::json_string() {
    return rule("json-string", [this]() {
-        return sequence({literal("\""), json_string_content(), literal("\""), space()});
+        return sequence({literal("\""), string_content('"'), literal("\""), space()});
    });
 }
@ -1130,8 +1330,81 @@ common_peg_parser common_peg_parser_builder::json() {
    });
 }
-common_peg_parser common_peg_parser_builder::json_string_content() {
+common_peg_parser common_peg_parser_builder::python_string() {
-    return wrap(arena_.add_parser(common_peg_json_string_parser{}));
+    return rule("python-string", [this]() {
        return choice({double_quoted_string(), single_quoted_string()});
    });
 }
 // Python numbers reuse the JSON number parser unchanged.
 common_peg_parser common_peg_parser_builder::python_number() {
    auto number = json_number();
    return number;
 }
 // Rule "python-bool": the literal True or False, followed by space().
 common_peg_parser common_peg_parser_builder::python_bool() {
    return rule("python-bool", [this]() {
        auto keyword  = choice({literal("True"), literal("False")});
        auto trailing = space();
        return sequence({keyword, trailing});
    });
 }
 // Rule "python-none": the literal None, followed by space().
 common_peg_parser common_peg_parser_builder::python_null() {
    return rule("python-none", [this]() {
        auto keyword  = literal("None");
        auto trailing = space();
        return sequence({keyword, trailing});
    });
 }
 // Rule "python-dict":
 //   '{' ws ( '}' | pair (ws ',' ws pair)* ws '}' ) ws
 //   pair -> python-string ws ':' ws python-value
 common_peg_parser common_peg_parser_builder::python_dict() {
    return rule("python-dict", [this]() {
        auto gap = space();

        // key : value
        auto pair = sequence({python_string(), gap, literal(":"), gap, python_value()});

        // pair followed by any number of ", pair"
        auto more_pairs = zero_or_more(sequence({gap, literal(","), gap, pair}));
        auto pairs      = sequence({pair, more_pairs});

        auto open = literal("{");
        // either an immediately closed dict or the pair list followed by the closing brace
        auto rest = choice({
            literal("}"),
            sequence({pairs, gap, literal("}")})
        });
        return sequence({open, gap, rest, gap});
    });
 }
 // Rule "python-array":
 //   '[' ws ( ']' | value (',' ws value)* ws ']' ) ws
 common_peg_parser common_peg_parser_builder::python_array() {
    return rule("python-array", [this]() {
        auto gap = space();

        // first value followed by any number of ", value"
        auto first = python_value();
        auto more  = zero_or_more(sequence({literal(","), gap, python_value()}));
        auto items = sequence({first, more});

        auto open = literal("[");
        // either an immediately closed list or the item list followed by the closing bracket
        auto rest = choice({
            literal("]"),
            sequence({items, gap, literal("]")})
        });
        return sequence({open, gap, rest, gap});
    });
 }
 // Rule "python-value": ordered choice over every Python literal form.
 //   value -> dict | array | string | number | bool | None
 common_peg_parser common_peg_parser_builder::python_value() {
    return rule("python-value", [this]() {
        // alternatives are built (and tried) in this exact order
        auto as_dict   = python_dict();
        auto as_array  = python_array();
        auto as_string = python_string();
        auto as_number = python_number();
        auto as_bool   = python_bool();
        auto as_none   = python_null();
        return choice({as_dict, as_array, as_string, as_number, as_bool, as_none});
    });
 }
 // Builds a parser for a "marker": text enclosed in a pair of angle brackets (<...>) or
 // square brackets ([...]). The angle-bracket form is the first alternative of the choice.
 // Unlike the neighbouring builders, the result is not wrapped in a named rule.
 common_peg_parser common_peg_parser_builder::marker() {
    // '<' , everything until '>' , '>'
    auto sharp_bracket_parser = literal("<") + until(">") + literal(">");
    // '[' , everything until ']' , ']'
    auto square_bracket_parser = literal("[") + until("]") + literal("]");
    return choice({ sharp_bracket_parser, square_bracket_parser });
 }
 common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
@ -1145,17 +1418,54 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key
    });
 }
-
+static std::string gbnf_escape_char_class(uint32_t c) {
-static std::string gbnf_escape_char_class(char c) {
+    if (c == '-' || c == ']' || c == '[' || c == '\\') {
-    switch (c) {
+        return "\\" + std::string(1, (char) c);
        case '\n': return "\\n";
        case '\t': return "\\t";
        case '\r': return "\\r";
        case '\\': return "\\\\";
        case ']':  return "\\]";
        case '[':  return "\\[";
        default:   return std::string(1, c);
    }
    // Escape whitespace control characters
    if (c == '\n') {
        return "\\n";
    }
    if (c == '\t') {
        return "\\t";
    }
    if (c == '\r') {
        return "\\r";
    }
    // Printable ASCII
    if (c >= 0x20 && c <= 0x7E) {
        return std::string(1, (char) c);
    }
    // Hex escape
    char         buf[16];
    const char * hex = "0123456789ABCDEF";
    if (c <= 0xFF) {
        buf[0] = '\\';
        buf[1] = 'x';
        buf[2] = hex[(c >> 4) & 0xF];
        buf[3] = hex[c & 0xF];
        buf[4] = '\0';
    } else if (c <= 0xFFFF) {
        buf[0] = '\\';
        buf[1] = 'u';
        buf[2] = hex[(c >> 12) & 0xF];
        buf[3] = hex[(c >> 8) & 0xF];
        buf[4] = hex[(c >> 4) & 0xF];
        buf[5] = hex[c & 0xF];
        buf[6] = '\0';
    } else {
        buf[0] = '\\';
        buf[1] = 'U';
        for (int i = 0; i < 8; i++) {
            buf[2 + i] = hex[(c >> ((7 - i) * 4)) & 0xF];
        }
        buf[10] = '\0';
    }
    return std::string(buf);
 }
 static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
@ -1173,12 +1483,12 @@ static std::string gbnf_excluding_pattern(const std::vector<std::string> & strin
        std::string cls;
        cls.reserve(chars.size());
-        for (const auto & ch : chars) {
+        for (uint32_t ch : chars) {
            cls += gbnf_escape_char_class(ch);
        }
        if (!pre.empty()) {
-            pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
+            pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]";
        } else {
            pattern += "[^" + cls + "]";
        }
@ -1208,7 +1518,7 @@ static std::unordered_set<std::string> collect_reachable_rules(
                          std::is_same_v<T, common_peg_chars_parser> ||
                          std::is_same_v<T, common_peg_space_parser> ||
                          std::is_same_v<T, common_peg_any_parser> ||
-                          std::is_same_v<T, common_peg_json_string_parser>) {
+                          std::is_same_v<T, common_peg_string_parser>) {
                // These parsers do not have any children
            } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
                for (auto child : p.children) {
@ -1344,8 +1654,9 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo
                    return result + "{" + std::to_string(p.min_count) + "}";
                }
                return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
-            } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+            } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-                return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
+                const std::string delim(1, p.delimiter);
                return R"(( [^)" + delim + R"(\\] | "\\" ( [)" + delim + R"(\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
            } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
                if (p.delimiters.empty()) {
                    return ".*";
@ -1475,8 +1786,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant &
                {"min_count", p.min_count},
                {"max_count", p.max_count}
            };
-        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
+        } else if constexpr (std::is_same_v<T, common_peg_string_parser>) {
-            return json{{"type", "json_string"}};
+            return json{{"type", "string"}, {"delimiter", std::string(1, p.delimiter)}};
        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
            return json{{"type", "until"}, {"delimiters", p.delimiters}};
        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
@ -1603,8 +1914,15 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json
        }
        return parser;
    }
-    if (type == "json_string") {
+    if (type == "string") {
-        return common_peg_json_string_parser{};
+        if (!j.contains("delimiter")) {
            throw std::runtime_error("string parser missing delimiter field.");
        }
        std::string delimiter = j["delimiter"];
        if (delimiter.empty()) {
            throw std::runtime_error("string parser delimiter is empty.");
        }
        return common_peg_string_parser{delimiter[0]};
    }
    if (type == "until") {
        if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@ -4,6 +4,7 @@
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <string>
 #include <string_view>
 #include <functional>
@ -111,6 +112,8 @@ class common_peg_ast_arena {
    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
    std::string dump();
 };
 struct common_peg_parse_result {
@ -136,21 +139,43 @@ struct common_peg_parse_result {
    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
 };
 // Bit flags controlling a parse run. Values are combined with the bitwise operators below.
 //
 // The underlying type is fixed to int: the operators cast arbitrary bit patterns
 // (e.g. the result of ~flag) back to the enum, and for an enumeration without a fixed
 // underlying type a value outside the range of its enumerators is undefined behaviour.
 enum common_peg_parse_flags : int {
    COMMON_PEG_PARSE_FLAG_NONE    = 0,
    COMMON_PEG_PARSE_FLAG_LENIENT = 1 << 0,
    COMMON_PEG_PARSE_FLAG_DEBUG   = 1 << 1,
 };
 // Bitwise OR of two flag sets.
 inline common_peg_parse_flags operator|(common_peg_parse_flags a, common_peg_parse_flags b) {
    return static_cast<common_peg_parse_flags>(int(a) | int(b));
 }
 // In-place OR; returns a reference to the updated left operand.
 inline common_peg_parse_flags & operator|=(common_peg_parse_flags & a, common_peg_parse_flags b) {
    return a = a | b;
 }
 // Bitwise AND of two flag sets (use to test or mask flags).
 inline common_peg_parse_flags operator&(common_peg_parse_flags a, common_peg_parse_flags b) {
    return static_cast<common_peg_parse_flags>(int(a) & int(b));
 }
 // Bitwise complement; intended for clearing flags via `flags & ~FLAG`.
 inline common_peg_parse_flags operator~(common_peg_parse_flags a) {
    return static_cast<common_peg_parse_flags>(~int(a));
 }
 struct common_peg_parse_context {
    std::string input;
-    bool is_partial;
+    common_peg_parse_flags flags;
    common_peg_ast_arena ast;
    int parse_depth;
-    common_peg_parse_context()
+    common_peg_parse_context(common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
-        : is_partial(false), parse_depth(0) {}
+        : flags(flags), parse_depth(0) {}
-    common_peg_parse_context(const std::string & input)
+    common_peg_parse_context(const std::string & input, common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
-        : input(input), is_partial(false), parse_depth(0) {}
+        : input(input), flags(flags), parse_depth(0) {}
-    common_peg_parse_context(const std::string & input, bool is_partial)
+    bool is_lenient() const { return flags & COMMON_PEG_PARSE_FLAG_LENIENT; }
-        : input(input), is_partial(is_partial), parse_depth(0) {}
+    bool is_debug() const { return flags & COMMON_PEG_PARSE_FLAG_DEBUG; }
 };
 class common_peg_arena;
@ -206,7 +231,9 @@ struct common_peg_chars_parser {
    int max_count;  // -1 for unbounded
 };
-struct common_peg_json_string_parser {};
+struct common_peg_string_parser {
    char delimiter;
 };
 struct common_peg_until_parser {
    std::vector<std::string> delimiters;
@ -254,7 +281,7 @@ using common_peg_parser_variant = std::variant<
    common_peg_any_parser,
    common_peg_space_parser,
    common_peg_chars_parser,
-    common_peg_json_string_parser,
+    common_peg_string_parser,
    common_peg_until_parser,
    common_peg_schema_parser,
    common_peg_rule_parser,
@ -299,6 +326,8 @@ class common_peg_arena {
    friend class common_peg_parser_builder;
  private:
    std::string dump_impl(common_peg_parser_id id, std::unordered_set<common_peg_parser_id> & visited) const;
    common_peg_parser_id add_parser(common_peg_parser_variant parser);
    void add_rule(const std::string & name, common_peg_parser_id id);
@ -404,6 +433,18 @@ class common_peg_parser_builder {
    //   S -> A{n}
    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
    // Matches a double-quoted string: '"' content '"' space
    common_peg_parser double_quoted_string();
    // Matches a single-quoted string: "'" content "'" space
    common_peg_parser single_quoted_string();
    // Matches a string that accepts both double-quoted and single-quoted styles.
    common_peg_parser quoted_string();
    // Matches string content without the surrounding delimiter.
    common_peg_parser string_content(char delimiter);
    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
    //   value -> object | array | string | number | true | false | null
    common_peg_parser json();
@ -414,14 +455,24 @@ class common_peg_parser_builder {
    common_peg_parser json_bool();
    common_peg_parser json_null();
    // Matches JSON string content without the surrounding quotes.
    // Useful for extracting content within a JSON string.
    common_peg_parser json_string_content();
    // Matches a JSON object member with a key and associated parser as the
    // value.
    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
    // Creates a complete Python format parser supporting dicts, arrays, strings, numbers, booleans, and None.
    // Differs from JSON: uses True/False/None, accepts both single and double-quoted strings.
    //   value -> dict | array | string | number | True | False | None
    common_peg_parser python_value();
    common_peg_parser python_dict();
    common_peg_parser python_string();
    common_peg_parser python_array();
    common_peg_parser python_number();
    common_peg_parser python_bool();
    common_peg_parser python_null();
    // A marker, i.e. text delimited by a pair of <> or []
    common_peg_parser marker();
    // Wraps a parser with JSON schema metadata for grammar generation.
    // Used internally to convert JSON schemas to GBNF grammar rules.
    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@ -0,0 +1,219 @@
 #include "reasoning-budget.h"
 #include "common.h"
 #include "unicode.h"
 #include "log.h"
 #include <cmath>
 #include <cstdint>
 #include <string>
 #include <vector>
 struct token_matcher {
    std::vector<llama_token> tokens;
    size_t pos = 0;
    bool advance(llama_token token) {
        if (tokens.empty()) {
            return false;
        }
        if (token == tokens[pos]) {
            pos++;
            if (pos >= tokens.size()) {
                pos = 0;
                return true;
            }
        } else {
            pos = 0;
            if (token == tokens[0]) {
                pos = 1;
            }
        }
        return false;
    }
    void reset() { pos = 0; }
 };
 // Per-sampler state of the reasoning-budget sampler.
 // NOTE: common_reasoning_budget_init fills this struct with positional aggregate
 // initialization, so the member order below must stay in sync with it.
 struct common_reasoning_budget_ctx {
    const llama_vocab * vocab;               // used to turn tokens into text for the UTF-8 completeness check; may be nullptr (check is skipped)
    token_matcher start_matcher;             // detects the sequence that starts counting (IDLE -> COUNTING)
    token_matcher end_matcher;               // detects the natural end sequence while counting/waiting (-> DONE)
    std::vector<llama_token> forced_tokens;  // tokens forced one per apply() call while in FORCING
    int32_t budget;           // maximum tokens in reasoning block
    int32_t remaining;        // tokens remaining in budget
    common_reasoning_budget_state state;     // current state of the state machine
    // for forcing
    size_t force_pos;         // next position in forced_tokens to force
 };
 // Sampler interface callback: returns the constant display name of this sampler.
 static const char * common_reasoning_budget_name(const struct llama_sampler * /*smpl*/) {
    const char * sampler_name = "reasoning-budget";
    return sampler_name;
 }
 // Sampler interface callback: observes one accepted token and advances the state machine.
 //   IDLE                  : watches for the start sequence
 //   COUNTING/WAITING_UTF8 : watches for the natural end sequence, counts down the budget and,
 //                           once it is exhausted, switches to FORCING (possibly after waiting
 //                           for a token whose text is complete UTF-8)
 //   FORCING / DONE        : nothing to do here (forcing progress is tracked in apply())
 static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_token token) {
    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
    switch (ctx->state) {
        case REASONING_BUDGET_IDLE:
        {
            // start sequence completed by this token -> begin counting with a full budget
            if (ctx->start_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_COUNTING;
                ctx->remaining = ctx->budget;
                LOG_INF("reasoning-budget: activated, budget=%d tokens\n", ctx->budget);
                // a non-positive budget leaves nothing to count: force the end right away
                if (ctx->remaining <= 0) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
                    LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
                }
            }
            break;
        }
        case REASONING_BUDGET_COUNTING:
        case REASONING_BUDGET_WAITING_UTF8:
        {
            // the model closed the reasoning block on its own: stop interfering.
            // The token that completes the end sequence is not charged to the budget.
            if (ctx->end_matcher.advance(token)) {
                ctx->state = REASONING_BUDGET_DONE;
                LOG_INF("reasoning-budget: deactivated (natural end)\n");
                break;
            }
            // without a vocab the token text is unavailable, so every token is treated as complete
            bool utf8_complete = true;
            if (ctx->vocab != nullptr) {
                const std::string piece = common_token_to_piece(ctx->vocab, token, false);
                utf8_complete = common_utf8_is_complete(piece);
            }
            if (ctx->state == REASONING_BUDGET_WAITING_UTF8) {
                // budget already spent: start forcing as soon as a token with complete UTF-8 text arrives
                if (utf8_complete) {
                    ctx->state = REASONING_BUDGET_FORCING;
                    ctx->force_pos = 0;
                    ctx->end_matcher.reset();
                    LOG_INF("reasoning-budget: UTF-8 complete, now forcing end sequence\n");
                }
            } else if (ctx->state == REASONING_BUDGET_COUNTING) {
                // charge this token to the budget
                ctx->remaining--;
                if (ctx->remaining <= 0) {
                    if (utf8_complete) {
                        ctx->state = REASONING_BUDGET_FORCING;
                        ctx->force_pos = 0;
                        ctx->end_matcher.reset();
                        LOG_INF("reasoning-budget: budget exhausted, forcing end sequence\n");
                    } else {
                        // the last token's text is not complete UTF-8: let generation continue
                        // until a complete one is seen before forcing
                        ctx->state = REASONING_BUDGET_WAITING_UTF8;
                        ctx->end_matcher.reset();
                        LOG_INF("reasoning-budget: budget exhausted, waiting for UTF-8 completion\n");
                    }
                }
            }
            break;
        }
        case REASONING_BUDGET_FORCING:
            // force_pos is advanced in apply(), not here.
            // This ensures the first forced token isn't skipped when the sampler
            // is initialized directly in FORCING state (e.g. COUNTING + budget=0)
            break;
        case REASONING_BUDGET_DONE:
            // terminal state: accepted tokens are ignored
            break;
    }
 }
 static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (common_reasoning_budget_ctx *) smpl->ctx;
    if (ctx->state != REASONING_BUDGET_FORCING) {
        // passthrough — don't modify logits
        return;
    }
    if (ctx->force_pos >= ctx->forced_tokens.size()) {
        return;
    }
    const llama_token forced = ctx->forced_tokens[ctx->force_pos];
    // set all logits to -inf except the forced token
    for (size_t i = 0; i < cur_p->size; i++) {
        if (cur_p->data[i].id != forced) {
            cur_p->data[i].logit = -INFINITY;
        }
    }
    // advance to next forced token (done here rather than in accept so that
    // the first forced token isn't skipped when starting in FORCING state)
    ctx->force_pos++;
    if (ctx->force_pos >= ctx->forced_tokens.size()) {
        ctx->state = REASONING_BUDGET_DONE;
        LOG_INF("reasoning-budget: forced sequence complete, done\n");
    }
 }
 // Sampler interface callback: returns the sampler to IDLE with a full budget, no partial
 // start/end match and the forced sequence rewound to its beginning.
 static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
    auto * state = (common_reasoning_budget_ctx *) smpl->ctx;

    // forget partially matched sequences
    state->start_matcher.reset();
    state->end_matcher.reset();

    // restore counters and go back to waiting for the start sequence
    state->remaining = state->budget;
    state->force_pos = 0;
    state->state     = REASONING_BUDGET_IDLE;
 }
 static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
    return common_reasoning_budget_init(
        ctx->vocab,
        ctx->start_matcher.tokens,
        ctx->end_matcher.tokens,
        ctx->forced_tokens,
        ctx->budget,
        ctx->state);
 }
 static void common_reasoning_budget_free(struct llama_sampler * smpl) {
    delete (common_reasoning_budget_ctx *) smpl->ctx;
 }
 // Callback table wiring the functions above into the llama_sampler interface.
 // The backend_* hooks are left unset (nullptr).
 static struct llama_sampler_i common_reasoning_budget_i = {
    /* .name              = */ common_reasoning_budget_name,
    /* .accept            = */ common_reasoning_budget_accept,
    /* .apply             = */ common_reasoning_budget_apply,
    /* .reset             = */ common_reasoning_budget_reset,
    /* .clone             = */ common_reasoning_budget_clone,
    /* .free              = */ common_reasoning_budget_free,
    /* .backend_init      = */ nullptr,
    /* .backend_accept    = */ nullptr,
    /* .backend_apply     = */ nullptr,
    /* .backend_set_input = */ nullptr,
 };
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
        const std::vector<llama_token> & start_tokens,
        const std::vector<llama_token> & end_tokens,
        const std::vector<llama_token> & forced_tokens,
        int32_t                          budget,
        common_reasoning_budget_state    initial_state) {
    // promote COUNTING with budget <= 0 to FORCING
    if (initial_state == REASONING_BUDGET_COUNTING && budget <= 0) {
        initial_state = REASONING_BUDGET_FORCING;
    }
    return llama_sampler_init(
        /* .iface = */ &common_reasoning_budget_i,
        /* .ctx   = */ new common_reasoning_budget_ctx {
            /* .vocab         = */ vocab,
            /* .start_matcher = */ { start_tokens, 0 },
            /* .end_matcher   = */ { end_tokens, 0 },
            /* .forced_tokens = */ forced_tokens,
            /* .budget        = */ budget,
            /* .remaining     = */ budget,
            /* .state         = */ initial_state,
            /* .force_pos     = */ 0,
        }
    );
 }
--- a/common/reasoning-budget.h
+++ b/common/reasoning-budget.h
@ -0,0 +1,41 @@
 #pragma once
 #include "llama.h"
 #include <cstdint>
 #include <vector>
 // States of the reasoning budget sampler.
 // NOTE: the declaration order below is not the transition order; per the state
 // machine description on common_reasoning_budget_init, WAITING_UTF8 is entered
 // before FORCING.
 enum common_reasoning_budget_state {
    REASONING_BUDGET_IDLE,         // waiting for start sequence
    REASONING_BUDGET_COUNTING,     // counting down tokens
    REASONING_BUDGET_FORCING,      // forcing budget message + end sequence
    REASONING_BUDGET_WAITING_UTF8, // budget exhausted, waiting for UTF-8 completion
    REASONING_BUDGET_DONE,         // passthrough forever
 };
 // Creates a reasoning budget sampler that limits token generation inside a
 // reasoning block (e.g. between <think> and </think>).
 //
 // State machine: IDLE -> COUNTING -> WAITING_UTF8 -> FORCING -> DONE
 //   IDLE:         passthrough, watching for start_tokens sequence
 //   COUNTING:     counting down remaining tokens, watching for natural end_tokens
 //   WAITING_UTF8: budget exhausted, allowing tokens to complete a UTF-8 sequence
 //   FORCING:      forces forced_tokens token-by-token (all other logits -> -inf)
 //   DONE:         passthrough forever
 //
 // Parameters:
 //   vocab         - vocabulary (used for UTF-8 boundary detection; can be nullptr)
 //   start_tokens  - token sequence that activates counting
 //   end_tokens    - token sequence for natural deactivation
 //   forced_tokens - token sequence forced when budget expires
 //   budget        - max tokens allowed in the reasoning block
 //   initial_state - initial state of the sampler (e.g. IDLE or COUNTING)
 //                   note: COUNTING with budget <= 0 is promoted to FORCING
 //
 // Returns:
 //   the sampler produced by llama_sampler_init() around a heap-allocated context
 //   NOTE(review): presumably released through llama_sampler_free() - confirm
 //
 struct llama_sampler * common_reasoning_budget_init(
        const struct llama_vocab       * vocab,
        const std::vector<llama_token> & start_tokens,
        const std::vector<llama_token> & end_tokens,
        const std::vector<llama_token> & forced_tokens,
        int32_t                          budget,
        common_reasoning_budget_state    initial_state);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -2,6 +2,7 @@
 #include "common.h"
 #include "log.h"
 #include "reasoning-budget.h"
 #include <algorithm>
 #include <cmath>
@ -250,6 +251,17 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
        }
    }
    // reasoning budget sampler — added first so it can force tokens before other samplers
    if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
        samplers.push_back(common_reasoning_budget_init(
            vocab,
            params.reasoning_budget_start,
            params.reasoning_budget_end,
            params.reasoning_budget_forced,
            params.reasoning_budget_tokens,
            params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
    }
    if (params.has_logit_bias()) {
        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
    }
--- a/common/unicode.cpp
+++ b/common/unicode.cpp
@ -1,14 +1,20 @@
 #include "unicode.h"
 #include <algorithm>
 #include <cassert>
 #include <stdexcept>
 #include <string>
 #include <vector>
 // implementation adopted from src/unicode.cpp
-size_t utf8_sequence_length(unsigned char first_byte) {
+size_t common_utf8_sequence_length(unsigned char first_byte) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
    return lookup[highbits];
 }
-utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
+utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }
@ -62,3 +68,57 @@ utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
    // Invalid first byte
    return utf8_parse_result(utf8_parse_result::INVALID);
 }
 // Reports whether `s` ends on a complete UTF-8 sequence.
 // Walks backwards over at most the last 4 bytes looking for the first byte that
 // is not a continuation byte (10xxxxxx); the string is complete when at least as
 // many bytes follow-and-include that byte as its lead-byte pattern announces.
 // An empty string is complete; a tail made only of continuation bytes is not.
 bool common_utf8_is_complete(const std::string & s) {
    if (s.empty()) {
        return true;
    }
    const int window = std::min(4, (int) s.size());
    for (int back = 1; back <= window; ++back) {
        const unsigned char byte = s[s.size() - back];
        if ((byte & 0xC0) == 0x80) {
            // continuation byte: keep looking for the byte that opened the sequence
            continue;
        }
        int needed = 1;
        if (byte >= 0xF0) {
            needed = 4;
        } else if (byte >= 0xE0) {
            needed = 3;
        } else if (byte >= 0xC0) {
            needed = 2;
        }
        return back >= needed;
    }
    return false;
 }
 // Encodes every codepoint in `cps` with common_unicode_cpt_to_utf8() and
 // returns the concatenation, in input order.
 std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    std::string utf8;
    for (const uint32_t cpt : cps) {
        utf8 += common_unicode_cpt_to_utf8(cpt);
    }
    return utf8;
 }
 // Encodes a single codepoint as UTF-8 (1 to 4 bytes).
 // Throws std::invalid_argument("invalid codepoint") for values above 0x10ffff.
 std::string common_unicode_cpt_to_utf8(uint32_t cpt) {
    std::string out;
    const auto put = [&out](uint32_t bits) {
        out.push_back(static_cast<char>(bits));
    };
    // continuation byte carrying the 6 payload bits that start at bit `shift`
    const auto put_tail = [&](int shift) {
        put(0x80 | ((cpt >> shift) & 0x3f));
    };
    if (cpt <= 0x7f) {
        // single byte, value stored as-is
        put(cpt);
    } else if (cpt <= 0x7ff) {
        put(0xc0 | ((cpt >> 6) & 0x1f));
        put_tail(0);
    } else if (cpt <= 0xffff) {
        put(0xe0 | ((cpt >> 12) & 0x0f));
        put_tail(6);
        put_tail(0);
    } else if (cpt <= 0x10ffff) {
        put(0xf0 | ((cpt >> 18) & 0x07));
        put_tail(12);
        put_tail(6);
        put_tail(0);
    } else {
        throw std::invalid_argument("invalid codepoint");
    }
    return out;
 }
--- a/common/unicode.h
+++ b/common/unicode.h
@ -2,6 +2,8 @@
 #include <cstdint>
 #include <string_view>
 #include <vector>
 #include <string>
 // UTF-8 parsing utilities for streaming-aware unicode support
@ -16,7 +18,13 @@ struct utf8_parse_result {
 // Determine the expected length of a UTF-8 sequence from its first byte
 // Never returns 0: bytes 0x00-0xBF (including stray continuation bytes) yield 1
-size_t utf8_sequence_length(unsigned char first_byte);
+size_t common_utf8_sequence_length(unsigned char first_byte);
 // Check if a string ends with a complete UTF-8 sequence.
 bool common_utf8_is_complete(const std::string & s);
 // Parse a single UTF-8 codepoint from input
-utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
+utf8_parse_result common_parse_utf8_codepoint(std::string_view input, size_t offset);
 std::string common_unicode_cpts_to_utf8(const std::vector<uint32_t> & cps);
 std::string common_unicode_cpt_to_utf8(uint32_t cpt);
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -144,6 +144,7 @@ class ModelBase:
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
        self._is_nvfp4 = False
        # Apply heuristics to figure out typical tensor encoding based on first tensor's dtype
        # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -271,6 +272,9 @@ class ModelBase:
        return tensors
    def dequant_model(self):
        if self._is_nvfp4:
            return  # NVFP4 weights are repacked in _generate_nvfp4_tensors
        tensors_to_remove: list[str] = []
        new_tensors: dict[str, Callable[[], Tensor]] = {}
@ -516,6 +520,13 @@ class ModelBase:
        raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses")
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # skip NVFP4 auxiliary tensors (handled in _generate_nvfp4_tensors)
        if self._is_nvfp4:
            if name.endswith((".weight_scale", ".weight_scale_2", ".input_scale", ".k_scale", ".v_scale")):
                return []
            if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
                return []
        new_name = self.map_tensor_name(name)
        # Handle gate/up expert tensor fusion if enabled
@ -551,9 +562,135 @@ class ModelBase:
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Return the extra (name, tensor) pairs to emit; this version returns an empty tuple."""
        no_extra_tensors = tuple()
        return no_extra_tensors
    @staticmethod
    def _nvfp4_pack(weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
        """Repack NVFP4 ModelOpt tensors into ggml super-block layout.
        Preserves original E4M3 scale bits as UE4M3 (strip sign bit).
        The per-tensor scale2 factor is stored as a separate tensor and applied at inference time via ggml_mul().
        Returns (raw_data, logical_shape)."""
        out_features = weight.shape[0]
        n_blocks = scale.shape[1]
        # Unpack ModelOpt nibble-packed weights
        w = weight.reshape(out_features, n_blocks, 8)
        vals = torch.stack([w & 0x0F, w >> 4], dim=-1).reshape(out_features, n_blocks, 16)
        # Preserve original E4M3 scale bits as UE4M3 (strip sign bit)
        d_ue = scale.view(torch.uint8).numpy().reshape(out_features, n_blocks) & 0x7F
        qs = (vals[:, :, :8] | (vals[:, :, 8:] << 4)).to(torch.uint8).numpy()
        # Pack into super-blocks: [4 UE4M3 scales, 32 qs bytes] = 36 bytes per 64 elements
        n_super = n_blocks // 4
        d_grouped = d_ue.reshape(out_features, n_super, 4)
        qs_grouped = qs.reshape(out_features, n_super, 4, 8).reshape(out_features, n_super, 32)
        raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
        return raw, [out_features, n_super * 64]
    @staticmethod
    def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
        return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
    def _repack_nvfp4(self, new_name: str, weight: Tensor, scale: Tensor, scale2: Tensor):
        """Repack one NVFP4 weight and write it (plus its scale2, if needed) to the GGUF writer.

        The packed weight is always written under `new_name` with raw dtype NVFP4.
        When scale2 is not trivial (see _nvfp4_scale2_is_trivial) it is additionally
        written as a flattened float32 array under `new_name` with ".weight"
        replaced by ".scale".
        """
        packed, shape = self._nvfp4_pack(weight, scale)
        logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
        self.gguf_writer.add_tensor(new_name, packed, raw_dtype=gguf.GGMLQuantizationType.NVFP4)

        if self._nvfp4_scale2_is_trivial(scale2):
            # a trivial scale2 needs no companion tensor
            return

        scale2_f32 = scale2.float().numpy().flatten()
        scale_name = new_name.replace(".weight", ".scale")
        logger.info(f"  + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
        self.gguf_writer.add_tensor(scale_name, scale2_f32)
    def _generate_nvfp4_tensors(self):
        """Repack every NVFP4-quantized weight in self.model_tensors and write it to the GGUF writer.

        A tensor is treated as NVFP4-quantized when its name ends in ".weight" and a
        sibling "<name>.weight_scale" entry exists; everything else is skipped here.
        Per-expert projections (".experts.<id>.{gate,up,down}_proj.weight") are
        collected per (layer, projection) and written as one merged tensor through
        _flush_nvfp4_experts; all other weights go through _repack_nvfp4 directly.
        """
        # Per-layer expert merging to avoid holding all experts in memory
        # All three dicts are keyed by (layer index, projection type).
        expert_blocks: dict[tuple[int, str], list[tuple[int, np.ndarray]]] = {}
        expert_scales: dict[tuple[int, str], list[tuple[int, float]]] = {}
        expert_shapes: dict[tuple[int, str], list[int]] = {}
        # 0 means "expert count unknown": nothing is flushed early, only by the final loop
        n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=True) or 0
        # iterate over a snapshot of the keys
        for name in list(self.model_tensors.keys()):
            if not name.endswith(".weight"):
                continue
            scale_name = name.replace(".weight", ".weight_scale")
            scale2_name = name.replace(".weight", ".weight_scale_2")
            # no block scale next to the weight -> not handled by this method
            if scale_name not in self.model_tensors:
                continue
            # Force eager materialization of lazy tensors
            weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
            scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
            # a missing weight_scale_2 falls back to a scalar 1.0
            scale2 = LazyTorchTensor.to_eager(self.model_tensors.get(scale2_name, lambda: torch.tensor(1.0))())
            # Check if this is a per-expert tensor
            m = re.search(r'\.experts\.(\d+)\.(gate_proj|up_proj|down_proj)\.weight$', name)
            if m:
                expert_id = int(m.group(1))
                proj_type = m.group(2)
                bid_m = re.search(r'\.layers\.(\d+)\.', name)
                # names without a ".layers.<n>." component are grouped under layer 0
                bid = int(bid_m.group(1)) if bid_m else 0
                key = (bid, proj_type)
                raw, shape = self._nvfp4_pack(weight, scale)
                if key not in expert_blocks:
                    expert_blocks[key] = []
                    expert_scales[key] = []
                    # the shape of the first expert seen is recorded for the whole group
                    expert_shapes[key] = shape
                expert_blocks[key].append((expert_id, raw.copy()))
                # Collect per-expert scale2 (scalar per expert)
                expert_scales[key].append((expert_id, float(scale2.float().sum())))
                # Flush when all experts for this (layer, proj) are collected
                if n_experts > 0 and len(expert_blocks[key]) >= n_experts:
                    self._flush_nvfp4_experts(key, expert_blocks, expert_scales, expert_shapes, bid, proj_type)
            else:
                new_name = self.map_tensor_name(name)
                self._repack_nvfp4(new_name, weight, scale, scale2)
        # Flush any remaining experts (fallback if n_experts was unknown)
        # (_flush_nvfp4_experts pops the key, hence the snapshot of the keys)
        for (bid, proj_type) in list(expert_blocks.keys()):
            self._flush_nvfp4_experts((bid, proj_type), expert_blocks, expert_scales, expert_shapes, bid, proj_type)
    def _flush_nvfp4_experts(self, key, expert_blocks, expert_scales, expert_shapes, bid, proj_type):
        """Write the experts collected under `key` as one merged NVFP4 tensor.

        Removes `key` from the three bookkeeping dicts, stacks the packed expert
        blocks in ascending expert-id order and writes them under the mapped name
        of "model.layers.<bid>.mlp.experts.<proj_type>.weight". If the per-expert
        scale2 values are not all within 1e-6 of 1.0, they are written as a float32
        vector under the same name with ".weight" replaced by ".scale".
        """
        pending_blocks = expert_blocks.pop(key)
        pending_scales = expert_scales.pop(key)
        shape = expert_shapes.pop(key)

        # order both lists by expert id so rows of the merged tensor and scales line up
        experts = sorted(pending_blocks, key=lambda item: item[0])
        scales = sorted(pending_scales, key=lambda item: item[0])

        merged = np.stack([packed for _, packed in experts], axis=0)
        new_name = self.map_tensor_name(f"model.layers.{bid}.mlp.experts.{proj_type}.weight")
        logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
        self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)

        scale_vals = np.array([value for _, value in scales], dtype=np.float32)
        if np.allclose(scale_vals, 1.0, atol=1e-6):
            # every expert has a trivial scale2: no companion tensor
            return

        scale_name = new_name.replace(".weight", ".scale")
        logger.info(f"  + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
        self.gguf_writer.add_tensor(scale_name, scale_vals)
    def prepare_tensors(self):
        # detect NVFP4 quantization (ModelOpt format)
        quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
        quant_config_file = self.dir_model / "hf_quant_config.json"
        if not quant_algo and quant_config_file.is_file():
            with open(quant_config_file, "r", encoding="utf-8") as f:
                quant_algo = (json.load(f).get("quantization") or {}).get("quant_algo")
        self._is_nvfp4 = quant_algo == "NVFP4"
        self.dequant_model()
        # NVFP4 weights are repacked and written directly to gguf_writer
        if self._is_nvfp4:
            self._generate_nvfp4_tensors()
        # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
        if self.tensor_map.mapping:
            max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
@ -2057,6 +2194,8 @@ class GPTNeoXModel(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        assert n_head is not None
        assert n_embed is not None
        if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
@ -2094,6 +2233,8 @@ class BloomModel(TextModel):
    def set_gguf_parameters(self):
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        assert n_head is not None
        assert n_embed is not None
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
        self.gguf_writer.add_feed_forward_length(4 * n_embed)
@ -2106,6 +2247,8 @@ class BloomModel(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        assert n_head is not None
        assert n_embed is not None
        name = re.sub(r'transformer\.', '', name)
@ -3716,6 +3859,7 @@ class LLaDAModel(TextModel):
        if (rope_dim := hparams.get("head_dim")) is None:
            n_heads = hparams.get("num_attention_heads", hparams.get("n_heads"))
            assert n_heads is not None
            rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads
        self.gguf_writer.add_rope_dimension_count(rope_dim)
@ -3747,6 +3891,7 @@ class LLaDAModel(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads"))
        assert n_head is not None
        n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads"))
        if self.undo_permute:
@ -4031,7 +4176,7 @@ class Qwen2VLVisionModel(MmprojModel):
                # split Conv3D into Conv2Ds
                c1, c2, kt, kh, kw = data_torch.shape
                del c1, c2, kh, kw  # unused
-                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+                assert kt == 2, "Current implementation only support temporal_patch_size of 2"
                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...])
                yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
            else:
@ -4303,6 +4448,14 @@ class Qwen2MoeModel(TextModel):
        # process the experts separately
        name = name.replace("language_model.", "") # InternVL
        # NVFP4 expert weights are handled in _generate_nvfp4_tensors
        if self._is_nvfp4 and "experts" in name:
            if name.endswith((".weight", ".weight_scale", ".weight_scale_2", ".input_scale")):
                if name.endswith(".weight") and name.replace(".weight", ".weight_scale") in self.model_tensors:
                    return
                if not name.endswith(".weight"):
                    return
        # handle aggregated expert tensors
        # GGUF stores dimensions reversed from PyTorch, so:
        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
@ -4390,15 +4543,31 @@ class Qwen3Model(Qwen2Model):
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]
-        # a bit hacky, but currently the only way to detect if this is a rerank model
+        if self._is_qwen3_reranker():
-        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+            self._find_rerank_config()
    def _is_qwen3_reranker(self) -> bool:
        readme_path = self.dir_model / "README.md"
        readme_text = ""
        if readme_path.exists():
            with readme_path.open("r", encoding="utf-8") as f:
                readme_text = f.read()
-        if "# Qwen3-Reranker" in readme_text:
+
-            self._find_rerank_config()
+        name_hints = [
            str(self.dir_model.name),
            str(self.hparams.get("_name_or_path", "")),
            str(self.hparams.get("model_type", "")),
            str(self.origin_hf_arch or ""),
        ]
        name_hints = [hint.lower() for hint in name_hints if hint]
        if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower():
            return True
        if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints):
            return True
        return "sequenceclassification" in (self.origin_hf_arch or "").lower()
    def set_vocab(self):
        # deal with intern-s1-mini
@ -4842,12 +5011,12 @@ class _LinearAttentionVReorderBase(Qwen3NextModel):
        yield from super().modify_tensors(data_torch, name, bid)
-@ModelBase.register("Qwen3_5ForConditionalGeneration")
+@ModelBase.register("Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM")
 class Qwen3_5TextModel(_LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35
-@ModelBase.register("Qwen3_5MoeForConditionalGeneration")
+@ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM")
 class Qwen3_5MoeTextModel(_LinearAttentionVReorderBase):
    model_arch = gguf.MODEL_ARCH.QWEN35MOE
@ -4901,7 +5070,7 @@ class Phi2Model(TextModel):
        self.gguf_writer.add_add_bos_token(False)
-@ModelBase.register("Phi3ForCausalLM")
+@ModelBase.register("Phi3ForCausalLM", "Phi4ForCausalLMV")
 class Phi3MiniModel(TextModel):
    model_arch = gguf.MODEL_ARCH.PHI3
@ -5076,6 +5245,129 @@ class Phi3MiniModel(TextModel):
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the converted tensors for `name`, yielding nothing for vision-tower and mm-projector tensors."""
        skipped_prefixes = ("model.vision_tower.", "vision_tower.", "model.mm_projector.", "mm_projector.")
        if not name.startswith(skipped_prefixes):
            yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Phi4ForCausalLMV")
 class Phi4VisionMmprojModel(MmprojModel):
    """Converter for the vision tower and multimodal projector of Phi-4 vision checkpoints.

    Exports every vision encoder block except the last one, drops the
    post-layernorm and head weights, and keeps only projector entries 0 and 2.
    """
    def __init__(self, *args, **kwargs):
        """Derive the exported layer count, image size and pixel limits from the configs.

        Raises:
            ValueError: fewer than 2 vision layers, or a position embedding whose
                row count is not a perfect square.
            KeyError: patch_size, the position embedding tensor, or
                min_num_patches/max_num_patches cannot be found.
        """
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        self.vision_total_layers = int(self.find_vparam(self.n_block_keys))
        if self.vision_total_layers < 2:
            raise ValueError(
                f"Phi-4 vision mmproj conversion requires at least 2 vision layers, got {self.vision_total_layers}"
            )
        # Phi-4 uses SigLIP2 hidden_states[-2], so export one fewer encoder block and
        # drop post-layernorm/head weights. This makes the GGUF runtime output match
        # the feature map consumed by the patched siglip.cpp Phi-4 projector path.
        self.vision_export_layers = self.vision_total_layers - 1
        # index of the block that modify_tensors() skips
        self.vision_last_layer_idx = self.vision_total_layers - 1
        # overwrite the first layer-count key present so the reduced count is what gets read back
        for key in self.n_block_keys:
            if key in self.hparams_vision:
                self.hparams_vision[key] = self.vision_export_layers
                break
        self.block_count = self.vision_export_layers
        # rebuild the name map for the reduced block count
        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
        patch_size = self.preprocessor_config.get("patch_size")
        if patch_size is None:
            raise KeyError("Phi-4 vision mmproj conversion requires patch_size in preprocessor_config.json")
        self.hparams_vision["patch_size"] = patch_size
        # locate the position embedding by name suffix; its row count gives the base grid size
        pos_emb_name = next(
            (
                name for name in self.model_tensors
                if name.endswith("vision_model.embeddings.position_embedding.weight")
            ),
            None,
        )
        if pos_emb_name is None:
            raise KeyError("Phi-4 vision mmproj conversion could not find position_embedding.weight")
        pos_emb_shape = self.model_tensors[pos_emb_name]().shape
        base_grid_tokens = int(pos_emb_shape[0])
        # the token grid must be square: side * side == number of position rows
        grid_side = math.isqrt(base_grid_tokens)
        if grid_side * grid_side != base_grid_tokens:
            raise ValueError(f"Unexpected Phi-4 position embedding shape: {tuple(pos_emb_shape)}")
        self.hparams_vision["image_size"] = grid_side * patch_size
        # preprocessor_config takes precedence; global_config is the fallback
        min_num_patches = self.preprocessor_config.get("min_num_patches", self.global_config.get("min_num_patches"))
        max_num_patches = self.preprocessor_config.get("max_num_patches", self.global_config.get("max_num_patches"))
        if min_num_patches is None or max_num_patches is None:
            raise KeyError("Phi-4 vision mmproj conversion requires min_num_patches and max_num_patches")
        # pixel limits = patch count * pixels per patch
        self.min_pixels = int(min_num_patches) * patch_size * patch_size
        self.max_pixels = int(max_num_patches) * patch_size * patch_size
    def set_gguf_parameters(self):
        """Write the parent's parameters, then the PHI4 projector type, pixel limits, GELU flag and layernorm eps."""
        super().set_gguf_parameters()
        assert self.hparams_vision is not None
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PHI4)
        self.gguf_writer.add_vision_min_pixels(self.min_pixels)
        self.gguf_writer.add_vision_max_pixels(self.max_pixels)
        self.gguf_writer.add_vision_use_gelu(True)
        # falls back to 1e-6 when the vision config has no layer_norm_eps
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the exported tensors for `name`; tensors that are not exported yield nothing.

        Vision-tower tensors are renamed to the "vision_tower." prefix and passed to
        the parent implementation, except head, post-layernorm and last-block
        weights. Projector tensors are emitted as V_MMPROJ for entries 0 and 2 only.
        Any other tensor is dropped.
        """
        if name.startswith(("model.vision_tower.vision_tower.", "vision_tower.")):
            if ".vision_model.head." in name:
                return
            # collapse the doubled checkpoint prefix into a single "vision_tower."
            new_name = name.replace("model.vision_tower.vision_tower.", "vision_tower.")
            if ".vision_model.post_layernorm." in new_name:
                return
            # the last encoder block is not exported (see __init__)
            if bid is not None and bid == self.vision_last_layer_idx:
                return
            if new_name.endswith("vision_model.embeddings.patch_embedding.weight"):
                assert self.hparams_vision is not None
                if data_torch.ndim != 2:
                    raise ValueError(f"Unexpected Phi-4 patch embedding shape: {tuple(data_torch.shape)}")
                patch_area = self.hparams_vision["patch_size"] ** 2
                in_features = data_torch.shape[1]
                if in_features % patch_area != 0:
                    raise ValueError(
                        f"Phi-4 patch embedding input dim {in_features} is not divisible by patch area {patch_area}"
                    )
                num_channels = in_features // patch_area
                patch_size = self.hparams_vision["patch_size"]
                # 2D (out, patch * patch * channels) -> 4D (out, channels, patch, patch)
                data_torch = data_torch.view(data_torch.shape[0], patch_size, patch_size, num_channels)
                data_torch = data_torch.permute(0, 3, 1, 2)
            yield from super().modify_tensors(data_torch, new_name, bid)
            return
        if name.startswith(("model.mm_projector.", "mm_projector.")):
            local_name = name
            local_name = local_name.replace("model.mm_projector.", "")
            local_name = local_name.replace("mm_projector.", "")
            # only projector entries 0 and 2 are exported
            if not (local_name.startswith("0.") or local_name.startswith("2.")):
                return
            suffix = ".bias" if local_name.endswith(".bias") else ".weight"
            mm_idx = int(local_name.split(".", maxsplit=1)[0])
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_idx, suffix=suffix), data_torch)
            return
        return
@ModelBase.register("PhiMoEForCausalLM")
 class PhiMoeModel(Phi3MiniModel):
@ -5404,7 +5696,7 @@ class KimiLinearModel(TextModel):
        # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
        linear_attn_config = self.hparams["linear_attn_config"]
        # n_head == 0 for KDA layers, n_head > 0 for MLA layers
-        # full_attention_layers list will be used to distingush layer type
+        # full_attention_layers list will be used to distinguish layer type
        _num_kv_heads = list()
        _full_attn_layers = linear_attn_config["full_attn_layers"]
        for il in range(self.hparams["num_hidden_layers"]):
@ -6505,7 +6797,7 @@ class Gemma3VisionModel(MmprojModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
-        # default values below are taken from HF tranformers code
+        # default values below are taken from HF transformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
        # calculate proj_scale_factor (used by tinygemma3 test model)
@ -7097,7 +7389,7 @@ class Rwkv7Model(TextModel):
            if bid == 0 and "time_mix_a" in new_name:
                # dummy v0/v1/v2 on first layer
-                # easist way to make llama happy
+                # easiest way to make llama happy
                yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
            yield (new_name, data_torch)
@ -9201,7 +9493,9 @@ class ChatGLMModel(TextModel):
    def set_gguf_parameters(self):
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
        assert n_embed is not None
        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
        assert n_head is not None
        n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
        self.gguf_writer.add_embedding_length(n_embed)
@ -9596,7 +9890,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
        # NOTE: Explicitly include hparam prefix for d_model to
        #   disambiguate with top-level head_dim
        # NOTE 2: If needed for future models, this can be isolated in a method
-        #   to separate the prefix setting and teh keys used
+        #   to separate the prefix setting and the keys used
        self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
        self.n_group = self.find_hparam(["n_groups", "num_groups"])
        self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
@ -9727,23 +10021,38 @@ class NemotronHModel(GraniteHybridModel):
        # M: Mamba2, *: Attention, -: MLP
        # MoE:
        # M: Mamba2, *: Attention, E: Expert
-        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
+        pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
-        self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"]
+        if pattern is None:
-        self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == ("E" if self.is_moe else "-")]
+            self._ssm_layers = []
            self._mlp_layers = []
        elif isinstance(pattern, str):
            self._ssm_layers = [i for i, val in enumerate(pattern) if val == "M"]
            self._mlp_layers = [i for i, val in enumerate(pattern) if val == ("E" if self.is_moe else "-")]
        else:
            self._ssm_layers = [i for i, val in enumerate(pattern) if val == "mamba"]
            self._mlp_layers = [i for i, val in enumerate(pattern) if val == "moe"]
    def get_attn_layers(self):
-        hybrid_override_pattern = self.hparams["hybrid_override_pattern"]
+        pattern = self.hparams.get("hybrid_override_pattern") or self.hparams.get("layers_block_type")
-        assert len(hybrid_override_pattern) == self.block_count, "Mismatch between hybrid override and num_hidden_layers!"
+        if pattern is None:
-        return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"]
+            return []
        assert len(pattern) == self.block_count, f"Mismatch between pattern ({len(pattern)}) and block_count ({self.block_count})!"
        if isinstance(pattern, str):
            return [i for i, val in enumerate(pattern) if val == "*"]
        return [i for i, val in enumerate(pattern) if val == "attention"]
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        self.gguf_writer.add_key_length(self.head_dim)
+        head_dim = self.head_dim
-        self.gguf_writer.add_value_length(self.head_dim)
+        if head_dim is None:
            raise ValueError("Could not find the attention head dim in config")
        self.gguf_writer.add_key_length(head_dim)
        self.gguf_writer.add_value_length(head_dim)
        # Set feed_forward_length
-        # NOTE: This will trigger an override warning. This is preferrable to
+        # NOTE: This will trigger an override warning. This is preferable to
        #   duplicating all the parent logic
        if not self.is_moe:
            n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
@ -9768,6 +10077,9 @@ class NemotronHModel(GraniteHybridModel):
            if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
                self.gguf_writer.add_expert_used_count(n_experts_used)
            if (latent_size := self.hparams.get("moe_latent_size")) is not None:
                self.gguf_writer.add_moe_latent_size(latent_size)
    def set_vocab(self):
        super().set_vocab()
@ -9787,6 +10099,13 @@ class NemotronHModel(GraniteHybridModel):
            name = name[len("language_model."):]
        if self.is_moe and bid is not None:
            # Skip Multi-Token Prediction (MTP) tensors. These are used for
            # speculative decoding but we don't include them in this model
            # conversion. See https://github.com/ggml-org/llama.cpp/pull/18886
            if name.startswith("mtp."):
                logger.info(f"gguf: Skipping MTP (Speculative) layer: {name}")
                return
            if name.endswith("mixer.gate.e_score_correction_bias"):
                new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
                yield from ModelBase.modify_tensors(self, data_torch, new_name, bid)
--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@ -128,6 +128,12 @@ class LoraTorchTensor:
        assert dim is None
        return self.shape
    def contiguous(self) -> LoraTorchTensor:
        return LoraTorchTensor(
            self._lora_A.contiguous(),
            self._lora_B.contiguous(),
        )
    def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
        if isinstance(shape[0], tuple):
            new_shape: tuple[int, ...] = shape[0]
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@ -0,0 +1,525 @@
 # Auto-Parser Architecture
 The auto-parser automatically analyzes chat templates to determine how to parse model outputs, including content, reasoning, and tool calls.
 ## Overview
 The unified auto-parser uses a pure differential, compositional approach (inspired by the `git diff` algorithm) to analyze chat templates:
 **Core Philosophy**:
 - **Minimize Hardcoded Patterns**: All markers extracted through template comparison (the only heuristic is JSON detection to distinguish `JSON_NATIVE` from tag-based formats)
 - **Compositional Architecture**: Separate analyzer structs for reasoning, content, and tools — each responsible for its own analysis and parser construction
 **Analysis + Parser Building in Two Steps**:
 1. `autoparser::autoparser tmpl_analysis(tmpl)` — runs all differential comparisons and populates the analysis structs
 2. `autoparser::peg_generator::generate_parser(tmpl, params, tmpl_analysis)` — uses the analysis to build a PEG parser and optional GBNF grammar
 ## Data Structures
 All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h).
 ### Top-Level: `autoparser` (main analyzer and generator)
 [common/chat-auto-parser.h:367-388](common/chat-auto-parser.h#L367-L388) — top-level analysis result aggregating `jinja_caps`, `reasoning`, `content`, and `tools` sub-analyses, plus `preserved_tokens` (union of all non-empty markers).
 ### `analyze_reasoning`
 [common/chat-auto-parser.h:254-274](common/chat-auto-parser.h#L254-L274) — reasoning analysis result: `mode` enum, `start` marker (e.g. `<think>`), and `end` marker (e.g. `</think>`).
 ### `analyze_content`
 [common/chat-auto-parser.h:280-295](common/chat-auto-parser.h#L280-L295) — content analysis result: `mode` enum, `start`/`end` markers, and `requires_nonnull_content` flag.
 ### `analyze_tools` and its sub-structs
 - [common/chat-auto-parser.h:176-194](common/chat-auto-parser.h#L176-L194) — `tool_format_analysis`: `mode` enum, `section_start/end`, `per_call_start/end`, JSON field names (`function_field`, `name_field`, `args_field`, `id_field`, `gen_id_field`), and format flags (`fun_name_is_key`, `tools_array_wrapped`, `uses_python_dicts`)
 - [common/chat-auto-parser.h:196-200](common/chat-auto-parser.h#L196-L200) — `tool_function_analysis`: `name_prefix`, `name_suffix`, `close` markers around function names
 - [common/chat-auto-parser.h:202-210](common/chat-auto-parser.h#L202-L210) — `tool_arguments_analysis`: `start/end` container markers, `name_prefix/suffix`, `value_prefix/suffix`, `separator`
 - [common/chat-auto-parser.h:212-217](common/chat-auto-parser.h#L212-L217) — `tool_id_analysis`: `pos` enum, `prefix`/`suffix` markers around call ID values
 - [common/chat-auto-parser.h:301-361](common/chat-auto-parser.h#L301-L361) — `analyze_tools`: aggregates the four sub-structs above
 ### Enums
 **`reasoning_mode`**: How the template handles reasoning/thinking blocks.
 | Value           | Description                                                                       |
 |-----------------|-----------------------------------------------------------------------------------|
 | `NONE`          | No reasoning markers detected                                                     |
 | `TAG_BASED`     | Standard tag-based: `<think>...</think>`                                          |
 | `DELIMITER`     | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`)   |
 | `FORCED_OPEN`   | Template ends with open reasoning tag when `enable_thinking=true`                 |
 | `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start  |
 | `TOOLS_ONLY`    | Reasoning only appears in tool call responses, not plain content                  |
 **`content_mode`**: How the template wraps assistant content.
 | Value                    | Description                                                    |
 |--------------------------|----------------------------------------------------------------|
 | `PLAIN`                  | No content markers                                             |
 | `ALWAYS_WRAPPED`         | Content always wrapped: `<response>...</response>`             |
 | `WRAPPED_WITH_REASONING` | Content wrapped only when reasoning is present                 |
 **`tool_format`**: Classification of tool call structure.
 | Value            | Description                                                      |
 |------------------|------------------------------------------------------------------|
 | `NONE`           | No tool support detected                                         |
 | `JSON_NATIVE`    | Pure JSON: `{"name": "X", "arguments": {...}}`                   |
 | `TAG_WITH_JSON`  | Tag-based with JSON args: `<function=X>{...}</function>`         |
 | `TAG_WITH_TAGGED`| Tag-based with tagged args: `<param=key>value</param>`           |
 **`call_id_position`**: Where call IDs appear in tag-based formats.
 | Value                    | Description                                  |
 |--------------------------|----------------------------------------------|
 | `NONE`                   | No call ID support detected                  |
 | `PRE_FUNC_NAME`          | Before function name                         |
 | `BETWEEN_FUNC_AND_ARGS`  | Between function name and arguments          |
 | `POST_ARGS`              | After arguments                              |
 ## Tool Calling Formats
 ### JSON_NATIVE
 **Structure**: The entire tool call (function name, arguments, values) is in JSON format. Optional enclosing tags around the section.
 **Detection**: Function name appears inside a JSON structure (quotes preceded by `{` or `:`).
 **Examples**:
 Standard OpenAI-style:
 ```json
 <tool_call>
 {"name": "get_weather", "arguments": {"location": "Paris", "unit": "celsius"}}
 </tool_call>
 ```
 Mistral Nemo with array wrapper:
 ```json
 [TOOL_CALLS]
 [{"name": "calculate", "arguments": {"expr": "2+2"}}]
 ```
 Function name as JSON key (Apertus style):
 ```json
 {"get_weather": {"location": "Paris"}}
 ```
 ---
 ### TAG_WITH_JSON
 **Structure**: Function name is outside JSON, in tag attributes or XML-style tags. Arguments are a JSON object.
 **Detection**: Function name not in JSON, but argument names appear in JSON context.
 **Examples**:
 Functionary v3.1:
 ```xml
 <function=get_weather>{"location": "Paris", "unit": "celsius"}</function>
 ```
 MiniMax:
 ```xml
 <minimax:tool_call>
 <tool_name>calculate</tool_name>
 <arguments>{"expr": "2+2"}</arguments>
 </minimax:tool_call>
 ```
 ---
 ### TAG_WITH_TAGGED
 **Structure**: Both function name and argument names are in XML-style tags. String values are unquoted; non-string values are JSON-formatted.
 **Detection**: Neither function name nor argument names appear in a JSON context.
 **Examples**:
 Qwen/Hermes XML format:
 ```xml
 <function=get_weather>
 <param=location>Paris</param>
 <param=unit>celsius</param>
 </function>
 ```
 Mixed types:
 ```xml
 <function=calculate>
 <param=expr>2+2</param>
 <param=precision>2</param>
 <param=options>{"round": true}</param>
 </function>
 ```
 String values (`Paris`, `celsius`, `2+2`) are unquoted; non-string values — `precision` (number) and `options` (object) — are JSON-formatted.
 ---
 ## Analysis Flow
 ```text
 autoparser::autoparser(tmpl)
    |
    |-- Phase 1: analyze_reasoning(tmpl, jinja_caps.supports_tool_calls)
    |     |-- R1: compare_reasoning_presence()   — with/without reasoning_content field
    |     |-- R2: compare_thinking_enabled()     — enable_thinking=false vs true
    |     '-- R3: compare_reasoning_scope()      — reasoning+content vs reasoning+tools
    |           (only if supports_tool_calls)
    |
    |-- Phase 2: analyze_content(tmpl, reasoning)
    |     '-- C1: compares content-only vs tools output and content-only vs reasoning output
    |
    |-- Phase 3: analyze_tools(tmpl, jinja_caps, reasoning)
    |     (skipped entirely if !jinja_caps.supports_tool_calls)
    |     |
    |     |-- T1: analyze_tool_calls()           — no tools vs with tools; classifies format
    |     |         |-- JSON path → analyze_tool_call_format_json_native()
    |     |         '-- tag path → analyze_tool_call_format_non_json()
    |     |
    |     (if format != NONE and format != JSON_NATIVE:)
    |     |
    |     |-- T2: check_per_call_markers()       — 1 call vs 2 calls; moves section→per-call if needed
    |     |         (only if supports_parallel_tool_calls)
    |     |
    |     |-- T3: extract_function_markers()     — func_alpha vs func_beta; extracts name prefix/suffix/close
    |     |
    |     |-- T4: analyze_arguments()            — (TAG_WITH_TAGGED only)
    |     |         |-- A1: extract_argument_name_markers()   — arg_name_A vs arg_name_B
    |     |         '-- A2: extract_argument_value_markers()  — value "XXXX" vs "YYYY"
    |     |
    |     |-- T5: extract_argument_separator()   — 1 arg vs 2 args; finds separator between args
    |     |
    |     |-- T6: extract_args_markers()         — 0 args vs 1 arg; finds args container markers
    |     |
    |     '-- T7: extract_call_id_markers()      — call_id "call00001" vs "call99999"
    |
    |-- collect_preserved_tokens()               — union of all non-empty markers
    |
    '-- apply workarounds()                      — post-hoc patches for edge-case templates
    |
    v
 autoparser (analysis result)
    |
    v
 autoparser::peg_generator::generate_parser(tmpl, inputs, analysis)
    |-- analysis.build_parser(inputs)            — builds PEG parser arena
    |     |-- reasoning.build_parser(ctx)        — reasoning parser (mode-dependent)
    |     |-- content.build_parser(ctx)          — content parser (mode-dependent)
    |     '-- tools.build_parser(ctx)            — tool parser (dispatches by tool_format)
    |           |-- build_tool_parser_json_native()
    |           |-- build_tool_parser_tag_json()
    |           '-- build_tool_parser_tag_tagged()
    |
    |-- Build GBNF grammar (if tools present and trigger_marker non-empty)
    '-- Set grammar_triggers from section_start or per_call_start
    |
    v
 common_chat_params (prompt, parser, grammar, triggers, preserved_tokens)
 ```
 ## Entry Point
 The auto-parser is invoked in [common/chat.cpp:1280-1310](common/chat.cpp#L1280-L1310) in `common_chat_templates_apply_jinja`. A few specialized templates are handled first (Ministral/Magistral Large 3, GPT-OSS with `<|channel|>`, Functionary v3.2 with `>>>all`), then the auto-parser handles everything else via `autoparser::autoparser` + `peg_generator::generate_parser`.
 ## Algorithm Details
 ### Core Mechanism: Differential Comparison
 All analysis phases use the same factorized comparison function declared in [common/chat-auto-parser-helpers.h:68](common/chat-auto-parser-helpers.h#L68):
 ```cpp
 compare_variants(tmpl, params_A, params_modifier)
 ```
 This creates variant B by applying a modifier lambda to a copy of `params_A`, renders both through the template, and computes a `diff_split` ([common/chat-auto-parser.h:28-37](common/chat-auto-parser.h#L28-L37)):
 - `prefix` — common prefix between A and B
 - `suffix` — common suffix between A and B
 - `left` — unique to variant A
 - `right` — unique to variant B
 The diff is computed via `calculate_diff_split()`, which finds the longest-common-prefix and longest-common-suffix, then iteratively moves incomplete `<...>` or `[...]` markers from the prefix/suffix into left/right until stable (tag boundary fixing).
 Text is segmentized into markers and non-marker fragments using `segmentize_markers()`, which splits on `<...>` and `[...]` boundaries.
 ### Phase 1: Reasoning Analysis
 **R1 — `compare_reasoning_presence()`**: Compares assistant message with vs without a `reasoning_content` field.
 - Searches `diff.right` (output with reasoning) for the reasoning content needle
 - Uses PEG parsers to find surrounding markers:
  - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
  - If both found but post marker only in the full output B → `FORCED_CLOSED`
  - If only post marker found → `DELIMITER`
 - Sets `reasoning.start` and `reasoning.end`
 **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
 - Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
 - Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
 - Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
 **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
 - Only runs if `jinja_caps.supports_tool_calls`
 - Detects `TOOLS_ONLY`: reasoning content present in B (with tools) but not in A (with text content)
 - Extracts reasoning markers from the tool call output using PEG parsers
 ### Phase 2: Content Analysis
 **C1**: Two comparisons in the `analyze_content` constructor:
 - Comparison 1: content-only output vs tool-call output → `diff_tools`
 - Comparison 2: content-only output vs reasoning+empty-content output → `diff_reasoning`
 Classification logic:
 - `PLAIN`: `diff_tools.left` equals the response string (content is the entire diff, no wrapper)
 - `ALWAYS_WRAPPED`: markers found surrounding the content text in `pure_content` → extracts `start`/`end`
 ### Phase 3: Tool Call Analysis
 **T1 — `analyze_tool_calls()`**: Compares no-tools vs with-tools output.
 - Extracts the tool call section as `diff.right`
 - Calls `analyze_tool_call_format()` which first strips reasoning markers from the haystack, then:
  - Calls `in_json_haystack()` for both function name and argument name needles
  - `in_json_haystack()` uses a PEG parser to check whether the needle appears in a JSON context (preceded by `{` or `:` with surrounding quotes)
  - If function name is in JSON → `JSON_NATIVE` → `analyze_tool_call_format_json_native()`
  - If function name not in JSON, arg name is in JSON → `TAG_WITH_JSON`
  - If neither in JSON → `TAG_WITH_TAGGED`
  - `analyze_tool_call_format_json_native()`: parses the JSON object, matches field values to needles to populate `name_field`, `args_field`, `id_field`, `gen_id_field`; detects `tools_array_wrapped`; extracts `section_start`/`section_end`
  - `analyze_tool_call_format_non_json()`: uses PEG parsers on the haystack to find up to two opening markers (section + per-call) then up to two closing markers
 **T2 — `check_per_call_markers()`**: Compares 1 call vs 2 calls.
 - Computes a secondary diff of the second call portion vs the common suffix
 - If the second call content starts with `section_start` → the section marker is actually per-call → moves `section_start/end` to `per_call_start/end` and clears the section markers
 **T3 — `extract_function_markers()`**: Compares function name `FUN_FIRST` vs `FUN_SECOND` (two different named functions).
 - Finds where the function name appears in `diff.left`
 - Extracts `function.name_prefix` from the common prefix up to the function marker, and `function.name_suffix` from after the name up to the next marker
 - Extends `name_suffix` into `diff.suffix` (to the first marker for TAG_WITH_TAGGED; to the first `{` or `[` for TAG_WITH_JSON)
 - Extracts `function.close` from after the last argument value up to the per-call/section end marker
 **T4 — `analyze_arguments()`** (TAG_WITH_TAGGED only):
 - **A1 `extract_argument_name_markers()`**: Compares `arg_name_A` vs `arg_name_B` (two different argument names).
  - Finds shared surrounding structure → `arguments.name_prefix`, `arguments.name_suffix`
 - **A2 `extract_argument_value_markers()`**: Compares argument value `"XXXX"` vs `"YYYY"` (same arg, different value).
  - Finds markers surrounding the value → `arguments.value_prefix`, `arguments.value_suffix`
 **T5 — `extract_argument_separator()`**: Compares 1 argument vs 2 arguments (same function).
 - Uses `until_common_prefix(diff.right, ARG_FIRST, ARG_SECOND)` to find what separates the two argument blocks
 **T6 — `extract_args_markers()`**: Compares 0 arguments vs 1 argument.
 - Uses `until_common_prefix()` and `after_common_suffix()` with the empty and single-arg JSON strings as anchors to find container markers (`arguments.start`, `arguments.end`)
 **T7 — `extract_call_id_markers()`**: Compares call IDs `"call00001"` vs `"call99999"`.
 - Determines whether function name appears in `diff.prefix` or `diff.suffix` to classify position:
  - Function name in prefix only → `BETWEEN_FUNC_AND_ARGS` or `POST_ARGS` (further distinguished by where `{` appears)
  - Function name in suffix only → `PRE_FUNC_NAME`
 - Extracts `call_id.prefix` and `call_id.suffix` markers around the call ID value
 - Clears `per_call_end` if it incorrectly incorporated the call ID suffix
 ### Workarounds
 A workaround array in `common/chat-diff-analyzer.cpp` applies post-hoc patches after analysis. Each workaround is a lambda that inspects the template source and overrides analysis results. Current workarounds:
 1. **Old Qwen/DeepSeek thinking templates** — source contains `content.split('</think>')`: sets `reasoning.mode = FORCED_OPEN` with `<think>`/`</think>` markers if no reasoning was detected
 2. **Granite 3.3** — source contains specific "Write your thoughts" text: forces `TAG_BASED` reasoning with `<think>`/`</think>` and `WRAPPED_WITH_REASONING` content with `<response>`/`</response>`
 3. **Cohere Command R+** — source contains `<|CHATBOT_TOKEN|>`: sets `ALWAYS_WRAPPED` content mode if no content start is already set
 4. **Functionary 3.1** — source contains `set has_code_interpreter`: forces `PLAIN` content, specific `per_call_start/end`, clears preserved tokens to only keep Functionary-specific markers
 5. **DeepSeek-R1-Distill-Qwen** — source contains `tool▁calls▁begin` markers: overrides tool section/per-call markers with the correct Unicode block characters
 ### Parser Building
 Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) implements `build_parser(parser_build_context&)`. They share a `parser_build_context` that carries the PEG builder, inference inputs, the pre-built reasoning parser, and a pointer to the content analyzer.
 #### Reasoning Parser (`analyze_reasoning::build_parser`)
 | Mode                              | Parser                                                              |
 |-----------------------------------|---------------------------------------------------------------------|
 | Not extracting reasoning          | `eps()`                                                             |
 | `FORCED_OPEN` or `FORCED_CLOSED`  | `reasoning(until(end)) + end` — opening tag was in the prompt       |
 | `TAG_BASED` or `TOOLS_ONLY`       | `optional(start + reasoning(until(end)) + end)`                     |
 | `DELIMITER`                       | `optional(reasoning(until(end)) + end)` — no start marker           |
 #### Content Parser (`analyze_content::build_parser`)
 | Condition                              | Parser                                                                          |
 |----------------------------------------|---------------------------------------------------------------------------------|
 | `json_schema` present                  | `reasoning + space() + content(schema(json(), "response-format", ...)) + end()` |
 | Tools present                          | Dispatches to `analyze_tools::build_parser()`                                   |
 | `ALWAYS_WRAPPED` with reasoning        | `reasoning + start + content(until(end)) + end + end()`                         |
 | `ALWAYS_WRAPPED` without reasoning     | `content(until(start)) + start + content(until(end)) + end + end()`             |
 | Default (PLAIN)                        | `reasoning + content(rest()) + end()`                                           |
 #### Tool Parsers (`analyze_tools::build_parser`)
 Dispatches by `format.mode`:
 **`build_tool_parser_json_native()`**: Calls `p.standard_json_tools()` which internally dispatches to:
 - `build_json_tools_function_is_key()` — function name is the JSON key: `{"get_weather": {...}}`
 - `build_json_tools_nested_keys()` — nested: `{"function": {"name": "X", "arguments": {...}}}`
 - `build_json_tools_flat_keys()` — flat: `{"name": "X", "arguments": {...}}`
 Handles content wrappers, array wrapping (`tools_array_wrapped`), parallel calls, and `parameter_order`.
 **`build_tool_parser_tag_json()`**: For each tool function:
 ```text
 tool_open(name_prefix + tool_name(literal(name)) + name_suffix) +
    call_id_section +
    tool_args(schema(json(), tool_schema))
  [+ function.close if non-empty]
 ```
 Wrapped in per-call markers (with optional parallel call repetition) then optionally in section markers.
 **`build_tool_parser_tag_tagged()`**: For each tool function, builds one parser per argument:
 - String types: `tool_arg_string_value(schema(until(value_suffix), ...))`
 - JSON types: `tool_arg_json_value(schema(json(), ...))`
 - Required args are plain; optional args wrapped in `optional()`
 - Arguments joined with `space()` between consecutive parsers
 For closing: uses `function.close` if present; otherwise uses `peek(per_call_end)` to avoid premature close during partial streaming; falls back to `tool_close(space())` to trigger mapper callbacks.
 All three tool parsers return:
 ```text
 reasoning + optional(content(until(trigger_marker))) + tool_calls + end()
 ```
 ### Python Dict Format
 When `format.uses_python_dicts` is true (detected when single-quoted strings appear in JSON argument context), `build_parser()` pre-registers a `json-string` rule that accepts both single-quoted and double-quoted strings. This is done before any `p.json()` call so all JSON parsing inherits the flexible rule.
 ## Mapper
 `common_chat_peg_mapper` maps PEG parse results (AST nodes) into `common_chat_msg` structures. Key design:
 - **Buffered arguments**: Before `tool_name` is known, argument text goes to `args_buffer`; once the name is set, the buffer is flushed to `current_tool->arguments`
 - **`args_target()`**: Returns a reference to whichever destination is currently active (buffer or tool args), eliminating branching
 - **`closing_quote_pending`**: Tracks whether a closing `"` needs to be appended when a string argument value is finalized (for schema-declared string types in tagged format)
 - **Quote normalization**: Python-style quotes (`'key': 'value'`) are converted to JSON (`"key": "value"`)
 - **Brace auto-closing**: At tool close, unclosed `{` braces are closed automatically
 ## Files
 | File                                      | Purpose                                                              |
 |-------------------------------------------|----------------------------------------------------------------------|
 | `common/chat-auto-parser.h`               | All analysis structs, enums, `autoparser`, `peg_generator`, `templates_params` |
 | `common/chat-auto-parser-generator.cpp`   | Parser generator: `generate_parser()` and `build_parser()` methods   |
 | `common/chat-diff-analyzer.cpp`           | Differential analysis implementation and workarounds                 |
 | `common/chat-auto-parser-helpers.h/cpp`   | `calculate_diff_split()`, `segmentize_markers()`,                    |
 |                                           | `compare_variants()`, string helpers                                 |
 | `common/chat-peg-parser.h/cpp`            | `common_chat_peg_builder`, `common_chat_peg_mapper`, and helpers     |
 | `common/chat.cpp`                         | Entry point: `common_chat_templates_apply_jinja()`                   |
 | `tools/parser/debug-template-parser.cpp`  | Debug tool for template analysis                                     |
 | `tools/parser/template-analysis.cpp`      | Template analysis tool                                               |
 ## Testing & Debugging
 ### Debug Tools
 **Template Debugger**: `tools/parser/debug-template-parser.cpp`
 - Usage: `./bin/llama-debug-template-parser path/to/template.jinja`
 - Shows detected format, markers, generated parser, and GBNF grammar
 **Template Analysis**: `tools/parser/template-analysis.cpp`
 - Usage: `./bin/llama-template-analysis path/to/template.jinja`
 **Debug Logging**: Enable with `LLAMA_LOG_VERBOSITY=2`
 - Shows detailed analysis steps, pattern extraction results, and generated parser structure
 **PEG Test Builder**: Fluent API for creating test cases — see [tests/test-chat.cpp:947-1043](tests/test-chat.cpp#L947-L1043). Example usage:
 ```cpp
 auto tst = peg_tester("models/templates/Template.jinja");
 tst.test("input text")
   .reasoning_format(COMMON_REASONING_FORMAT_AUTO)
   .tools({tool_json})
   .parallel_tool_calls(true)
   .enable_thinking(true)
   .expect(expected_message)
   .run();
 ```
 ### Tested Templates
 The following templates have active tests in `tests/test-chat.cpp`:
 | Template | Format | Notes |
 | -------- | ------ | ----- |
 | Ministral-3-14B-Reasoning | Reasoning | `[THINK]...[/THINK]` tags (specialized handler) |
 | NVIDIA-Nemotron-3-Nano-30B | TAG_WITH_TAGGED | Reasoning + tools |
 | CohereForAI Command-R7B | JSON_NATIVE | `<\|START_THINKING\|>`/`<\|START_RESPONSE\|>` markers |
 | Google Gemma 2 2B | Content only | No tool support |
 | Qwen-QwQ-32B | Reasoning | Forced-open thinking |
 | NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
 | IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
 | ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
 | Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
 | DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
 | GLM-4.6 | TAG_WITH_TAGGED | `<tool_call>name\n<arg_key>...<arg_value>...` format |
 | GLM-4.7-Flash | TAG_WITH_TAGGED | Updated GLM format |
 | Kimi-K2-Thinking | JSON_NATIVE | Reasoning + JSON tools |
 | Apertus-8B-Instruct | JSON_NATIVE | Function name as JSON key |
 | MiniMax-M2 | TAG_WITH_JSON | XML invoke with JSON args |
 | NVIDIA-Nemotron-Nano-v2 | JSON_NATIVE | `<TOOLCALL>` wrapper (nested) |
 | CohereForAI Command-R Plus | JSON_NATIVE | Markdown code block format |
 | Mistral-Nemo-Instruct-2407 | JSON_NATIVE | `[TOOL_CALLS]` wrapper with ID field |
 | Functionary v3.1 | TAG_WITH_JSON | `<function=X>` format |
 | Functionary v3.2 | Specialized | `>>>` recipient delimiter (dedicated handler) |
 | Fireworks Firefunction v2 | TAG_WITH_JSON | Fireworks tool format |
 | DeepSeek R1 Distill (Llama/Qwen) | Reasoning | Forced-open thinking |
 | llama-cpp-deepseek-r1 | Reasoning | Forced-open thinking |
 | Kimi-K2 / Kimi-K2-Instruct | JSON_NATIVE | JSON tools with special markers |
 | Llama 3.1/3.2/3.3 | JSON_NATIVE | Standard Llama tool format |
 | OpenAI GPT-OSS | Specialized | Channel-based (dedicated handler) |
 | Apriel 1.5 | JSON_NATIVE | `<tool_calls>` wrapper with JSON array |
 | Apriel 1.6 Thinker | Reasoning | Implicit reasoning start |
 | Mistral Small 3.2 | JSON_NATIVE | `[TOOL_CALLS]func[ARGS]{...}` with call ID |
 | Devstral | JSON_NATIVE | `[TOOL_CALLS]func[ARGS]{...}` without call ID |
 | StepFun 3.5 Flash | TAG_WITH_TAGGED | `<function=X><parameter=Y>` format |
 ## Adding Support for New Templates
 To support a new template format:
 1. **If it follows standard patterns** — The auto-parser should detect it automatically. Run `llama-debug-template-parser` to verify markers are correctly extracted.
 2. **If differential analysis extracts incorrect markers** — Add a workaround lambda to the `workarounds` vector in `common/chat-diff-analyzer.cpp`. Inspect the template source for a unique identifying substring.
 3. **If it needs fundamentally different handling** — Add a dedicated handler function in `chat.cpp` before the auto-parser block (as done for GPT-OSS, Functionary v3.2, and Ministral).
 ## Edge Cases and Quirks
 1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
 3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
 4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
 5. **Call ID Side Effects**: When a call ID is detected, `per_call_end` may have been incorrectly set to include the call ID suffix. T7 clears `per_call_end` in this case.
 6. **Tool Analysis Gating**: `analyze_tools` is only constructed (and all tool analysis phases run) when `jinja_caps.supports_tool_calls` is true. Within tool analysis, `check_per_call_markers()` (T2) only runs if `jinja_caps.supports_parallel_tool_calls`.
 7. **`analyze_arguments()` Gating**: Within tool analysis, A1 and A2 (argument name/value marker extraction) only run for `TAG_WITH_TAGGED` format. `extract_argument_separator()` and `extract_args_markers()` run for all non-`JSON_NATIVE` formats.
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@ -20,7 +20,7 @@
 **Llama.cpp + CANN**
-The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
+The llama.cpp CANN backend is designed to support Ascend NPU. It utilizes the capabilities of AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to use the Ascend NPU directly.
 ## News
@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager
    # and install driver.
    sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
    ```
-    If the following messaage appers, firmware is installed successfully.
+    If the following message appears, firmware is installed successfully.
    ```sh
    Firmware package installed successfully!
    ```
--- a/docs/backend/OPENVINO.md
+++ b/docs/backend/OPENVINO.md
@ -0,0 +1,343 @@
 # OpenVINO Backend for llama.cpp
 [OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware, including CPUs, GPUs, and NPUs, in the cloud, on-premises, and on the edge.
 This document describes the [OpenVINO backend for llama.cpp](../../ggml/src/ggml-openvino), which enables hardware-accelerated inference on **Intel® CPUs, GPUs, and NPUs** while remaining compatible with the existing **GGUF model ecosystem**. The backend translates GGML compute graphs into OpenVINO graphs and leverages graph compilation, kernel fusion, and device-specific optimizations to improve inference performance on supported Intel hardware.
 The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a translation layer for core GGML operations. The OpenVINO backend replaces the standard GGML graph execution path with Intel's OpenVINO inference engine. This approach allows the same GGUF model file to run on Intel CPUs, Intel GPUs (integrated and discrete), and Intel NPUs without changes to the model or the rest of the llama.cpp stack. When a `ggml_cgraph` is dispatched to OpenVINO backend, it:
 - Walks the GGML graph and identifies inputs, outputs, weights, and KV cache tensors.
 - Translates the GGML operations into an `ov::Model` using OpenVINO's frontend API.
 - Compiles and caches the model for the target device.
 - Binds GGML tensor memory to OpenVINO inference tensors and runs inference.
 ## Supported Devices
 OpenVINO backend supports the following hardware:
 - Intel CPUs
 - Intel GPUs (integrated and discrete)
 - Intel NPUs
 Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), the llama.cpp OpenVINO backend has been validated specifically on AI PCs such as the Intel® Core™ Ultra Series 1 and Series 2.
 ## Supported Model Precisions
 - `FP16`
 - `BF16` (on Intel Xeon)
 - `Q8_0`
 - `Q4_0`
 - `Q4_1`
 - `Q4_K`
 - `Q4_K_M`
 - `Q5_K` (converted to Q8_0_C at runtime)
 - `Q6_K` (converted to Q8_0_C at runtime)
 > [!NOTE]
 > Accuracy validation and performance optimizations for quantized models are a work in progress.
 ## Quantization Support Details
 ### CPU and GPU
 - **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported**
 - `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C`
 ### NPU
 - **Primary supported quantization scheme is `Q4_0`**
 - `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16
 ### Additional Notes
 - Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor)
 - `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize`
 - `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3)
 ## Validated Models
 The following models have been validated for functionality on Intel® Core™ Ultra Series 1 and Series 2:
 - [Llama-3.2-1B-Instruct-GGUF](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/)
 - [Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF)
 - [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf)
 - [Qwen/Qwen2.5-1.5B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF)
 - [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B-GGUF)
 - [openbmb/MiniCPM-1B-sft-bf16](https://huggingface.co/openbmb/MiniCPM-S-1B-sft-gguf)
 - [tencent/Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF)
 - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF)
 - [bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF)
 ## Build Instructions
 ### Prerequisites
 - Linux or Windows system with Intel hardware (CPU, GPU, or NPU)
 - **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html).
 - **Linux:**
    - Git, CMake, and Ninja software tools are needed for building.
    ```bash
      sudo apt-get update
      sudo apt-get install -y build-essential libcurl4-openssl-dev libtbb12 cmake ninja-build python3-pip curl wget tar
    ```
    - OpenCL
    ```bash
      sudo apt install ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
    ```
 - **Windows:**
  - Download and install [Microsoft Visual Studio 2022 Build Tools](https://aka.ms/vs/17/release/vs_BuildTools.exe). During installation, select the **"Desktop development with C++"** workload.
  - Install required tools:
    ```powershell
    # Windows PowerShell
    winget install Git.Git
    winget install GNU.Wget
    winget install Ninja-build.Ninja
    ```
  - Install **OpenCL** using **vcpkg**:
    ```powershell
    # Windows PowerShell
    cd C:\
    git clone https://github.com/microsoft/vcpkg
    cd vcpkg
    .\bootstrap-vcpkg.bat
    .\vcpkg install opencl
    # Optional but recommended: Integrate vcpkg with Visual Studio / CMake:
    .\vcpkg integrate install
    ```
 ### 1. Install OpenVINO Runtime
 - Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html)
 - **Linux:**
    <details>
    <summary>📦 Click to expand OpenVINO installation from an archive file on Ubuntu</summary>
    <br>
    ```bash
    wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh
    chmod +x install-openvino-from-archive.sh
    ./install-openvino-from-archive.sh
    ```
    Verify OpenVINO is initialized properly:
    ```bash
    echo $OpenVINO_DIR
    ```
    </details>
 ### 2. Build llama.cpp with OpenVINO Backend
 Clone the llama.cpp repository and build it with the OpenVINO backend enabled:
 ```bash
 git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 - **Linux:**
    ```bash
    source /opt/intel/openvino/setupvars.sh
    cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON
    cmake --build build/ReleaseOV --parallel
    ```
 - **Windows:**
    ```cmd
    # x64 Native Tools Command Prompt for VS 2022
    "C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat"
    cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake
    cmake --build build\ReleaseOV --parallel
    ```
 > [!NOTE]
 > Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend.
 ### 3. Download Sample Model
 Download models for testing:
 ```bash
 # Linux
 mkdir -p ~/models/
 wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \
     -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
 # Windows PowerShell
 mkdir C:\models
 Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
 # Windows Command Line
 mkdir C:\models
 curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf
 ```
 ### 4. Run Inference with OpenVINO Backend
 When using the OpenVINO backend, the first inference token may have slightly higher latency due to on-the-fly conversion to the OpenVINO graph. Subsequent tokens and runs will be faster.
 ```bash
 # If device is unset or unavailable, defaults to CPU.
 # If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
 # Linux
 export GGML_OPENVINO_DEVICE=GPU
 # To run llama-simple:
 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
 # To run in chat mode:
 ./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf
 # Windows Command Line
 set GGML_OPENVINO_DEVICE=GPU
 # Windows PowerShell
 $env:GGML_OPENVINO_DEVICE = "GPU"
 # To run llama-simple
 build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
 # To run in chat mode:
 build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf"
 ```
 > [!NOTE]
 > On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target a specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details.
 ### Docker Build
 You can build and run llama.cpp with OpenVINO backend using Docker.
 ```bash
 # Build the base runtime image with compiled shared libraries and minimal dependencies.
 docker build -t llama-openvino:base -f .devops/openvino.Dockerfile .
 # Build the complete image with all binaries, Python tools, gguf-py library, and model conversion utilities.
 docker build --target=full -t llama-openvino:full -f .devops/openvino.Dockerfile .
 # Build a minimal CLI-only image containing just the llama-cli executable.
 docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
 # Build a server-only image with the llama-server executable, health check endpoint, and REST API support.
 docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile .
 # If you are behind a proxy:
 docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile .
 ```
 Run llama.cpp with OpenVINO backend Docker container.
 Save sample models in `~/models` as [shown above](#3-download-sample-model). It will be mounted to the container in the examples below.
 ```bash
 #  Run Docker container
 docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
 # With Intel GPU access (iGPU or dGPU)
 docker run --rm -it -v ~/models:/models \
 --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
 llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
 # With Intel NPU access
 docker run --rm -it --env GGML_OPENVINO_DEVICE=NPU -v ~/models:/models \
 --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \
 llama-openvino:light --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
 ```
 Run Llama.cpp Server with OpenVINO Backend:
 ```bash
 # Run the Server Docker container
 docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf
 # In a NEW terminal, test the server with curl
 # If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost
 export NO_PROXY=localhost,127.0.0.1
 # Test health endpoint
 curl -f http://localhost:8080/health
 # Test with a simple prompt
 curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: application/json" \
 -d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq .
 ```
 ## Runtime Configuration
 The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior.
 ### Configuration Options
 | Variable                          | Default    | Description                                                                                                 |
 |-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------|
 | `GGML_OPENVINO_DEVICE`            | `CPU`      | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. |
 | `GGML_OPENVINO_CACHE_DIR`         | `not set`  | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** |
 | `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256`      | Token chunk size for **NPU** prefill.                                                                       |
 | `GGML_OPENVINO_STATEFUL_EXECUTION`| `0`        | Enable stateful KV cache for better performance. Recommended on CPU, GPU.                                   |
 | `GGML_OPENVINO_PROFILING`         | `0`        | Enable execution-time profiling.                                                                            |
 | `GGML_OPENVINO_DUMP_CGRAPH`       | `0`        | Dump the GGML compute graph to `cgraph_ov.txt`.                                                             |
 | `GGML_OPENVINO_DUMP_IR`           | `0`        | Serialize OpenVINO IR files with timestamps.                                                                |
 | `GGML_OPENVINO_DEBUG_INPUT`       | `0`        | Enable input debugging and print input tensor info.                                                         |
 | `GGML_OPENVINO_DEBUG_OUTPUT`      | `0`        | Enable output debugging and print output tensor info.                                                       |
 | `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once.                                                                           |
 > [!NOTE]
 > `GGML_OPENVINO_STATEFUL_EXECUTION` is an **experimental** feature that manages the KV cache internally inside the OpenVINO model, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. It has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications, where enabling it is recommended for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported.
 ### Example Usage
 #### GPU Inference with Profiling
 ```bash
 # If the system has multiple GPUs, use GPU.0 or GPU.1 to explicitly target a specific GPU.
 # Linux
 export GGML_OPENVINO_CACHE_DIR=/tmp/ov_cache
 export GGML_OPENVINO_PROFILING=1
 export GGML_OPENVINO_DEVICE=GPU
 ./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is "
 # Windows Command Line
 set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache
 set GGML_OPENVINO_PROFILING=1
 set GGML_OPENVINO_DEVICE=GPU
 # Windows PowerShell
 $env:GGML_OPENVINO_CACHE_DIR = "C:\tmp\ov_cache"
 $env:GGML_OPENVINO_PROFILING = "1"
 $env:GGML_OPENVINO_DEVICE = "GPU"
 build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is "
 ```
 #### llama-bench
 ```bash
 # -fa 1 is required when running llama-bench with the OpenVINO backend.
 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1
 ```
 ### NPU Notes
 - Model caching is not yet supported
 - Does not support llama-server -np > 1 (multiple parallel sequences)
 - Only supports llama-perplexity -b 512 or smaller
 ## Llama.cpp Tools
 The following tools work with the OpenVINO backend on CPU, GPU, NPU:
 - llama-simple
 - llama-run
 - llama-cli
 - llama-server
 - llama-bench
 - llama-perplexity
 ## Work in Progress
 - Performance and memory optimizations
 - Accuracy validation
 - Broader quantization coverage
 - Support for additional model architectures
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@ -9,6 +9,7 @@
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
 - [Design Rule](#design-rule)
 - [Known Issue](#known-issues)
 - [Q&A](#qa)
 - [TODO](#todo)
@ -41,6 +42,9 @@ The following releases are verified and recommended:
 ## News
 - 2026.03
  - Support Flash-Attention: less memory usage, performance impact depends on LLM.
 - 2026.02
  - Remove support for Nvidia & AMD GPU, because the oneAPI plugin for Nvidia & AMD GPU is unavailable: its download/installation channels no longer work, so users can't build the software for Nvidia & AMD GPU.
@ -378,17 +382,27 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 ## Windows
-### I. Setup Environment
+### Install GPU driver
 1. Install GPU driver
 Intel GPU drivers instructions guide and download page can be found here: [Get Intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
-2. Install Visual Studio
+### Option 1: download the binary package directly
 Download the binary package for Windows from: https://github.com/ggml-org/llama.cpp/releases.
 Extract the package to local folder, run the llama tools directly. Refer to [Run the inference](#iii-run-the-inference-1).
 Note: the package includes the SYCL runtime and all dependent DLL files, so there is no need to install the oneAPI package and activate it.
 ### Option 2: build locally from the source code.
 #### I. Setup environment
 1. Install Visual Studio
 If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
-3. Install Intel® oneAPI Base toolkit
+2. Install Intel® oneAPI Base toolkit
 SYCL backend depends on:
  - Intel® oneAPI DPC++/C++ compiler/running-time.
@ -439,25 +453,25 @@ Output (example):
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```
-4. Install build tools
+3. Install build tools
 a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
 b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
-### II. Build llama.cpp
+#### II. Build llama.cpp
 You could download the release package for Windows directly, which includes the binary files and the dependent oneAPI DLL files.
 Choose one of following methods to build from source code.
-#### 1. Script
+##### Option 1: Script
 ```sh
 .\examples\sycl\win-build-sycl.bat
 ```
-#### 2. CMake
+##### Option 2: CMake
 On the oneAPI command line window, step into the llama.cpp main directory and run the following:
@ -486,7 +500,7 @@ cmake --preset x64-windows-sycl-debug
 cmake --build build-x64-windows-sycl-debug -j --target llama-completion
 ```
-#### 3. Visual Studio
+##### Option 3: Visual Studio
 You have two options to use Visual Studio to build llama.cpp:
 - As CMake Project using CMake presets.
@ -496,7 +510,7 @@ You have two options to use Visual Studio to build llama.cpp:
 All following commands are executed in PowerShell.
-##### - Open as a CMake Project
+###### - Open as a CMake Project
 You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
@ -511,7 +525,7 @@ You can use Visual Studio to open the `llama.cpp` folder directly as a CMake pro
    cmake --build build --config Release -j --target llama-completion
    ```
-##### - Generating a Visual Studio Solution
+###### - Generating a Visual Studio Solution
 You can use Visual Studio solution to build and work on llama.cpp on Windows. You need to convert the CMake Project into a `.sln` file.
@ -599,7 +613,7 @@ found 2 SYCL devices:
 ```
-#### Choose level-zero devices
+##### Choose level-zero devices
 |Chosen Device ID|Setting|
 |-|-|
@ -607,7 +621,7 @@ found 2 SYCL devices:
 |1|`set ONEAPI_DEVICE_SELECTOR="level_zero:1"`|
 |0 & 1|`set ONEAPI_DEVICE_SELECTOR="level_zero:0;level_zero:1"` or `set ONEAPI_DEVICE_SELECTOR="level_zero:*"`|
-#### Execute
+##### Execute
 Choose one of following methods to run.
@ -665,7 +679,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 ## Environment Variable
-#### Build
+### Build
 | Name               | Value                                 | Function                                    |
 |--------------------|---------------------------------------|---------------------------------------------|
@ -680,23 +694,50 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 1. FP32 and FP16 have different performance impacts on LLMs. It is recommended to test both for better prompt processing performance on your models. You need to rebuild the code after changing `GGML_SYCL_F16=OFF/ON`.
-#### Runtime
+### Runtime
 | Name              | Value            | Function                                                                                                                  |
 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
 | GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
 | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.|
 | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for intel devices older than Gen 10) |
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graphs feature. Disabled by default because SYCL Graph is still under development and does not yet provide better performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
 | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
 ## Design Rule
 - Open to all contributors.
 - All code changes should be useful to users:
    - Fix bugs.
    - Add new functions.
    - Improve the performance/usage.
    - Make the code easier to maintain.
    - ...
 - Code in the following cases will not be accepted:
    - Breaks legacy functions.
    - Reduces the performance of legacy cases by default.
    - Incomplete work, or functionality that cannot be demonstrated.
 - Using environment variables to enable/disable features is encouraged.
    - Users can evaluate a feature without rebuilding the code.
    - Recommend the best features to users by enabling them by default.
 - Design the code based on the published official releases of oneAPI packages: compiler, library, driver, OS kernel.
 - Developers need to maintain the code they submit.
 ## Known Issues
 - `Split-mode:[row]` is not supported.
 - AOT (Ahead-of-Time) compilation is missing from the build.
  - Good: build quickly, smaller size of binary file.
  - Bad: The startup is slow (JIT) in first time, but subsequent performance is unaffected.
 ## Q&A
 - Error:  `error while loading shared libraries: libsycl.so: cannot open shared object file: No such file or directory`.
@ -708,7 +749,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  - Remove **build** folder or try a clean-build.
- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
+- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.
  Please double-check with `sudo sycl-ls`.
@ -746,7 +787,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  ```
 ### **GitHub contribution**:
-Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
+Please add the `[SYCL]` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.
 ## TODO
--- a/docs/backend/VirtGPU/development.md
+++ b/docs/backend/VirtGPU/development.md
@ -55,7 +55,8 @@ LLAMA_MAC_BUILD=$PWD/build/ggml-virtgpu-backend
 cmake -S . -B $LLAMA_MAC_BUILD \
      -DGGML_NATIVE=OFF \
      -DLLAMA_CURL=ON \
-      -DGGML_REMOTINGBACKEND=ONLY \
+      -DGGML_VIRTGPU=ON \
      -DGGML_VIRTGPU_BACKEND=ONLY \
      -DGGML_METAL=ON
 TARGETS="ggml-metal"
@ -71,6 +72,7 @@ cmake --build $LLAMA_MAC_BUILD --parallel 8 --target $EXTRA_TARGETS
 ```bash
 # Build virglrenderer with APIR support
 mkdir virglrenderer
 cd virglrenderer
 git clone https://gitlab.freedesktop.org/kpouget/virglrenderer -b main-macos src
 cd src
@ -95,7 +97,7 @@ mkdir llama.cpp
 git clone https://github.com/ggml-org/llama.cpp.git src
 cd src
-LLAMA_LINUX_BUILD=$PWD//build-virtgpu
+LLAMA_LINUX_BUILD=$PWD/build-virtgpu
 cmake -S . -B $LLAMA_LINUX_BUILD \
      -DGGML_VIRTGPU=ON
--- a/docs/backend/ZenDNN.md
+++ b/docs/backend/ZenDNN.md
@ -22,7 +22,7 @@
 **Llama.cpp + ZenDNN**
-The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).
+The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN).
 For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html
@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn
 |:-------:|:-------:|:----------------------------------------------:|
 | Linux   | Support | Ubuntu 20.04, 22.04, 24.04                     |
-For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).
+For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os).
 ## Hardware
@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope
 | Operation    | Status  | Notes                                          |
 |:-------------|:-------:|:----------------------------------------------:|
-| MUL_MAT      |    ✓    | Accelerated via ZenDNN LowOHA MatMul           |
+| MUL_MAT      | Support | Accelerated via ZenDNN LowOHA MatMul           |
 *Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).
@ -104,7 +104,6 @@ If you want to build ZenDNN yourself or use a specific version:
 # Clone ZenDNN repository
 git clone https://github.com/amd/ZenDNN.git
 cd ZenDNN
 git checkout zendnnl
 # Build and install (requires CMake >= 3.25)
 mkdir build && cd build
@ -114,7 +113,7 @@ cmake --build . --target all
 Default installation path: `ZenDNN/build/install`
-**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).
+**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md).
 **Step 2: Build llama.cpp with custom ZenDNN path**
@ -146,8 +145,7 @@ Run llama.cpp server with ZenDNN acceleration:
 ```sh
 # Set optimal configuration
-export OMP_NUM_THREADS=64  # Adjust to your CPU core count
+export ZENDNNL_MATMUL_ALGO=1    # Blocked AOCL DLP algo for best performance
 export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS for best performance
 # Start server
 ./build/bin/llama-server \
@ -160,62 +158,26 @@ export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS for best performance
 Access the server at `http://localhost:8080`.
 **Performance tips**:
- Set `OMP_NUM_THREADS` to match your physical core count
+- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance
 - Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
 - For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`
 ## Environment Variable
-### Build Time
+For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md).
-| Name               | Value                                 | Function                                    |
+### Performance Optimization
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_ZENDNN        | ON/OFF                                | Enable ZenDNN backend support               |
 | ZENDNN_ROOT        | Path to ZenDNN installation           | Set ZenDNN installation directory           |
 | GGML_OPENMP        | ON/OFF (recommended: ON)              | Enable OpenMP for multi-threading           |
-### Runtime
+ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm:
 | Name                    | Value                    | Function                                                          |
 |-------------------------|--------------------------|-------------------------------------------------------------------|
 | OMP_NUM_THREADS         | Number (e.g., 64)        | Set number of OpenMP threads (recommended: physical core count)   |
 | ZENDNNL_MATMUL_ALGO     | 0-5                      | Select MatMul backend algorithm (see Performance Optimization)    |
 | ZENDNNL_PROFILE_LOG_LEVEL | 0-4                    | Profiling log level (0=disabled, 4=verbose)                       |
 | ZENDNNL_ENABLE_PROFILER | 0 or 1                   | Enable detailed profiling (1=enabled)                             |
 | ZENDNNL_API_LOG_LEVEL   | 0-4                      | API log level (0=disabled, 4=verbose)                             |
 **Example**:
 ```sh
-export OMP_NUM_THREADS=64
+export ZENDNNL_MATMUL_ALGO=1    # Blocked AOCL DLP algo (recommended)
 export ZENDNNL_MATMUL_ALGO=2  # Use Blocked AOCL BLIS for best performance
 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
 ```
-## Performance Optimization
+For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
 ### MatMul Algorithm Selection
 ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:
 ```sh
 export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS (recommended)
 ```
 **Available algorithms**:
 | Value | Algorithm              | Description                                    |
 |:-----:|:-----------------------|:----------------------------------------------|
 | 0     | Dynamic Dispatch       | Automatic backend selection (default)         |
 | 1     | AOCL BLIS              | AOCL BLIS backend                             |
 | 2     | AOCL BLIS Blocked      | **Blocked AOCL BLIS (recommended)**           |
 | 3     | OneDNN                 | OneDNN backend                                |
 | 4     | OneDNN Blocked         | Blocked OneDNN                                |
 | 5     | LibXSMM                | LibXSMM backend                               |
 ### Profiling and Debugging
-For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).
+For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
 ## Known Issues
@ -245,10 +207,9 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode
 A: Ensure:
 1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
-2. `OMP_NUM_THREADS` is set appropriately (physical core count)
+2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP)
-3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
+3. You're using a sufficiently large model (small models may not benefit as much)
-4. You're using a sufficiently large model (small models may not benefit as much)
+4. Enable profiling to verify ZenDNN MatMul is being called
-5. Enable profiling to verify ZenDNN MatMul is being called
 ### **GitHub Contribution**:
 Please add the **[ZenDNN]** prefix/tag to issue/PR titles so the ZenDNN team can check and address them without delay.
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@ -116,7 +116,7 @@ Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920
 ### Windows
 All artifacts are already installed in the `pkg-snapdragon` folder.
-To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`.
+To run, adapt the instructions below to use the PowerShell scripts in `scripts/snapdragon/windows`.
 ## How to Run
--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@ -144,7 +144,7 @@ Once the build is complete HTP ops libraries will be installed like this
 -a----         1/22/2026   6:01 PM           4139 libggml-htp.cat
 ```
-The .cat file, the signature and proper certicate installation can be verified with
+The .cat file, the signature and proper certificate installation can be verified with
 ```
 > signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
--- a/docs/build.md
+++ b/docs/build.md
@ -13,6 +13,21 @@ cd llama.cpp
 The following sections describe how to build with different backends and options.
+* [CPU Build](#cpu-build)
+* [BLAS Build](#blas-build)
+* [Metal Build](#metal-build)
+* [SYCL](#sycl)
+* [CUDA](#cuda)
+* [MUSA](#musa)
+* [HIP](#hip)
+* [Vulkan](#vulkan)
+* [CANN](#cann)
+* [Arm® KleidiAI™](#arm-kleidiai)
+* [OpenCL](#opencl)
+* [Android](#android-1)
+* [OpenVINO](#openvino)
+* [Notes about GPU-accelerated backends](#notes-about-gpu-accelerated-backends)
 ## CPU Build
 Build llama.cpp using `CMake`:
@ -108,7 +123,7 @@ Building through oneAPI compilers will make avx_vnni instruction set available f
 - Using oneAPI docker image:
  If you do not want to source the environment vars and install oneAPI manually, you can also build the code using the Intel Docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
-Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
+Check [Optimizing and Running LLaMA2 on Intel® CPU](https://builders.intel.com/solutionslibrary/optimizing-and-running-llama2-on-intel-cpu) for more information.
 ### Other BLAS libraries
@ -254,6 +269,14 @@ The environment variable [`CUDA_SCALE_LAUNCH_QUEUES`](https://docs.nvidia.com/cu
 Consider setting `CUDA_SCALE_LAUNCH_QUEUES=4x`, which increases the CUDA command buffer to 4 times its default size. This optimization is particularly beneficial for **Multi-GPU setups with pipeline parallelism**, where it significantly improves prompt processing throughput by allowing more operations to be enqueued across GPUs.
 #### GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F
 Set the `GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F` environment variable to use the FP32 compute type in FP16 cuBLAS on all GPUs. This prevents possible numerical overflows in exchange for slower prompt processing (small impact on RTX PRO/Datacenter products, significant impact on GeForce products).
 #### GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F
 Set the `GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F` environment variable to force the use of the FP16 compute type (instead of the default FP32) in FP16 cuBLAS on V100, CDNA and RDNA4.
 ### Unified Memory
 The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
@ -265,7 +288,7 @@ The following compilation options are also available to tweak performance:
 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                                                                                                                      |
 |-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
-| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for CDNA and RDNA4) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000).                                            |
+| GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for V100, CDNA and RDNA4 which use FP32 compute type by default) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000).   |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                                                                                                                  |
 | GGML_CUDA_FA_ALL_QUANTS       | Boolean                | false   | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer.                                                                                                                                                                                           |
@ -595,11 +618,17 @@ You can verify that KleidiAI is being used by running
 ```bash
 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
 ```
-If KleidiAI is enabled, the ouput will contain a line similar to:
+If KleidiAI is enabled, the output will contain a line similar to:
 ```
 load_tensors: CPU_KLEIDIAI model buffer size =  3474.00 MiB
 ```
-KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm, SVE, and SME. llama.cpp selects the most efficient kernels at runtime based on detected CPU capabilities.
 On CPUs that support SME, SME microkernels are enabled automatically using runtime detection.
 The environment variable `GGML_KLEIDIAI_SME` can be used to control SME behavior:
 - Not set: enable SME automatically if supported and detected.
 - 0: disable SME.
 - `<n>` > 0: enable SME and assume `<n>` available SME units (overrides auto-detection).
 If SME is not supported by the CPU, SME microkernels are always disabled.
 Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. `-DGGML_METAL=OFF`, or at runtime using the command line option `--device none`.
@ -699,7 +728,7 @@ To read documentation for how to build on Android, [click here](./android.md)
 ## WebGPU [In Progress]
-The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The currrent implementation is up-to-date with Dawn commit `bed1a61`.
+The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
 In the llama.cpp directory, build with CMake:
@ -718,6 +747,14 @@ Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/m
 To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md)
 ## OpenVINO
 [OpenVINO](https://docs.openvino.ai/) is an open-source toolkit for optimizing and deploying high-performance AI inference, specifically designed for Intel hardware (CPUs, GPUs, and NPUs).
 For build instructions and usage examples, refer to [OPENVINO.md](backend/OPENVINO.md).
 ---
 ## Notes about GPU-accelerated backends
 The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
--- a/docs/development/parsing.md
+++ b/docs/development/parsing.md
@ -22,7 +22,7 @@ Below is a contrived example demonstrating how to use the PEG parser to parse
 output from a model that emits arguments as JSON.
 ```cpp
-auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
    // Build a choice of all available tools
    auto tool_choice = p.choice();
    for (const auto & tool : tools) {
@ -212,7 +212,7 @@ mapper.from_ast(ctx.ast, result);
 ### Native
-The `common_chat_peg_native_builder` builds a `native` parser suitable for
+The `common_chat_peg_builder` builds a `native` parser suitable for
 models that emit tool arguments as a direct JSON object.
 - **`reasoning(p)`** - Tag node for `reasoning_content`
@ -225,7 +225,7 @@ models that emit tool arguments as a direct JSON object.
 - **`tool_args(p)`** - Tag the tool arguments
 ```cpp
-build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
+build_chat_peg_parser([&](common_chat_peg_builder & p) {
    auto get_weather_tool = p.tool(p.sequence({
        p.tool_open(p.literal("{")),
        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
@ -246,7 +246,7 @@ build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
 ### Constructed
-The `common_chat_peg_constructed_builder` builds a `constructed` parser
+The `common_chat_peg_builder` builds a `constructed` parser
 suitable for models that emit tool arguments as separate entities, such as XML
 tags.
@ -264,7 +264,7 @@ tags.
 - **`tool_arg_json_value(p)`** - Tag JSON value for the argument
 ```cpp
-build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+build_chat_peg_parser([&](common_chat_peg_builder & p) {
    auto location_arg = p.tool_arg(
        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
        p.tool_arg_string_value(p.until("</parameter>")),
--- a/docs/multimodal/MobileVLM.md
+++ b/docs/multimodal/MobileVLM.md
@ -281,7 +281,7 @@ llama_print_timings:       total time =    5990.25 ms /   202 tokens
 Just the same as above.
-**ouput**
+**output**
 ```sh
 encode_image_with_clip: image embedding created: 144 tokens
@ -305,7 +305,7 @@ llama_print_timings:       total time =   15513.95 ms /   412 tokens
 ## Run on Intel(R) Core(TM) Ultra7 115H
 ### operating system
 Windows11
-### comiple
+### compile
 ```sh
 make -j32
 ```
--- a/docs/ops.md
+++ b/docs/ops.md
@ -15,7 +15,7 @@ Legend:
 | Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
 |-----------|------|------|------|------|------|------|------|------|------|------|------|
 |                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@ -23,30 +23,31 @@ Legend:
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 |                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                             FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
-|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
+|                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
 |                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                  GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -54,7 +55,7 @@ Legend:
 |                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
+|                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -63,7 +64,7 @@ Legend:
 |                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
@ -75,34 +76,34 @@ Legend:
 |                         OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 |
 |                              PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                          POOL_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          POOL_1D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
+|                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                             ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
+|                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
 |                         SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
 |                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
 |                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                         SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
 |                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -116,5 +117,5 @@ Legend:
 |                            TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
 |                              TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
 |                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
-|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
+|                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
--- a/docs/ops/WebGPU.csv
+++ b/docs/ops/WebGPU.csv
@ -5023,20 +5023,20 @@
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[1024,12,1,1]","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[2000,10,1,1]","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGMAX","type=f32,ne=[5438,3,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,2,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","WebGPU"
-"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","WebGPU"
+"WebGPU: WebGPU","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","0","no","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","0","no","WebGPU"
 "WebGPU: WebGPU","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","0","no","WebGPU"
@ -9535,38 +9535,38 @@
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","WebGPU"
-"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","WebGPU"
+"WebGPU: WebGPU","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","WebGPU"
 "WebGPU: WebGPU","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","WebGPU"
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -5,6 +5,7 @@
 #include "sampling.h"
 #include <algorithm>
 #include <clocale>
 #include <cstdio>
 #include <string>
 #include <vector>
@ -16,6 +17,8 @@ static void print_usage(int, char ** argv) {
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    params.prompt = "Hello my name is";
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -5,14 +5,16 @@
 #include "common.h"
 #include "log.h"
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
 #include <climits>
 #include <clocale>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
 #include <unordered_map>
 #include <vector>
 #include <cassert>
 #include <climits>
 #include <cstring>
 #include <cstdarg>
 #include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
 #include <sstream>
@ -874,6 +876,8 @@ static std::string basename(const std::string &path) {
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    common_init();
    struct train_params params = get_default_train_params();
--- a/examples/debug/README.md
+++ b/examples/debug/README.md
@ -2,7 +2,7 @@
 This is a utility intended to help debug a model by registering a callback that
 logs GGML operations and tensor data. It can also store the generated logits or
-embeddings as well as the prompt and token ids for comparision with the original
+embeddings as well as the prompt and token ids for comparison with the original
 model.
 ### Usage
--- a/examples/deprecation-warning/deprecation-warning.cpp
+++ b/examples/deprecation-warning/deprecation-warning.cpp
@ -1,11 +1,14 @@
 // Warns users that this filename was deprecated, and provides a link for more information.
 #include <clocale>
 #include <cstdio>
 #include <string>
 #include <unordered_map>
 // Main
 int main(int argc, char** argv) {
    std::setlocale(LC_NUMERIC, "C");
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@ -43,12 +43,12 @@ Choose one of the following scheduling methods:
 - `-b`: Batch size
 ### Examples
-#### Dream architechture:
+#### Dream architecture:
 ```
 llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
 ```
-#### LLaDA architechture:
+#### LLaDA architecture:
 ```
 llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
 ```
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@ -7,6 +7,7 @@
 #include <limits.h>
 #include <algorithm>
 #include <clocale>
 #include <cmath>
 #include <cstring>
 #include <limits>
@ -538,6 +539,8 @@ static std::string format_input_text(const std::string & prompt, const std::stri
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    ggml_time_init();
    common_params params;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -3,6 +3,7 @@
 #include "log.h"
 #include "llama.h"
 #include <clocale>
 #include <ctime>
 #include <algorithm>
@ -94,6 +95,8 @@ static void print_raw_embeddings(const float * emb,
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -4,6 +4,8 @@
 #include "log.h"
 #include "llama.h"
 #include "llama-cpp.h"
 #include <clocale>
 #include <string>
 #include <vector>
@ -29,6 +31,8 @@ static bool run(llama_context * ctx, const common_params & params) {
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    base_callback_data cb_data;
    common_params params;
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@ -1,6 +1,7 @@
 #include "arg.h"
 #include "common.h"
 #include <clocale>
 #include <fstream>
 #include <sstream>
 #include <string>
@ -100,6 +101,8 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
 }
 int main(int, char **) {
    std::setlocale(LC_NUMERIC, "C");
    for (const auto & md : md_files) {
        std::ifstream infile(md.fname);
        if (!infile.is_open()) {
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@ -1,13 +1,14 @@
 #include "ggml.h"
 #include "gguf.h"
-#include <cstdlib>   /* abort() */
+#include <algorithm>
 #include <clocale>
 #include <cstddef>
 #include <cstdio>
-#include <string>
+#include <cstdlib>   /* abort() */
 #include <stdexcept>
 #include <algorithm>
 #include <cstring>
 #include <stdexcept>
 #include <string>
 #include <sstream>
 #include <fstream>
@ -626,6 +627,8 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
 }
 int main(int argc, const char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    hash_params params;
    manifest_check_params manifest_check;
    hash_params_parse(argc, argv, params);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "gguf.h"
 #include <clocale>
 #include <cstdio>
 #include <string>
 #include <sstream>
@ -240,6 +241,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 }
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    if (argc < 3) {
        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
        printf("r: read data.gguf file\n");
--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@ -633,7 +633,7 @@ class SchemaConverter:
            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
-            items = schema.get('items') or schema['prefixItems']
+            items = schema.get('items', schema.get('prefixItems'))
            if isinstance(items, list):
                return self._add_rule(
                    rule_name,
@ -689,6 +689,11 @@ class SchemaConverter:
        elif (schema_type == 'object') or (len(schema) == 0):
            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
        elif schema_type is None and isinstance(schema, dict):
            # No type constraint and no recognized structural keywords (e.g. {"description": "..."}).
            # Per JSON Schema semantics this is equivalent to {} and accepts any value.
            return self._add_rule(rule_name, self._add_primitive('value', PRIMITIVE_RULES['value']))
        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
--- a/examples/llama.vim
+++ b/examples/llama.vim
@ -52,8 +52,8 @@ highlight llama_hl_info guifg=#77ff2f ctermfg=119
 "   n_prefix:         number of lines before the cursor location to include in the local prefix
 "   n_suffix:         number of lines after  the cursor location to include in the local suffix
 "   n_predict:        max number of tokens to predict
-"   t_max_prompt_ms:  max alloted time for the prompt processing (TODO: not yet supported)
+"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
-"   t_max_predict_ms: max alloted time for the prediction
+"   t_max_predict_ms: max allotted time for the prediction
 "   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
 "   auto_fim:         trigger FIM completion automatically on cursor movement
 "   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -4,10 +4,11 @@
 #include "log.h"
 #include "llama.h"
 #include <algorithm>
 #include <clocale>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <algorithm>
 struct ngram_data {
    bool active = false;
@ -38,6 +39,8 @@ struct ngram_container {
 };
 int main(int argc, char ** argv) {
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@ -3,10 +3,13 @@
 #include "ngram-cache.h"
 #include "llama.h"
 #include <clocale>
 #include <string>
 #include <vector>
 int main(int argc, char ** argv){
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@ -3,6 +3,7 @@
 #include "common.h"
 #include "ngram-cache.h"
 #include <clocale>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
@ -17,6 +18,8 @@ static void print_usage(char* argv0) {
 }
 int main(int argc, char ** argv){
    std::setlocale(LC_NUMERIC, "C");
    if (argc < 3) {
        print_usage(argv[0]);
        exit(1);
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@ -5,14 +5,17 @@
 #include "llama.h"
 #include "ggml.h"
 #include <cinttypes>
 #include <clocale>
 #include <cstdint>
 #include <cstdio>
 #include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
 int main(int argc, char ** argv){
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -6,6 +6,7 @@
 #include "log.h"
 #include "llama.h"
 #include <clocale>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
@ -13,6 +14,8 @@
 #include <vector>
 int main(int argc, char ** argv){
    std::setlocale(LC_NUMERIC, "C");
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@ -69,7 +69,7 @@ Command line arguments take precedence over environment variables when both are
 In cases where the transformer implementation for the model has not been released
 yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
-will then cause the transformer implementation to be loaded explicitely and not
+will then cause the transformer implementation to be loaded explicitly and not
 use AutoModelForCausalLM:
 ```
 export UNRELEASED_MODEL_NAME=SomeNewModel
@ -120,7 +120,7 @@ The converted model can be inspected using the following command:
 (venv) $ make causal-run-converted-model
 ```
-### Model logits verfication
+### Model logits verification
 The following target will run the original model and the converted model and
 compare the logits:
 ```console
@ -235,7 +235,7 @@ new model the model can be converted to GGUF format using the following command:
 (venv) $ make embedding-run-converted-model
 ```
-### Model logits verfication
+### Model logits verification
 The following target will run the original model and the converted model (which
 was done manually in the previous steps) and compare the logits:
 ```console
@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
 ## HuggingFace utilities
 The following targets are useful for creating collections and model repositories
-on Hugging Face in the the ggml-org. These can be used when preparing a relase
+on Hugging Face in the ggml-org. These can be used when preparing a release
 to script the process for new model releases.
 For the following targets a `HF_TOKEN` environment variable is required.
@ -347,7 +347,7 @@ For the following targets a `HF_TOKEN` environment variable is required.
 > $ unset HF_TOKEN
 ### Create a new Hugging Face Model (model repository)
-This will create a new model repsository on Hugging Face with the specified
+This will create a new model repository on Hugging Face with the specified
 model name.
 ```console
 (venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
--- a/Show More
+++ b/Show More