Merge remote-tracking branch 'upstream/master' into backend-sampling

2025-12-04 08:17:50 +01:00 · 2025-12-04 08:17:50 +01:00 · c0b182f4d6
parent 87b2719eca ef75a89fdb
commit c0b182f4d6
99 changed files with 7663 additions and 1454 deletions
--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@ -1,120 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
-  pull_request:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: [self-hosted, RISCV64]
-
-    steps:
-      - name: Install prerequisites
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y libatomic1
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu \
-                  ccache \
-                  cmake
-
-      - name: Setup ccache
-        run: |
-          mkdir -p $HOME/.ccache
-          ccache -M 5G -d $HOME/.ccache
-          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-          echo "$GITHUB_WORKSPACE"
-          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-      - name: Build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_SYSTEM_NAME=Linux \
-            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
-  #   runs-on: [self-hosted, RISCV64]
-
-  #   steps:
-  #     - name: Install prerequisites
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y libatomic1
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu \
-  #                 ccache \
-  #                 cmake
-  #         sudo apt-get upgrade binutils -y
-
-  #     - name: Setup ccache
-  #       run: |
-  #         mkdir -p $HOME/.ccache
-  #         ccache -M 5G -d $HOME/.ccache
-  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-  #         echo "$GITHUB_WORKSPACE"
-  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build \
-  #           -DLLAMA_CURL=OFF \
-  #           -DCMAKE_BUILD_TYPE=Release \
-  #           -DGGML_OPENMP=OFF \
-  #           -DLLAMA_BUILD_EXAMPLES=ON \
-  #           -DLLAMA_BUILD_TOOLS=ON \
-  #           -DLLAMA_BUILD_TESTS=OFF \
-  #           -DCMAKE_SYSTEM_NAME=Linux \
-  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-  #           -DGGML_RVV=ON \
-  #           -DGGML_RV_ZFH=ON \
-  #           -DGGML_RV_ZICBOP=ON \
-  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
-  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
-  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -547,6 +547,46 @@ jobs:
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 3600

+  ubuntu-24-wasm-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-wasm-webgpu
+          evict-old-files: 1d
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_CURL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
    container: rocm/dev-ubuntu-22.04:6.1.2
@ -1642,6 +1682,337 @@ jobs:
         run: |
           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

+  ubuntu-cpu-cmake-riscv64-native:
+    runs-on: RISCV64
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+
+      - name: Setup ccache
+        run: |
+          # Set unique cache directory for this job
+          export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache for optimal performance
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+
+          # Enable more aggressive caching
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-cmake-sanitizer-riscv64-native:
+    runs-on: RISCV64
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug]
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          # Unique cache directory per matrix combination
+          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=ON \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+
+  ubuntu-llguidance-riscv64-native:
+    runs-on: RISCV64
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_LLGUIDANCE=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+
+  ubuntu-cmake-rpc-riscv64-native:
+    runs-on: RISCV64
+
+    continue-on-error: true
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DGGML_RPC=ON
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
  ggml-ci-arm64-graviton4-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x

--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -728,58 +728,6 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz

-  openEuler-cann:
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
-          git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
-          cmake -S . -B build \
-              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-              -DGGML_CANN=on \
-              -DSOC_TYPE=ascend${{ matrix.chip_type }}
-          cmake --build build -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz -C ./build/bin .
-
-      - name: Upload artifacts (zip)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
-
-      - name: Upload artifacts (tar)
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz
-
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -801,7 +749,6 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
-      - openEuler-cann

    steps:
      - name: Clone
@ -869,6 +816,12 @@ jobs:
            > [!WARNING]
            > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.

+            <details open>
+
+            ${{ github.event.head_commit.message }}
+
+            </details>
+
            **macOS/iOS:**
            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
@ -887,18 +840,6 @@ jobs:
            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

-            **openEuler:**
-            - [openEuler x86 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-x86.tar.gz)
-            - [openEuler x86 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-x86.tar.gz)
-            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
-            - [openEuler aarch64 (910b)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64.tar.gz)
-
-            <details>
-
-            ${{ github.event.head_commit.message }}
-
-            </details>
-
      - name: Upload release
        id: upload_release
        uses: actions/github-script@v3
--- a/.gitignore
+++ b/.gitignore
@ -134,3 +134,5 @@ poetry.toml
 # IDE
 /*.code-workspace
 /.windsurf/
+# emscripten
+a.out.*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -33,10 +33,24 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)

-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+      add_compile_options("-sMEMORY64=1")
+      add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -58,6 +72,12 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

+if (LLAMA_STANDALONE)
+    # enable parallel builds for msbuild
+    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()
@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # build the library
 #
--- a/8
+++ b/8
@ -7,16 +7,19 @@
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/CMakeLists.txt                  @ggerganov
-/common/arg.*                           @ggerganov @ericcurtin
+/common/arg.*                           @ggerganov
 /common/base64.hpp.*                    @ggerganov
 /common/build-info.*                    @ggerganov
+/common/chat-peg-parser.*               @aldehir
 /common/common.*                        @ggerganov
 /common/console.*                       @ggerganov
 /common/http.*                          @angt
 /common/llguidance.*                    @ggerganov
 /common/log.*                           @ggerganov
+/common/peg-parser.*                    @aldehir
 /common/sampling.*                      @ggerganov
 /common/speculative.*                   @ggerganov
+/common/unicode.*                       @aldehir
 /convert_*.py                           @CISC
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
@ -87,8 +90,7 @@
 /tools/perplexity/                      @ggerganov
 /tools/quantize/                        @ggerganov
 /tools/rpc/                             @rgerganov
-/tools/run/                             @ericcurtin
-/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/*                         @ngxson @ggerganov # no subdir
 /tools/server/webui/                    @allozaur
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
--- a/ci/run.sh
+++ b/ci/run.sh
@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
    chat-parser.h
    chat-parser-xml-toolcall.h
    chat-parser-xml-toolcall.cpp
+    chat-peg-parser.cpp
+    chat-peg-parser.h
    chat.cpp
    chat.h
    common.cpp
@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
    log.h
    ngram-cache.cpp
    ngram-cache.h
+    peg-parser.cpp
+    peg-parser.h
    regex-partial.cpp
    regex-partial.h
    sampling.cpp
    sampling.h
    speculative.cpp
    speculative.h
+    unicode.cpp
+    unicode.h
    )

 if (BUILD_SHARED_LIBS)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -30,6 +30,7 @@
 #include <thread> // for hardware_concurrency
 #include <vector>

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@ -41,6 +42,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 using json = nlohmann::ordered_json;
@ -1226,7 +1229,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
@ -2495,12 +2498,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "path to save slot kv cache (default: disabled)",
        [](common_params & params, const std::string & value) {
            params.slot_save_path = value;
+            if (!fs_is_directory(params.slot_save_path)) {
+                throw std::invalid_argument("not a directory: " + value);
+            }
            // if doesn't end with DIRECTORY_SEPARATOR, add it
            if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
                params.slot_save_path += DIRECTORY_SEPARATOR;
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--media-path"}, "PATH",
+        "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.media_path = value;
+            if (!fs_is_directory(params.media_path)) {
+                throw std::invalid_argument("not a directory: " + value);
+            }
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                params.media_path += DIRECTORY_SEPARATOR;
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--models-dir"}, "PATH",
        "directory containing models for the router server (default: disabled)",
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@ -1,6 +1,8 @@
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "log.h"
+#include "peg-parser.h"
 #include "regex-partial.h"

 #include <algorithm>
@ -1483,6 +1485,11 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 }

 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+    }
    common_chat_msg_parser builder(input, is_partial, syntax);
    try {
        common_chat_parse(builder);
@ -1500,3 +1507,36 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
    }
    return msg;
 }
+
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (parser.empty()) {
+        throw std::runtime_error("Failed to parse due to missing parser definition.");
+    }
+
+    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+    common_peg_parse_context ctx(input, is_partial);
+    auto result = parser.parse(ctx);
+    if (result.fail()) {
+        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+    }
+
+    common_chat_msg msg;
+    msg.role = "assistant";
+
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+        auto mapper = common_chat_peg_native_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        auto mapper = common_chat_peg_constructed_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else {
+        // Generic mapper
+        auto mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    }
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
+    return msg;
+}
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -0,0 +1,114 @@
+#include "chat-peg-parser.h"
+
+#include <nlohmann/json.hpp>
+
+using json = nlohmann::json;
+
+static std::string_view trim_trailing_space(std::string_view sv) {
+    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+        sv.remove_suffix(1);
+    }
+    return sv;
+}
+
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+    arena.visit(result, [this](const common_peg_ast_node & node) {
+        map(node);
+    });
+}
+
+void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
+    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
+    bool is_content = node.tag == common_chat_peg_builder::CONTENT;
+
+    if (is_reasoning) {
+        result.reasoning_content = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_content) {
+        result.content = std::string(trim_trailing_space(node.text));
+    }
+}
+
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
+    common_chat_peg_mapper::map(node);
+
+    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+
+    if (is_tool_open) {
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back();
+    }
+
+    if (is_tool_id && current_tool) {
+        current_tool->id = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_name && current_tool) {
+        current_tool->name = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_args && current_tool) {
+        current_tool->arguments = std::string(trim_trailing_space(node.text));
+    }
+}
+
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
+    common_chat_peg_mapper::map(node);
+
+    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+    if (is_tool_open) {
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back();
+        arg_count = 0;
+    }
+
+    if (is_tool_name) {
+        current_tool->name = std::string(node.text);
+        current_tool->arguments = "{";
+    }
+
+    if (is_arg_open) {
+        needs_closing_quote = false;
+    }
+
+    if (is_arg_name && current_tool) {
+        if (arg_count > 0) {
+            current_tool->arguments += ",";
+        }
+        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+        ++arg_count;
+    }
+
+    if (is_arg_string && current_tool) {
+        // Serialize to JSON, but exclude the end quote
+        std::string dumped = json(node.text).dump();
+        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+        needs_closing_quote = true;
+    }
+
+    if (is_arg_close && current_tool) {
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+        }
+    }
+
+    if (is_arg_json && current_tool) {
+        current_tool->arguments += std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_close && current_tool) {
+        current_tool->arguments += "}";
+    }
+}
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@ -0,0 +1,105 @@
+#pragma once
+
+#include "chat.h"
+#include "peg-parser.h"
+
+class common_chat_peg_builder : public common_peg_parser_builder {
+  public:
+    static constexpr const char * REASONING_BLOCK = "reasoning-block";
+    static constexpr const char * REASONING = "reasoning";
+    static constexpr const char * CONTENT = "content";
+
+    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
+    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
+    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
+    common_chat_peg_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
+
+class common_chat_peg_mapper {
+  public:
+    common_chat_msg & result;
+
+    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
+
+    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+    virtual void map(const common_peg_ast_node & node);
+};
+
+class common_chat_peg_native_builder : public common_chat_peg_builder {
+  public:
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_ID = "tool-id";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARGS = "tool-args";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool;
+
+  public:
+    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
+    common_chat_peg_native_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder {
+  public:
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARG = "tool-arg";
+    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
+    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
+    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
+    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
+    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
+    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+};
+
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool;
+    int arg_count = 0;
+    bool needs_closing_quote = false;
+
+  public:
+    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
+    common_chat_peg_constructed_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -85,29 +85,36 @@ json common_chat_msg::to_json_oaicompat() const
    return message;
 }

-std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
+std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
    std::vector<common_chat_msg_diff> diffs;
-    if (previous_msg.reasoning_content != new_msg.reasoning_content) {
-        auto & diff = diffs.emplace_back();
-        diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
-    }
-    if (previous_msg.content != new_msg.content) {
-        auto & diff = diffs.emplace_back();
-        diff.content_delta = string_diff(previous_msg.content, new_msg.content);
+    if (msg_new.tool_calls.size() > msg_prv.tool_calls.size()) {
+        diffs.reserve(msg_new.tool_calls.size() - msg_prv.tool_calls.size() + 3);
+    } else {
+        diffs.reserve(3);
    }

-    if (new_msg.tool_calls.size() < previous_msg.tool_calls.size()) {
+    // TODO: these can become expensive for long messages - how to optimize?
+    if (msg_prv.reasoning_content != msg_new.reasoning_content) {
+        auto & diff = diffs.emplace_back();
+        diff.reasoning_content_delta = string_diff(msg_prv.reasoning_content, msg_new.reasoning_content);
+    }
+    if (msg_prv.content != msg_new.content) {
+        auto & diff = diffs.emplace_back();
+        diff.content_delta = string_diff(msg_prv.content, msg_new.content);
+    }
+
+    if (msg_new.tool_calls.size() < msg_prv.tool_calls.size()) {
        throw std::runtime_error("Invalid diff: now finding less tool calls!");
    }

-    if (!previous_msg.tool_calls.empty()) {
-        auto idx = previous_msg.tool_calls.size() - 1;
-        const auto & pref = previous_msg.tool_calls[idx];
-        const auto & newf = new_msg.tool_calls[idx];
+    if (!msg_prv.tool_calls.empty()) {
+        const auto idx = msg_prv.tool_calls.size() - 1;
+        const auto & pref = msg_prv.tool_calls[idx];
+        const auto & newf = msg_new.tool_calls[idx];
        if (pref.name != newf.name) {
            throw std::runtime_error("Invalid diff: tool call mismatch!");
        }
-        auto args_diff = string_diff(pref.arguments, newf.arguments);
+        const auto args_diff = string_diff(pref.arguments, newf.arguments);
        if (!args_diff.empty() || pref.id != newf.id) {
            auto & diff = diffs.emplace_back();
            diff.tool_call_index = idx;
@ -118,11 +125,12 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
            diff.tool_call_delta.arguments = args_diff;
        }
    }
-    for (size_t idx = previous_msg.tool_calls.size(); idx < new_msg.tool_calls.size(); ++idx) {
+    for (size_t idx = msg_prv.tool_calls.size(); idx < msg_new.tool_calls.size(); ++idx) {
        auto & diff = diffs.emplace_back();
        diff.tool_call_index = idx;
-        diff.tool_call_delta = new_msg.tool_calls[idx];
+        diff.tool_call_delta = msg_new.tool_calls[idx];
    }
+
    return diffs;
 }

@ -163,7 +171,7 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
    if (tool_choice == "required") {
        return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
    }
-    throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+    throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
 }

 bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
@ -186,17 +194,17 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
    try {

        if (!messages.is_array()) {
-            throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump());
+            throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
        }

        for (const auto & message : messages) {
            if (!message.is_object()) {
-                throw std::runtime_error("Expected 'message' to be an object, got " + message.dump());
+                throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
            }

            common_chat_msg msg;
            if (!message.contains("role")) {
-                throw std::runtime_error("Missing 'role' in message: " + message.dump());
+                throw std::invalid_argument("Missing 'role' in message: " + message.dump());
            }
            msg.role = message.at("role");

@ -209,11 +217,11 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
                } else if (content.is_array()) {
                    for (const auto & part : content) {
                        if (!part.contains("type")) {
-                            throw std::runtime_error("Missing content part type: " + part.dump());
+                            throw std::invalid_argument("Missing content part type: " + part.dump());
                        }
                        const auto & type = part.at("type");
                        if (type != "text") {
-                            throw std::runtime_error("Unsupported content part type: " + type.dump());
+                            throw std::invalid_argument("Unsupported content part type: " + type.dump());
                        }
                        common_chat_msg_content_part msg_part;
                        msg_part.type = type;
@ -221,25 +229,25 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
                        msg.content_parts.push_back(msg_part);
                    }
                } else if (!content.is_null()) {
-                    throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
+                    throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
                }
            }
            if (has_tool_calls) {
                for (const auto & tool_call : message.at("tool_calls")) {
                    common_chat_tool_call tc;
                    if (!tool_call.contains("type")) {
-                        throw std::runtime_error("Missing tool call type: " + tool_call.dump());
+                        throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
                    }
                    const auto & type = tool_call.at("type");
                    if (type != "function") {
-                        throw std::runtime_error("Unsupported tool call type: " + tool_call.dump());
+                        throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
                    }
                    if (!tool_call.contains("function")) {
-                        throw std::runtime_error("Missing tool call function: " + tool_call.dump());
+                        throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
                    }
                    const auto & fc = tool_call.at("function");
                    if (!fc.contains("name")) {
-                        throw std::runtime_error("Missing tool call name: " + tool_call.dump());
+                        throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
                    }
                    tc.name = fc.at("name");
                    tc.arguments = fc.at("arguments");
@ -250,7 +258,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
                }
            }
            if (!has_content && !has_tool_calls) {
-                throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
+                throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
            }
            if (message.contains("reasoning_content")) {
                msg.reasoning_content = message.at("reasoning_content");
@ -353,18 +361,18 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    try {
        if (!tools.is_null()) {
            if (!tools.is_array()) {
-                throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump());
+                throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
            }
            for (const auto & tool : tools) {
                if (!tool.contains("type")) {
-                    throw std::runtime_error("Missing tool type: " + tool.dump());
+                    throw std::invalid_argument("Missing tool type: " + tool.dump());
                }
                const auto & type = tool.at("type");
                if (!type.is_string() || type != "function") {
-                    throw std::runtime_error("Unsupported tool type: " + tool.dump());
+                    throw std::invalid_argument("Unsupported tool type: " + tool.dump());
                }
                if (!tool.contains("function")) {
-                    throw std::runtime_error("Missing tool function: " + tool.dump());
+                    throw std::invalid_argument("Missing tool function: " + tool.dump());
                }

                const auto & function = tool.at("function");
@ -649,6 +657,9 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
        case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
        case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+        case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
+        case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
+        case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
        default:
            throw std::runtime_error("Unknown chat format");
    }
--- a/common/chat.h
+++ b/common/chat.h
@ -3,6 +3,7 @@
 #pragma once

 #include "common.h"
+#include "peg-parser.h"
 #include <functional>
 #include <chrono>
 #include <string>
@ -76,7 +77,7 @@ struct common_chat_msg_diff {
    size_t tool_call_index = std::string::npos;
    common_chat_tool_call tool_call_delta;

-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);

    bool operator==(const common_chat_msg_diff & other) const {
        return content_delta == other.content_delta
@ -124,6 +125,11 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_APRIEL_1_5,
    COMMON_CHAT_FORMAT_XIAOMI_MIMO,

+    // These are intended to be parsed by the PEG parser
+    COMMON_CHAT_FORMAT_PEG_SIMPLE,
+    COMMON_CHAT_FORMAT_PEG_NATIVE,
+    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
+
    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

@ -154,6 +160,7 @@ struct common_chat_params {
    std::vector<common_grammar_trigger> grammar_triggers;
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
+    std::string                         parser;
 };

 struct common_chat_syntax {
@ -163,6 +170,7 @@ struct common_chat_syntax {
    bool                     reasoning_in_content  = false;
    bool                     thinking_forced_open  = false;
    bool                     parse_tool_calls      = true;
+    common_peg_arena         parser                = {};
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@ -206,6 +214,7 @@ const char*               common_chat_format_name(common_chat_format format);
 const char*               common_reasoning_format_name(common_reasoning_format format);
 common_reasoning_format   common_reasoning_format_from_name(const std::string & format);
 common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

--- a/common/common.cpp
+++ b/common/common.cpp
@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over

 // Validate if a filename is safe to use
 // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool fs_validate_filename(const std::string & filename) {
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
    if (!filename.length()) {
        // Empty filename invalid
        return false;
@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
            || c == 0xFFFD // Replacement Character (UTF-8)
            || c == 0xFEFF // Byte Order Mark (BOM)
-            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
+        if (!allow_subdirs && (c == '/' || c == '\\')) {
+            // Subdirectories not allowed, reject path separators
+            return false;
+        }
    }

    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@ -859,6 +863,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
 #endif // _WIN32
 }

+bool fs_is_directory(const std::string & path) {
+    std::filesystem::path dir(path);
+    return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
+}
+
 std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
@ -893,6 +902,8 @@ std::string fs_get_cache_directory() {
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
        cache_directory = std::getenv("LOCALAPPDATA");
+#elif defined(__EMSCRIPTEN__)
+        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif
--- a/common/common.h
+++ b/common/common.h
@ -12,6 +12,10 @@
 #include <vector>
 #include <map>

+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+#define _WIN32_WINNT 0x0A00
+#endif
+
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
@ -490,6 +494,7 @@ struct common_params {
    bool log_json = false;

    std::string slot_save_path;
+    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

@ -640,8 +645,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 // Filesystem utils
 //

-bool fs_validate_filename(const std::string & filename);
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
 bool fs_create_directory_with_parents(const std::string & path);
+bool fs_is_directory(const std::string & path);

 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
--- a/common/download.cpp
+++ b/common/download.cpp
@ -24,6 +24,7 @@
 #include "http.h"
 #endif

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@ -35,6 +36,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif
+
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 // isatty
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@ -974,7 +974,7 @@ public:

    void check_errors() {
        if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
        }
        if (!_warnings.empty()) {
            fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
--- a/common/peg-parser.cpp
+++ b/common/peg-parser.cpp
--- a/common/peg-parser.h
+++ b/common/peg-parser.h
@ -0,0 +1,459 @@
+#pragma once
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <memory>
+#include <unordered_map>
+#include <string>
+#include <string_view>
+#include <functional>
+#include <vector>
+#include <variant>
+
+struct common_grammar_builder;
+
+class common_peg_parser_builder;
+
+using common_peg_parser_id = size_t;
+constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
+
+using common_peg_ast_id = size_t;
+constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
+
+// Lightweight wrapper around common_peg_parser_id for convenience
+class common_peg_parser {
+    common_peg_parser_id id_;
+    common_peg_parser_builder & builder_;
+
+  public:
+    common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
+    common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
+
+    common_peg_parser & operator=(const common_peg_parser & other);
+    common_peg_parser & operator+=(const common_peg_parser & other);
+    common_peg_parser & operator|=(const common_peg_parser & other);
+
+    operator common_peg_parser_id() const { return id_; }
+    common_peg_parser_id id() const { return id_; }
+
+    common_peg_parser_builder & builder() const { return builder_; }
+
+    // Creates a sequence
+    common_peg_parser operator+(const common_peg_parser & other) const;
+
+    // Creates a sequence separated by spaces.
+    common_peg_parser operator<<(const common_peg_parser & other) const;
+
+    // Creates a choice
+    common_peg_parser operator|(const common_peg_parser & other) const;
+
+    common_peg_parser operator+(const char * str) const;
+    common_peg_parser operator+(const std::string & str) const;
+    common_peg_parser operator<<(const char * str) const;
+    common_peg_parser operator<<(const std::string & str) const;
+    common_peg_parser operator|(const char * str) const;
+    common_peg_parser operator|(const std::string & str) const;
+};
+
+common_peg_parser operator+(const char * str, const common_peg_parser & p);
+common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator<<(const char * str, const common_peg_parser & p);
+common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator|(const char * str, const common_peg_parser & p);
+common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
+
+enum common_peg_parse_result_type {
+    COMMON_PEG_PARSE_RESULT_FAIL            = 0,
+    COMMON_PEG_PARSE_RESULT_SUCCESS         = 1,
+    COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
+};
+
+const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
+
+struct common_peg_ast_node {
+    common_peg_ast_id id;
+    std::string rule;
+    std::string tag;
+    size_t start;
+    size_t end;
+    std::string_view text;
+    std::vector<common_peg_ast_id> children;
+
+    bool is_partial = false;
+};
+
+struct common_peg_parse_result;
+
+using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
+
+class common_peg_ast_arena {
+    std::vector<common_peg_ast_node> nodes_;
+  public:
+    common_peg_ast_id add_node(
+        const std::string & rule,
+        const std::string & tag,
+        size_t start,
+        size_t end,
+        std::string_view text,
+        std::vector<common_peg_ast_id> children,
+        bool is_partial = false
+    ) {
+        common_peg_ast_id id = nodes_.size();
+        nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
+        return id;
+    }
+
+    const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
+
+    size_t size() const { return nodes_.size(); }
+
+    void clear() { nodes_.clear(); }
+
+    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
+    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
+};
+
+struct common_peg_parse_result {
+    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
+    size_t start = 0;
+    size_t end = 0;
+
+    std::vector<common_peg_ast_id> nodes;
+
+    common_peg_parse_result() = default;
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start)
+        : type(type), start(start), end(start) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
+        : type(type), start(start), end(end) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
+        : type(type), start(start), end(end), nodes(std::move(nodes)) {}
+
+    bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
+    bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
+    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
+};
+
+struct common_peg_parse_context {
+    std::string input;
+    bool is_partial;
+    common_peg_ast_arena ast;
+
+    int parse_depth;
+
+    common_peg_parse_context()
+        : is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input)
+        : input(input), is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input, bool is_partial)
+        : input(input), is_partial(is_partial), parse_depth(0) {}
+};
+
+class common_peg_arena;
+
+// Parser variants
+struct common_peg_epsilon_parser {};
+
+struct common_peg_start_parser {};
+
+struct common_peg_end_parser {};
+
+struct common_peg_literal_parser {
+    std::string literal;
+};
+
+struct common_peg_sequence_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_choice_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_repetition_parser {
+    common_peg_parser_id child;
+    int min_count;
+    int max_count;  // -1 for unbounded
+};
+
+struct common_peg_and_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_not_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_any_parser {};
+
+struct common_peg_space_parser {};
+
+struct common_peg_chars_parser {
+    struct char_range {
+        uint32_t start;
+        uint32_t end;
+        bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
+    };
+
+    std::string pattern;
+    std::vector<char_range> ranges;
+    bool negated;
+    int min_count;
+    int max_count;  // -1 for unbounded
+};
+
+struct common_peg_json_string_parser {};
+
+struct common_peg_until_parser {
+    std::vector<std::string> delimiters;
+};
+
+struct common_peg_schema_parser {
+    common_peg_parser_id child;
+    std::string name;
+    std::shared_ptr<nlohmann::ordered_json> schema;
+
+    // Indicates if the GBNF should accept a raw string that matches the schema.
+    bool raw;
+};
+
+struct common_peg_rule_parser {
+    std::string name;
+    common_peg_parser_id child;
+    bool trigger;
+};
+
+struct common_peg_ref_parser {
+    std::string name;
+};
+
+struct common_peg_atomic_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_tag_parser {
+    common_peg_parser_id child;
+    std::string tag;
+};
+
+// Variant holding all parser types
+using common_peg_parser_variant = std::variant<
+    common_peg_epsilon_parser,
+    common_peg_start_parser,
+    common_peg_end_parser,
+    common_peg_literal_parser,
+    common_peg_sequence_parser,
+    common_peg_choice_parser,
+    common_peg_repetition_parser,
+    common_peg_and_parser,
+    common_peg_not_parser,
+    common_peg_any_parser,
+    common_peg_space_parser,
+    common_peg_chars_parser,
+    common_peg_json_string_parser,
+    common_peg_until_parser,
+    common_peg_schema_parser,
+    common_peg_rule_parser,
+    common_peg_ref_parser,
+    common_peg_atomic_parser,
+    common_peg_tag_parser
+>;
+
+class common_peg_arena {
+    std::vector<common_peg_parser_variant> parsers_;
+    std::unordered_map<std::string, common_peg_parser_id> rules_;
+    common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
+
+  public:
+    const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
+    common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
+
+    size_t size() const { return parsers_.size(); }
+    bool empty() const { return parsers_.empty(); }
+
+    common_peg_parser_id get_rule(const std::string & name) const;
+    bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
+
+    common_peg_parser_id root() const { return root_; }
+    void set_root(common_peg_parser_id id) { root_ = id; }
+
+    common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
+    common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
+
+    void resolve_refs();
+
+    void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
+
+    std::string dump(common_peg_parser_id id) const;
+
+    nlohmann::json to_json() const;
+    static common_peg_arena from_json(const nlohmann::json & j);
+
+    std::string save() const;
+    void load(const std::string & data);
+
+    friend class common_peg_parser_builder;
+
+  private:
+    common_peg_parser_id add_parser(common_peg_parser_variant parser);
+    void add_rule(const std::string & name, common_peg_parser_id id);
+
+    common_peg_parser_id resolve_ref(common_peg_parser_id id);
+};
+
+class common_peg_parser_builder {
+    common_peg_arena arena_;
+
+    common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
+    common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
+
+  public:
+    common_peg_parser_builder();
+
+    // Match nothing, always succeed.
+    //   S -> ε
+    common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
+
+    // Matches the start of the input.
+    //   S -> ^
+    common_peg_parser start() { return add(common_peg_start_parser{}); }
+
+    // Matches the end of the input.
+    //   S -> $
+    common_peg_parser end() { return add(common_peg_end_parser{}); }
+
+    // Matches an exact literal string.
+    //   S -> "hello"
+    common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
+
+    // Matches a sequence of parsers in order, all must succeed.
+    //   S -> A B C
+    common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
+    common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
+    common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
+    common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
+
+    // Matches the first parser that succeeds from a list of alternatives.
+    //   S -> A | B | C
+    common_peg_parser choice() { return add(common_peg_choice_parser{}); }
+    common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
+    common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
+    common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
+
+    // Matches one or more repetitions of a parser.
+    //   S -> A+
+    common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
+
+    // Matches zero or more repetitions of a parser, always succeeds.
+    //   S -> A*
+    common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
+
+    // Matches zero or one occurrence of a parser, always succeeds.
+    //   S -> A?
+    common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
+
+    // Positive lookahead: succeeds if child parser succeeds, consumes no input.
+    //   S -> &A
+    common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
+
+    // Negative lookahead: succeeds if child parser fails, consumes no input.
+    //   S -> !A
+    common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
+
+    // Matches any single character.
+    //   S -> .
+    common_peg_parser any() { return add(common_peg_any_parser{}); }
+
+    // Matches between min and max repetitions of characters from a character class.
+    //   S -> [a-z]{m,n}
+    //
+    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+    common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
+
+    // Creates a lightweight reference to a named rule (resolved during build()).
+    // Use this for forward references in recursive grammars.
+    //   expr_ref -> expr
+    common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
+
+    // Matches zero or more whitespace characters (space, tab, newline).
+    //   S -> [ \t\n]*
+    common_peg_parser space() { return add(common_peg_space_parser{}); }
+
+    // Matches all characters until a delimiter is found (delimiter not consumed).
+    //   S -> (!delim .)*
+    common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
+
+    // Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
+    //   S -> (!delim .)*
+    common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
+
+    // Matches everything
+    //   S -> .*
+    common_peg_parser rest() { return until_one_of({}); }
+
+    // Matches between min and max repetitions of a parser (inclusive).
+    //   S -> A{m,n}
+    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
+    common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
+
+    // Matches exactly n repetitions of a parser.
+    //   S -> A{n}
+    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
+
+    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
+    //   value -> object | array | string | number | true | false | null
+    common_peg_parser json();
+    common_peg_parser json_object();
+    common_peg_parser json_string();
+    common_peg_parser json_array();
+    common_peg_parser json_number();
+    common_peg_parser json_bool();
+    common_peg_parser json_null();
+
+    // Matches JSON string content without the surrounding quotes.
+    // Useful for extracting content within a JSON string.
+    common_peg_parser json_string_content();
+
+    // Matches a JSON object member with a key and associated parser as the
+    // value.
+    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
+
+    // Wraps a parser with JSON schema metadata for grammar generation.
+    // Used internally to convert JSON schemas to GBNF grammar rules.
+    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
+
+    // Creates a named rule, stores it in the grammar, and returns a ref.
+    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+    //   auto json = p.rule("json", json_obj | json_arr | ...)
+    common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
+
+    // Creates a named rule using a builder function, and returns a ref.
+    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
+    //   auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
+    common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
+
+    // Creates a trigger rule. When generating a lazy grammar from the parser,
+    // only trigger rules and descendents are emitted.
+    common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
+    common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
+
+    // Creates an atomic parser. Atomic parsers do not create an AST node if
+    // the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
+    // intended for situations where partial output is undesirable.
+    common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
+
+    // Tags create nodes in the generated AST for semantic purposes.
+    // Unlike rules, you can tag multiple nodes with the same tag.
+    common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
+
+    void set_root(const common_peg_parser & p);
+
+    common_peg_arena build();
+};
+
+// Helper function for building parsers
+common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
--- a/common/unicode.cpp
+++ b/common/unicode.cpp
@ -0,0 +1,64 @@
+#include "unicode.h"
+
+// implementation adopted from src/unicode.cpp
+
+size_t utf8_sequence_length(unsigned char first_byte) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
+    return lookup[highbits];
+}
+
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
+    if (offset >= input.size()) {
+        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+    }
+
+    // ASCII fast path
+    if (!(input[offset] & 0x80)) {
+        return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
+    }
+
+    // Invalid: continuation byte as first byte
+    if (!(input[offset] & 0x40)) {
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    }
+
+    // 2-byte sequence
+    if (!(input[offset] & 0x20)) {
+        if (offset + 1 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
+    }
+
+    // 3-byte sequence
+    if (!(input[offset] & 0x10)) {
+        if (offset + 2 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
+    }
+
+    // 4-byte sequence
+    if (!(input[offset] & 0x08)) {
+        if (offset + 3 >= input.size()) {
+            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+        }
+        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
+        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
+    }
+
+    // Invalid first byte
+    return utf8_parse_result(utf8_parse_result::INVALID);
+}
--- a/common/unicode.h
+++ b/common/unicode.h
@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+
+// UTF-8 parsing utilities for streaming-aware unicode support
+
+struct utf8_parse_result {
+    uint32_t codepoint;      // Decoded codepoint (only valid if status == SUCCESS)
+    size_t bytes_consumed;   // How many bytes this codepoint uses (1-4)
+    enum status { SUCCESS, INCOMPLETE, INVALID } status;
+
+    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
+        : codepoint(cp), bytes_consumed(bytes), status(s) {}
+};
+
+// Determine the expected length of a UTF-8 sequence from its first byte
+// Returns 0 for invalid first bytes
+size_t utf8_sequence_length(unsigned char first_byte);
+
+// Parse a single UTF-8 codepoint from input
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
--- a/docs/build.md
+++ b/docs/build.md
@ -431,11 +431,22 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren

 ### For Linux users:

+#### Using the LunarG Vulkan SDK
+
 First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.

 > [!IMPORTANT]
 > After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.

+#### Using system packages
+
+On Debian / Ubuntu, you can install the required dependencies using:
+```sh
+sudo apt-get install libvulkan-dev glslc
+```
+
+#### Common steps
+
 Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
 ```bash
 vulkaninfo
--- a/docs/development/parsing.md
+++ b/docs/development/parsing.md
@ -0,0 +1,288 @@
+# Parsing Model Output
+
+The `common` library contains a PEG parser implementation suitable for parsing
+model output.
+
+Types with the prefix `common_peg_*` are intended for general use and may have
+applications beyond parsing model output, such as parsing user-provided regex
+patterns.
+
+Types with the prefix `common_chat_peg_*` are specialized helpers for model
+output.
+
+The parser features:
+
+- Partial parsing of streaming input
+- Built-in JSON parsers
+- AST generation with semantics via "tagged" nodes
+
+## Example
+
+Below is a contrived example demonstrating how to use the PEG parser to parse
+output from a model that emits arguments as JSON.
+
+```cpp
+auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+    // Build a choice of all available tools
+    auto tool_choice = p.choice();
+    for (const auto & tool : tools) {
+        const auto & function = tool.at("function");
+        std::string name = function.at("name");
+        const auto & schema = function.at("parameters");
+
+        auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
+        auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));
+
+        tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
+    }
+
+    // Define the tool call structure: <tool_call>[{tool}]</tool_call>
+    auto tool_call = p.trigger_rule("tool-call",
+        p.sequence({
+            p.literal("<tool_call>["),
+            tool_choice,
+            p.literal("]</tool_call>")
+        })
+    );
+
+    // Parser accepts content, optionally followed by a tool call
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.optional(tool_call),
+        p.end()
+    });
+});
+```
+
+For a more complete example, see `test_example_native()` in
+[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
+
+## Parsers/Combinators
+
+### Basic Matchers
+
+- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
+- **`start()`** - Matches the start of input (anchor `^`)
+- **`end()`** - Matches the end of input (anchor `$`)
+- **`literal(string)`** - Matches an exact literal string
+- **`any()`** - Matches any single character (`.`)
+
+### Combinators
+
+- **`sequence(...)`** - Matches parsers in order; all must succeed
+- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
+- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
+- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
+- **`optional(p)`** - Matches zero or one occurrence (`?`)
+- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
+- **`repeat(p, n)`** - Matches exactly n repetitions
+
+### Lookahead
+
+- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
+- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)
+
+### Character Classes & Utilities
+
+- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
+- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
+- **`until(delimiter)`** - Matches characters until delimiter is found (delimiter not consumed)
+- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
+- **`rest()`** - Matches everything remaining (`.*`)
+
+### JSON Parsers
+
+- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
+- **`json_object()`** - JSON object parser
+- **`json_array()`** - JSON array parser
+- **`json_string()`** - JSON string parser
+- **`json_number()`** - JSON number parser
+- **`json_bool()`** - JSON boolean parser
+- **`json_null()`** - JSON null parser
+- **`json_string_content()`** - JSON string content without surrounding quotes
+- **`json_member(key, p)`** - JSON object member with specific key and value parser
+
+### Grammar Building
+
+- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
+- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
+- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
+- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
+
+### AST Control
+
+- **`atomic(p)`** - Prevents AST node creation for partial parses
+- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)
+
+## GBNF Grammar Generation
+
+The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
+some exceptions.
+
+```cpp
+data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+    foreach_function(params.tools, [&](const json & fn) {
+        builder.resolve_refs(fn.at("parameters"));
+    });
+    parser.build_grammar(builder, data.grammar_lazy);
+});
+```
+
+The notable exception is the `negate(p)` lookahead parser, which cannot be
+defined as a CFG grammar and therefore does not produce a rule. Its usage
+should be limited and preferably hidden behind a `schema()` parser. In many
+cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.
+
+Another limitation is that the PEG parser requires an unambiguous grammar. In
+contrast, the `llama-grammar` implementation can support ambiguous grammars,
+though they are difficult to parse.
+
+### Lazy Grammars
+
+During lazy grammar generation, only rules reachable from a `trigger_rule(p)`
+are emitted in the grammar. All trigger rules are added as alternations in the
+root rule. It is still necessary to define trigger patterns, as the parser has
+no interaction with the grammar sampling.
+
+### JSON Schema
+
+The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
+implementation to generate the grammar instead of the underlying parser.
+
+The `raw` option emits a grammar suitable for a raw string instead of a JSON
+string. In other words, it won't be wrapped in quotes or require escaping
+quotes. It should only be used when `type == "string"`.
+
+The downside is that it can potentially lead to ambiguous grammars. For
+example, if a user provides the pattern `^.*$`, the following grammar may be
+generated:
+
+```
+root ::= "<arg>" .* "</arg>"
+```
+
+This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
+help mitigate this, if `.*` is found in the pattern, the grammar from the
+underlying parser will be emitted instead.
+
+## Common AST Shapes for Chat Parsing
+
+Most model output can be placed in one of the following categories:
+
+- Content only
+- Tool calling with arguments emitted as a single JSON object
+- Tool calling with arguments emitted as separate entities, either XML
+  (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
+
+To provide broad coverage,
+[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
+mappers that help create parsers and visitors/extractors for these types. They
+require parsers to tag nodes to conform to an AST "shape". This normalization
+makes it easy to extract information and generalize parsing.
+
+### Simple
+
+The `common_chat_peg_builder` builds a `simple` parser that supports
+content-only models with optional reasoning.
+
+- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
+- **`content(p)`** - Tag node for extracting `content`
+
+```cpp
+build_chat_peg_parser([&](common_chat_peg_parser & p) {
+    return p.sequence({
+        p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
+        p.content(p.until("<tool_call>")),
+        p.end()
+    });
+});
+```
+
+Use `common_chat_peg_mapper` to extract the content. Note that this is already
+done for you in `common_chat_peg_parser` when
+`chat_format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.
+
+```cpp
+auto result = parser.parse(ctx);
+
+common_chat_msg msg;
+auto mapper = common_chat_peg_mapper(msg);
+mapper.from_ast(ctx.ast, result);
+```
+
+### Native
+
+The `common_chat_peg_native_builder` builds a `native` parser suitable for
+models that emit tool arguments as a direct JSON object.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_id(p)`** - Tag the tool call ID (optional)
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_args(p)`** - Tag the tool arguments
+
+```cpp
+build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
+    auto get_weather_tool = p.tool(p.sequence({
+        p.tool_open(p.literal("{")),
+        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
+        p.literal(","),
+        p.json_member("arguments", p.tool_args(p.json())),
+        p.tool_close(p.literal("}"))
+    }));
+
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.literal("<tool_call>"),
+        get_weather_tool,
+        p.literal("</tool_call>"),
+        p.end()
+    });
+});
+```
+
+### Constructed
+
+The `common_chat_peg_constructed_builder` builds a `constructed` parser
+suitable for models that emit tool arguments as separate entities, such as XML
+tags.
+
+- **`reasoning(p)`** - Tag node for `reasoning_content`
+- **`content(p)`** - Tag node for `content`
+- **`tool(p)`** - Tag entirety of a single tool call
+- **`tool_open(p)`** - Tag start of a tool call
+- **`tool_close(p)`** - Tag end of a tool call
+- **`tool_name(p)`** - Tag the tool name
+- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
+- **`tool_arg_open(p)`** - Tag start of a tool argument
+- **`tool_arg_close(p)`** - Tag end of a tool argument
+- **`tool_arg_name(p)`** - Tag the argument name
+- **`tool_arg_string_value(p)`** - Tag string value for the argument
+- **`tool_arg_json_value(p)`** - Tag JSON value for the argument
+
+```cpp
+build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+    auto location_arg = p.tool_arg(
+        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
+        p.tool_arg_string_value(p.until("</parameter>")),
+        p.tool_arg_close(p.literal("</parameter>"))
+    );
+
+    auto get_weather_tool = p.tool(p.sequence({
+        p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
+        location_arg,
+        p.tool_close(p.literal("</function>"))
+    }));
+
+    return p.sequence({
+        p.content(p.until("<tool_call>")),
+        p.literal("<tool_call>"),
+        get_weather_tool,
+        p.literal("</tool_call>"),
+        p.end()
+    });
+});
+```
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -175,11 +175,6 @@ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requi
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")

-
-if (MINGW)
-    set(GGML_WIN_VER "0xA00" CACHE STRING   "ggml: Windows version")
-endif()
-
 # ggml core
 set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
@ -226,7 +221,7 @@ option(GGML_WEBGPU                          "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
 option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
-
+option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
@ -408,62 +403,67 @@ if (MSVC)
        /wd4996  # Disable POSIX deprecation warnings
        /wd4702  # Unreachable code warnings
    )
-    function(disable_msvc_warnings target_name)
+    set(MSVC_COMPILE_OPTIONS
+        "$<$<COMPILE_LANGUAGE:C>:/utf-8>"
+        "$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
+    )
+    function(configure_msvc_target target_name)
        if(TARGET ${target_name})
            target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+            target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
        endif()
    endfunction()

-    disable_msvc_warnings(ggml-base)
-    disable_msvc_warnings(ggml)
-    disable_msvc_warnings(ggml-cpu)
-    disable_msvc_warnings(ggml-cpu-x64)
-    disable_msvc_warnings(ggml-cpu-sse42)
-    disable_msvc_warnings(ggml-cpu-sandybridge)
-    disable_msvc_warnings(ggml-cpu-haswell)
-    disable_msvc_warnings(ggml-cpu-skylakex)
-    disable_msvc_warnings(ggml-cpu-icelake)
-    disable_msvc_warnings(ggml-cpu-alderlake)
+    configure_msvc_target(ggml-base)
+    configure_msvc_target(ggml)
+    configure_msvc_target(ggml-cpu)
+    configure_msvc_target(ggml-cpu-x64)
+    configure_msvc_target(ggml-cpu-sse42)
+    configure_msvc_target(ggml-cpu-sandybridge)
+    configure_msvc_target(ggml-cpu-haswell)
+    configure_msvc_target(ggml-cpu-skylakex)
+    configure_msvc_target(ggml-cpu-icelake)
+    configure_msvc_target(ggml-cpu-alderlake)

    if (GGML_BUILD_EXAMPLES)
-        disable_msvc_warnings(common-ggml)
-        disable_msvc_warnings(common)
+        configure_msvc_target(common-ggml)
+        configure_msvc_target(common)

-        disable_msvc_warnings(mnist-common)
-        disable_msvc_warnings(mnist-eval)
-        disable_msvc_warnings(mnist-train)
+        configure_msvc_target(mnist-common)
+        configure_msvc_target(mnist-eval)
+        configure_msvc_target(mnist-train)

-        disable_msvc_warnings(gpt-2-ctx)
-        disable_msvc_warnings(gpt-2-alloc)
-        disable_msvc_warnings(gpt-2-backend)
-        disable_msvc_warnings(gpt-2-sched)
-        disable_msvc_warnings(gpt-2-quantize)
-        disable_msvc_warnings(gpt-2-batched)
+        configure_msvc_target(gpt-2-ctx)
+        configure_msvc_target(gpt-2-alloc)
+        configure_msvc_target(gpt-2-backend)
+        configure_msvc_target(gpt-2-sched)
+        configure_msvc_target(gpt-2-quantize)
+        configure_msvc_target(gpt-2-batched)

-        disable_msvc_warnings(gpt-j)
-        disable_msvc_warnings(gpt-j-quantize)
+        configure_msvc_target(gpt-j)
+        configure_msvc_target(gpt-j-quantize)

-        disable_msvc_warnings(magika)
-        disable_msvc_warnings(yolov3-tiny)
-        disable_msvc_warnings(sam)
+        configure_msvc_target(magika)
+        configure_msvc_target(yolov3-tiny)
+        configure_msvc_target(sam)

-        disable_msvc_warnings(simple-ctx)
-        disable_msvc_warnings(simple-backend)
+        configure_msvc_target(simple-ctx)
+        configure_msvc_target(simple-backend)
    endif()

    if (GGML_BUILD_TESTS)
-        disable_msvc_warnings(test-mul-mat)
-        disable_msvc_warnings(test-arange)
-        disable_msvc_warnings(test-backend-ops)
-        disable_msvc_warnings(test-cont)
-        disable_msvc_warnings(test-conv-transpose)
-        disable_msvc_warnings(test-conv-transpose-1d)
-        disable_msvc_warnings(test-conv1d)
-        disable_msvc_warnings(test-conv2d)
-        disable_msvc_warnings(test-conv2d-dw)
-        disable_msvc_warnings(test-customop)
-        disable_msvc_warnings(test-dup)
-        disable_msvc_warnings(test-opt)
-        disable_msvc_warnings(test-pool)
+        configure_msvc_target(test-mul-mat)
+        configure_msvc_target(test-arange)
+        configure_msvc_target(test-backend-ops)
+        configure_msvc_target(test-cont)
+        configure_msvc_target(test-conv-transpose)
+        configure_msvc_target(test-conv-transpose-1d)
+        configure_msvc_target(test-conv1d)
+        configure_msvc_target(test-conv2d)
+        configure_msvc_target(test-conv2d-dw)
+        configure_msvc_target(test-customop)
+        configure_msvc_target(test-dup)
+        configure_msvc_target(test-opt)
+        configure_msvc_target(test-pool)
    endif ()
 endif()
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -204,6 +204,10 @@
 #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+#    define _WIN32_WINNT 0x0A00
+#endif
+
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
@ -2279,7 +2283,7 @@ extern "C" {
            float                 stop,
            float                 step);

-#define GGML_KQ_MASK_PAD 64
+#define GGML_KQ_MASK_PAD 1

    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -127,10 +127,6 @@ if (NOT MSVC)
    endif()
 endif()

-if (MINGW)
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # POSIX conformance
 #
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@ -1240,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                                tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                                ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                            }
-                            if (sched->n_copies > 1) {
                            ggml_set_input(tensor_copy);
                            ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
-                            }
                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                            SET_CAUSE(tensor_copy, "4.cpy");
                        }
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@ -2564,6 +2564,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
            return true;
        case GGML_OP_OUT_PROD:
            {
+#ifdef ASCEND_310P
+                // Ger is not supported on 310p device
+                return false;
+#endif
                switch (op->src[0]->type) {
                    case GGML_TYPE_F16:
                    case GGML_TYPE_F32:
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
 }

 #if defined(__ARM_ARCH)
-
-#if defined(__linux__) && defined(__aarch64__)
-#include <sys/auxv.h>
-#endif
-
-static void ggml_init_arm_arch_features(void) {
 #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
-#if defined(__linux__)
-    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-#else
-    // TODO: add support of SVE for non-linux systems
-#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
-#endif
-#endif
+#include <arm_sve.h>
+static void ggml_init_arm_arch_features(void) {
+    ggml_arm_arch_features.sve_cnt = svcntb();
 }
-
+#else
+static void ggml_init_arm_arch_features(void) {}
+#endif
 #endif // __ARM_ARCH

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
@ -2706,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
        n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
    }

+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
    size_t work_size = 0;

    struct ggml_cplan cplan;
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
                                        const int64_t iid = iod*s2 + ikd*d2 - p2;

-                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
                                        } else {
                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@ -25,7 +25,7 @@ typedef void (* fattn_kernel_t)(
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3   ne01, const int32_t ne02, const int32_t ne03,
                            const int32_t nb01, const int32_t nb02, const int32_t nb03,
        const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
                            const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -621,7 +621,8 @@ static __global__ void flash_attn_mask_to_KV_max(
 template<int D, int ncols1, int ncols2> // D == head size
 __launch_bounds__(D, 1)
 static __global__ void flash_attn_stream_k_fixup(
-        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) {
+        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11,
+        const int nbatch_fa) {
    constexpr int ncols = ncols1*ncols2;

    const int bidx0 = blockIdx.x;
@ -632,7 +633,7 @@ static __global__ void flash_attn_stream_k_fixup(

    const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols);

-    const int iter_k = ne11 / FATTN_KQ_STRIDE;
+    const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
    const int iter_j = (ne01 + (ncols1    - 1)) / ncols1;

    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
@ -765,7 +766,7 @@ static __global__ void flash_attn_combine_results(
 template <int DV, int ncols1, int ncols2>
 void launch_fattn(
    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
-    const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
+    const int nbatch_fa, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
 ) {
    constexpr int ncols = ncols1 * ncols2;

@ -790,8 +791,6 @@ void launch_fattn(
    GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));

    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
-    GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
-        "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");

    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t main_stream = ctx.stream();
@ -915,7 +914,7 @@ void launch_fattn(

        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
    } else {
-        const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+        const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.

        // parallel_blocks must not be larger than what the tensor size allows:
        parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
@ -970,6 +969,9 @@ void launch_fattn(
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+    // TODO other tensor dimensions after removal of WMMA kernel:
+    const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
+
    GGML_ASSERT(block_dim.x % warp_size == 0);
    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
        (const char *) Q->data,
@ -980,7 +982,7 @@ void launch_fattn(
        KV_max.ptr,
        !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
-        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
+        Q->ne[0], ne01,     Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
        nb21, nb22, nb23,
        mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
@ -995,7 +997,7 @@ void launch_fattn(

            flash_attn_stream_k_fixup<DV, ncols1, ncols2>
                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]);
+                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], nbatch_fa);
        }
    } else if (parallel_blocks > 1) {
        const dim3 block_dim_combine(DV, 1, 1);
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@ -501,6 +501,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
        const half2 * const __restrict__ K_h2,
        const half2 * const __restrict__ V_h2,
        const half  * const __restrict__ mask,
+        const uint3 ne01,
        const float logit_softcap,
        const float slope,
        T_KQ      * const KQ,
@ -512,7 +513,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
        float * const KQ_sum,
        T_acc * const VKQ,
        const int k_VKQ_0,
-        const int k_VKQ_max) {
+        const int k_VKQ_max,
+        const int col_Q_0) {
    constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
    constexpr int cpy_ne = cpy_nb / 4;

@ -556,7 +558,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
    // Apply logit softcap + mask, update KQ_max:
 #pragma unroll
    for (int jc0 = 0; jc0 < cpw; ++jc0) {
-        const int j = (jc0 + (threadIdx.y / np)*cpw)/ncols2;
+        const int j = fastmodulo(col_Q_0 + (jc0 + (threadIdx.y / np)*cpw)/ncols2, ne01);

 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
@ -736,7 +738,7 @@ static __global__ void flash_attn_tile(
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3   ne01, const int32_t ne02, const int32_t ne03,
                            const int32_t nb01, const int32_t nb02, const int32_t nb03,
        const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
                            const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -781,11 +783,11 @@ static __global__ void flash_attn_tile(
    const int sequence = blockIdx.z / (ne02/ncols2);
    const int head0 = blockIdx.z*ncols2 - sequence*ne02; // == blockIdx.z % (ne02/ncols2)
    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f  = (const float *) (Q + nb03*sequence + nb02* head0              + nb01*col_Q_0);
+    const float * Q_f  = (const float *) (Q + nb03*sequence + nb02* head0);
    const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
    const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape

-    const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33) + nb31*col_Q_0) : nullptr;
+    const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33)) : nullptr;

    const int stride_K2   = nb11 / sizeof(half2);
    const int stride_V2   = nb21 / sizeof(half2);
@ -842,11 +844,9 @@ static __global__ void flash_attn_tile(
        for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
            if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
                float tmp_f[cpy_ne_D] = {0.0f};
-                if (ncols1 == 1 || col_Q_0 + j < ne01) {
                ggml_cuda_memcpy_1<sizeof(tmp_f)>
-                        (tmp_f, &Q_f[c*(nb02/sizeof(float)) + j*(nb01/sizeof(float))
+                    (tmp_f, &Q_f[c*(nb02/sizeof(float)) + fastmodulo(col_Q_0 + j, ne01)*(nb01/sizeof(float))
                                 + i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
-                }

 #pragma unroll
                for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
@ -881,23 +881,23 @@ static __global__ void flash_attn_tile(
        while (k_VKQ_0 < k_VKQ_max - nbatch_fa) {
            constexpr bool oob_check = false;
            flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
            k_VKQ_0 += gridDim.y*nbatch_fa;
        }
        if (k_VKQ_0 < k_VKQ_max) {
            constexpr bool oob_check = true;
            flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
        }
    } else {
        // Branch without out-of-bounds checks.
        for (int k_VKQ_0 = blockIdx.y*nbatch_fa; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nbatch_fa) {
            constexpr bool oob_check = false;
            flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
        }
    }

@ -1010,13 +1010,13 @@ static __global__ void flash_attn_tile(
        const int j = jc / ncols2;
        const int c = jc % ncols2;

-        if (ncols1 > 1 && col_Q_0 + j >= ne01) {
+        if (ncols1 > 1 && col_Q_0 + j >= int(ne01.z)) {
            return;
        }

        const float scale = gridDim.y == 1 ? 1.0f/KQ_sum[jc0] : 1.0f;

-        const int j_dst_unrolled = ((sequence*ne01 + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;
+        const int j_dst_unrolled = ((sequence*int(ne01.z) + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;

 #ifdef FAST_FP16_AVAILABLE
        constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@ -33,7 +33,7 @@ static __global__ void flash_attn_ext_vec(
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3   ne01, const int32_t ne02, const int32_t ne03,
                            const int32_t nb01, const int32_t nb02, const int32_t nb03,
        const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
                            const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -150,7 +150,7 @@ static __global__ void flash_attn_ext_vec(
            float2 * tmp_q_ds  = (float2 *) (tmp_q_i32 + D/sizeof(int));

            // Set memory to zero if out of bounds:
-            if (ncols > 1 && ic0 + j >= ne01) {
+            if (ncols > 1 && ic0 + j >= int(ne01.z)) {
 #pragma unroll
                for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
                    const int i = i0 + threadIdx.x;
@ -201,7 +201,7 @@ static __global__ void flash_attn_ext_vec(
                const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;

                float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
-                if (ncols == 1 || ic0 + j < ne01) {
+                if (ncols == 1 || ic0 + j < int(ne01.z)) {
                    ggml_cuda_memcpy_1<cpy_nb>(tmp,            &Q_j[i]);
                    ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
                }
@ -222,7 +222,7 @@ static __global__ void flash_attn_ext_vec(
 #pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
                const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
-                if (ncols == 1 || ic0 + j < ne01) {
+                if (ncols == 1 || ic0 + j < int(ne01.z)) {
                    ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ],            &Q_j[i]);
                    ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
                }
@ -266,7 +266,7 @@ static __global__ void flash_attn_ext_vec(
                    sum = logit_softcap*tanhf(sum);
                }

-                if (mask) {
+                if (mask && (ncols == 1 || ic0 + j < int(ne01.z))) {
                    sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
                }

@ -412,7 +412,7 @@ static __global__ void flash_attn_ext_vec(

 #pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-        if (ncols > 1 && ic0 + j_VKQ >= ne01) {
+        if (ncols > 1 && ic0 + j_VKQ >= int(ne01.z)) {
            break;
        }

@ -479,7 +479,7 @@ static __global__ void flash_attn_ext_vec(
                if (gridDim.y == 1) {
                    dst_val /= KQ_sum[j_VKQ];
                }
-                dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
+                dst[(((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
            }
        }

@ -489,8 +489,8 @@ static __global__ void flash_attn_ext_vec(

    }

-    if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
-        dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < int(ne01.z))) {
+        dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
    }
 #else
    GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@ -38,14 +38,14 @@ static __global__ void flash_attn_ext_f16(
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3   ne01, const int32_t ne02, const int32_t ne03,
                            const int32_t nb01, const int32_t nb02, const int32_t nb03,
        const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
                            const int32_t nb11, const int32_t nb12, const int64_t nb13,
                            const int32_t nb21, const int32_t nb22, const int64_t nb23,
                            const int32_t ne31, const int32_t ne32, const int32_t ne33,
                            const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
+#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
@ -149,7 +149,7 @@ static __global__ void flash_attn_ext_f16(
            if (i0 + warp_size > D && i >= D) {
                break;
            }
-            KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
+            KQ[j*D_padded + i] = ic0 + j < int(ne01.z) ? Q_f[j*stride_Q + i] * scale : 0.0f;
        }
    }

@ -218,7 +218,8 @@ static __global__ void flash_attn_ext_f16(
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
                    const int k = k0 + threadIdx.x;

-                    KQ_f_tmp[k0/warp_size] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
+                    KQ_f_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ?
+                        __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                    KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
                }
                KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);
@ -270,7 +271,7 @@ static __global__ void flash_attn_ext_f16(
                for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
                    const int k = k0 + threadIdx.x;

-                    KQ2_tmp[k0/warp_size] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
+                    KQ2_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                    KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/warp_size]);
                }
                KQ_max_new = __half2half2(warp_reduce_max<warp_size>(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@ -431,7 +432,7 @@ static __global__ void flash_attn_ext_f16(
 #pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j_VKQ = j0 + threadIdx.y;
-        if (ic0 + j_VKQ >= ne01) {
+        if (ic0 + j_VKQ >= int(ne01.z)) {
            return;
        }

@ -442,7 +443,7 @@ static __global__ void flash_attn_ext_f16(
            KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
        }

-        const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
+        const int j_dst_unrolled = ((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;

 #pragma unroll
        for (int i0 = 0; i0 < D; i0 += warp_size) {
@ -481,7 +482,7 @@ static __global__ void flash_attn_ext_f16(
              ne31, ne32, ne33,
              nb31, nb32, nb33);
    NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
 }

 constexpr int get_max_power_of_2(int x) {
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
@ -2,9 +2,9 @@

 #include "common.cuh"

-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#if defined(GGML_USE_MUSA)
 #define GGML_USE_WMMA_FATTN
-#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#endif // defined(GGML_USE_MUSA)

 #if defined(GGML_HIP_ROCWMMA_FATTN)
 #if defined(CDNA) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@ -12,13 +12,13 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
    const ggml_tensor * Q = dst->src[0];

    if constexpr (ncols2 <= 8) {
-        if (Q->ne[1] <= 8/ncols2) {
+        if (turing_mma_available(cc) && Q->ne[1] <= 8/ncols2) {
            ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
            return;
        }
    }

-    if (Q->ne[1] <= 16/ncols2) {
+    if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
        return;
    }
@ -41,7 +41,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
    float max_bias = 0.0f;
    memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

-    const bool use_gqa_opt = mask && max_bias == 0.0f;
+    const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;

    GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
    const int gqa_ratio = Q->ne[2] / K->ne[2];
@ -275,8 +275,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
    // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;

-    // If Turing tensor cores available, use them:
-    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
+    // If Turing tensor cores are available, use them:
+    if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
        if (can_use_vector_kernel) {
            if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
                if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@ -297,7 +297,21 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                return BEST_FATTN_KERNEL_VEC;
            }
        }
+        return BEST_FATTN_KERNEL_MMA_F16;
+    }

+    if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
+        int gqa_ratio_eff = 1;
+        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
+        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
+            gqa_ratio_eff *= 2;
+        }
+        if (can_use_vector_kernel && Q->ne[1] * gqa_ratio_eff <= 2) {
+            return BEST_FATTN_KERNEL_VEC;
+        }
+        if (Q->ne[1] * gqa_ratio_eff <= 16) {
+            return BEST_FATTN_KERNEL_TILE; // On Volta tensor cores are only faster for sufficiently large matrices.
+        }
        return BEST_FATTN_KERNEL_MMA_F16;
    }

--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@ -68,10 +68,31 @@ static __device__ __forceinline__ half2 ggml_cuda_movmatrix(const half2 x) {

 namespace ggml_cuda_mma {

+    // Some architectures like Volta or CDNA3 perform multiple matrix multiplications per warp in parallel,
+    //     effectively the warp is being split into subgroups of threads that each perform a single mma instruction.
+    // In those cases the data can be split in different ways across the warp.
+    enum data_layout {
+        // By default the data uses the I direction as its major dimension and the J direction as its minor dimension.
+        // For the A/C matrices this means I major == row major, J major == column major.
+        // For the B matrix this means I major == column major, J major == row major.
+        // MIRRORED == Each data value is held exactly once per thread subgroup.
+        DATA_LAYOUT_I_MAJOR           =  0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell.
+        DATA_LAYOUT_I_MAJOR_MIRRORED  = 10,
+        DATA_LAYOUT_J_MAJOR_MIRRORED  = 20,
+    };
+    // Implemented mma combinations are:
+    //   - (I_MAJOR, I_MAJOR)          -> I_MAJOR
+    //   - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
+    //   - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
+
+    template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
+    struct tile {};
+
    template <int I_, int J_, typename T>
-    struct tile {
+    struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR> {
        static constexpr int         I  = I_;
        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;

 #if defined(AMD_MFMA_AVAILABLE)
        static constexpr int ne = I * J / 64;
@ -131,9 +152,9 @@ namespace ggml_cuda_mma {
        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 32 && J == 8) {
 #ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
-                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (l & 2) | (threadIdx.x % 2);
+                return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (l & 2) + (threadIdx.x % 2);
 #else
-                return (l & 2) | (threadIdx.x & ~2);
+                return (l & 2) + (threadIdx.x & ~2);
 #endif // GGML_CUDA_MMA_NO_VOLTA_PERM
            } else {
                NO_DEVICE_CODE;
@ -143,7 +164,7 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 32 && J == 8) {
-                return (threadIdx.x & 2) | (l & (4 + 1));
+                return (threadIdx.x & 2) + (l & (4 + 1));
            } else {
                NO_DEVICE_CODE;
                return -1;
@ -196,9 +217,9 @@ namespace ggml_cuda_mma {
            } else if constexpr (I == 8 && J == 8) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 8) | (threadIdx.x / 4);
+                return ((l / 2) * 8) + (threadIdx.x / 4);
            } else if constexpr (I == 16 && J == 16) {
-                return (((l / 2) % 2) * 8) | (threadIdx.x / 4);
+                return (((l / 2) % 2) * 8) + (threadIdx.x / 4);
            } else if constexpr (I == 32 && J == 8) {
                return tile<16, 8, T>::get_i(l); // Memory layout simply repeated with same pattern in i direction.
            } else {
@ -211,11 +232,11 @@ namespace ggml_cuda_mma {
            if constexpr (I == 8 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
            } else if constexpr (I == 16 && J == 8) {
-                return ((threadIdx.x % 4) * 2) | (l % 2);
+                return ((threadIdx.x % 4) * 2) + (l % 2);
            } else if constexpr (I == 16 && J == 16) {
-                return ((l / 4) * 8) | ((threadIdx.x % 4) * 2) | (l % 2);
+                return ((l / 4) * 8) + ((threadIdx.x % 4) * 2) + (l % 2);
            } else if constexpr (I == 32 && J == 8) {
                return tile<16, 8, T>::get_j(l); // Memory layout simply repeated with same pattern in i direction.
            } else {
@ -227,26 +248,24 @@ namespace ggml_cuda_mma {
    };

    template <int I_, int J_>
-    struct tile<I_, J_, half2> {
+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR> {
        static constexpr int         I  = I_;
        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;

 #if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-        static constexpr int ne = I == 8 && J == 8 ? I * J / (WARP_SIZE/4) : I * J / WARP_SIZE;
+        static constexpr int ne = I * J / WARP_SIZE;
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
-            if (I ==  8 && J ==  8) return true;
-            if (I == 32 && J ==  8) return true;
+            if (I == 32 && J ==  4) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
-            if constexpr (I == 8 && J == 8) {
-                return ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
-            } else if constexpr (I == 32 && J == 8) {
+            if constexpr (I == 32 && J == 4) {
 #ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
-                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
+                return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
 #else
                return threadIdx.x;
 #endif // GGML_CUDA_MMA_NO_VOLTA_PERM
@ -257,7 +276,7 @@ namespace ggml_cuda_mma {
        }

        static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr ((I == 8 || I == 32) && J == 8) {
+            if constexpr (I == 32 && J == 4) {
                return l;
            } else {
                NO_DEVICE_CODE;
@ -307,11 +326,11 @@ namespace ggml_cuda_mma {
            if constexpr (I == 8 && J == 8) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 4) {
-                return (l * 8) | (threadIdx.x / 4);
+                return (l * 8) + (threadIdx.x / 4);
            } else if constexpr (I == 16 && J == 8) {
-                return ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l % 2) * 8) + (threadIdx.x / 4);
            } else if constexpr (I == 32 && J == 8) {
-                return ((l / 4) * 16) | ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l / 4) * 16) + ((l % 2) * 8) + (threadIdx.x / 4);
            } else {
                NO_DEVICE_CODE;
                return -1;
@ -320,13 +339,13 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
            } else if constexpr (I == 16 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 4) | (threadIdx.x % 4);
+                return ((l / 2) * 4) + (threadIdx.x % 4);
            } else if constexpr (I == 32 && J == 8) {
-                return ((l & 2) * 2) | (threadIdx.x % 4);
+                return ((l & 2) * 2) + (threadIdx.x % 4);
            } else {
                NO_DEVICE_CODE;
                return -1;
@ -336,14 +355,15 @@ namespace ggml_cuda_mma {
    };

    template <int I_, int J_>
-    struct tile<I_, J_, nv_bfloat162> {
+    struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR> {
        static constexpr int         I  = I_;
        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
+        static constexpr int         ne = I * J / WARP_SIZE;

-#if defined(AMD_WMMA_AVAILABLE)
-        static constexpr int ne = I * J / 32;
        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

+#if defined(AMD_WMMA_AVAILABLE)
        static constexpr __device__ bool supported() {
            if (I == 16 && J == 8) return true;
            return false;
@ -367,9 +387,6 @@ namespace ggml_cuda_mma {
            }
        }
 #else
-        static constexpr int ne = I * J / WARP_SIZE;
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
-
        static constexpr __device__ bool supported() {
            if (I ==  8 && J ==  8) return true;
            if (I == 16 && J ==  4) return true;
@ -381,9 +398,9 @@ namespace ggml_cuda_mma {
            if constexpr (I == 8 && J == 8) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 4) {
-                return (l * 8) | (threadIdx.x / 4);
+                return (l * 8) + (threadIdx.x / 4);
            } else if constexpr (I == 16 && J == 8) {
-                return ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l % 2) * 8) + (threadIdx.x / 4);
            } else {
                NO_DEVICE_CODE;
                return -1;
@ -392,11 +409,11 @@ namespace ggml_cuda_mma {

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
            } else if constexpr (I == 16 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 4) | (threadIdx.x % 4);
+                return ((l / 2) * 4) + (threadIdx.x % 4);
            } else {
                NO_DEVICE_CODE;
                return -1;
@ -405,6 +422,73 @@ namespace ggml_cuda_mma {
 #endif  // defined(AMD_WMMA_AVAILABLE)
    };

+    template <int I_, int J_>
+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
+        static constexpr int         I  = I_;
+        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
+        static constexpr int         ne = I * J / (WARP_SIZE/4);
+
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            if (I ==  8 && J ==  4) return true;
+            return false;
+        }
+
+        static __device__ __forceinline__ int get_i(const int /*l*/) {
+            if constexpr (I == 8 && J == 4) {
+                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return l;
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+    };
+
+    template <int I_, int J_>
+    struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
+        static constexpr int         I  = I_;
+        static constexpr int         J  = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
+        static constexpr int         ne = I * J / (WARP_SIZE/4);
+
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            if (I ==  8 && J ==  4) return true;
+            return false;
+        }
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return ((l / 2) * 4) + (threadIdx.x % 4);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return ((threadIdx.x / 16) * 2) + (l % 2);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+    };
+
+#if defined(TURING_MMA_AVAILABLE)
    template <int I, int J>
    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
        tile<I, J/2, half2> ret;
@ -422,9 +506,26 @@ namespace ggml_cuda_mma {

        return ret;
    }
+#else // Volta
+    template <int I, int J>
+    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
+        tile<I, J/2, half2> ret;
+#pragma unroll
+        for (int l0 = 0; l0 < tile_float.ne; l0 += 4) {
+            ret.x[l0/2 + 0] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
+            ret.x[l0/2 + 1] = make_half2(tile_float.x[l0 + 2], tile_float.x[l0 + 3]);

-    template <int I, int J, typename T>
-    static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+            // On Volta FP16 and FP32 tiles have a different memory layout,
+            //     for the conversion threads with an offset of 2 need to exchange half their values:
+            ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)] = __shfl_xor_sync(
+                0xFFFFFFFF, ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)], 2, WARP_SIZE);
+        }
+        return ret;
+    }
+#endif // defined(TURING_MMA_AVAILABLE)
+
+    template <int I, int J, typename T, data_layout dl>
+    static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
 #if defined(AMD_MFMA_AVAILABLE)
        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
 #pragma unroll
@ -511,18 +612,6 @@ namespace ggml_cuda_mma {
            : "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
            : "l"(xs));
 #else
-#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-        GGML_UNUSED_VARS(t, xs0, stride);
-        NO_DEVICE_CODE;
-#else
-        load_generic(t, xs0, stride);
-#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-#endif // TURING_MMA_AVAILABLE
-    }
-
-    template <typename T>
-    static __device__ __forceinline__ void load_ldmatrix(
-            tile<32, 8, T> & t, const T * __restrict__ xs0, const int stride) {
 #if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 #if 1
        // TODO: more generic handling
@ -533,9 +622,31 @@ namespace ggml_cuda_mma {
        load_generic(t, xs0, stride);
 #endif // 1
 #else
-        tile<16, 8, T> * t16 = (tile<16, 8, T> *) &t;
-        load_ldmatrix(t16[0], xs0 +  0*stride, stride);
-        load_ldmatrix(t16[1], xs0 + 16*stride, stride);
+        load_generic(t, xs0, stride);
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#endif // TURING_MMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
+        ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
+#pragma unroll
+        for (int l0 = 0; l0 < t.ne; l0 += 2) {
+            ggml_cuda_memcpy_1<2*sizeof(half2)>(t.x + l0, xs0 + t.get_i(l0)*stride + t.get_j(l0));
+        }
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+        ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
+#else
+        GGML_UNUSED_VARS(t, xs0, stride);
+        NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
    }

@ -860,14 +971,14 @@ namespace ggml_cuda_mma {
    template <typename T1, typename T2, int J, int K>
    static __device__ __forceinline__ void mma(
            tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
-        tile<16, J, T1> * D16 = (tile<16, J, T1> *) &D;
-        tile<16, K, T2> * A16 = (tile<16, K, T2> *) &A;
+        tile      <16, J, T1> * D16 = reinterpret_cast<      tile<16, J, T1> *>(&D);
+        const tile<16, K, T2> * A16 = reinterpret_cast<const tile<16, K, T2> *>(&A);
        mma(D16[0], A16[0], B);
        mma(D16[1], A16[1], B);
    }

    static __device__ __forceinline__ void mma(
-            tile<32, 8, float> & D, const tile<32, 8, half2> & A, const tile<8, 8, half2> & B) {
+            tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
 #if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
@ -880,20 +991,30 @@ namespace ggml_cuda_mma {
            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
-        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
-            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
-            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
-            : "r"(Axi[4]), "r"(Axi[5]), "r"(Bxi[4]), "r"(Bxi[5]));
-        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
-            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
-            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
-            : "r"(Axi[6]), "r"(Axi[7]), "r"(Bxi[6]), "r"(Bxi[7]));
 #else
-        tile      <16, 8, float> * D16 = reinterpret_cast<tile      <16, 8, float> *>(&D);
-        const tile<16, 8, half2> * A16 = reinterpret_cast<const tile<16, 8, half2> *>(&A);
-        mma(D16[0], A16[0], B);
-        mma(D16[1], A16[1], B);
-#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+    }
+
+    static __device__ __forceinline__ void mma(
+            tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+        const int * Axi = (const int *) A.x;
+        const int * Bxi = (const int *) B.x;
+        int       * Dxi = (int       *) D.x;
+        asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
+            "{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
+            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
+            : "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]), "r"(Bxi[1]));
+        asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
+            "{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
+            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
+            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
+#else
+        GGML_UNUSED_VARS(D, A, B);
+        NO_DEVICE_CODE;
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
    }

 static __device__ __forceinline__ void mma(
--- a/ggml/src/ggml-cuda/mmf.cuh
+++ b/ggml/src/ggml-cuda/mmf.cuh
@ -37,23 +37,19 @@ static __global__ void mul_mat_f(
    typedef tile<16,       8, T>     tile_A;
    typedef tile<tile_B_I, 8, T>     tile_B;
    typedef tile<16,       tile_C_J, float> tile_C;
-
-    constexpr bool a_supported = tile_A::supported();
-    constexpr bool b_supported = tile_B::supported();
-    constexpr bool c_supported = tile_C::supported();
-    constexpr bool supported = a_supported && b_supported && c_supported;
 #else
-    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
-    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
-    constexpr bool supported = I_16_supported || I_32_supported;
-
-    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
-
-    typedef tile<I_preferred, 8, T>     tile_A;
+#ifdef VOLTA_MMA_AVAILABLE
+    if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
+    typedef tile<32, 4, T,     DATA_LAYOUT_I_MAJOR>          tile_A;
+    typedef tile< 8, 4, T,     DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
+    typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR>          tile_C;
+#else
+    typedef tile<16, 8, T>     tile_A;
    typedef tile<8,  8, T>     tile_B;
-    typedef tile<I_preferred, 8, float> tile_C;
+    typedef tile<16, 8, float> tile_C;
+#endif // VOLTA_MMA_AVAILABLE
 #endif // defined(AMD_WMMA_AVAILABLE)
-    if constexpr (!supported) {
+    if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
        NO_DEVICE_CODE;
        return;
    }
@ -248,6 +244,9 @@ static __global__ void mul_mat_f(
            }
        }
    }
+#ifdef VOLTA_MMA_AVAILABLE
+    }
+#endif //VOLTA_MMA_AVAILABLE
 #else
    GGML_UNUSED_VARS(x, y, ids, dst,
        ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
@ -278,27 +277,24 @@ static __global__ void mul_mat_f_ids(
    typedef tile<16,       8, T>     tile_A;
    typedef tile<tile_B_I, 8, T>     tile_B;
    typedef tile<16,       tile_C_J, float> tile_C;
-
-    constexpr bool a_supported = tile_A::supported();
-    constexpr bool b_supported = tile_B::supported();
-    constexpr bool c_supported = tile_C::supported();
-    constexpr bool supported = a_supported && b_supported && c_supported;
 #else
-    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
-    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
-    constexpr bool supported = I_16_supported || I_32_supported;
-
-    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
-
-    typedef tile<I_preferred, 8, T>     tile_A;
+#ifdef VOLTA_MMA_AVAILABLE
+    if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
+    typedef tile<32, 4, T,     DATA_LAYOUT_I_MAJOR>          tile_A;
+    typedef tile< 8, 4, T,     DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
+    typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR>          tile_C;
+#else
+    typedef tile<16, 8, T>     tile_A;
    typedef tile<8,  8, T>     tile_B;
-    typedef tile<I_preferred, 8, float> tile_C;
+    typedef tile<16, 8, float> tile_C;
+#endif // VOLTA_MMA_AVAILABLE
 #endif // defined(AMD_WMMA_AVAILABLE)
-    if constexpr (!supported) {
+    if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
        NO_DEVICE_CODE;
        return;
    }

+
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int tile_k_padded = warp_size + 4;
    constexpr int ntA = rows_per_block / tile_A::I;
@ -517,6 +513,9 @@ static __global__ void mul_mat_f_ids(
            }
        }
    }
+#ifdef VOLTA_MMA_AVAILABLE
+    }
+#endif // VOLTA_MMA_AVAILABLE
 #else
    GGML_UNUSED_VARS(x, y, ids_src_compact, ids_dst_compact, expert_bounds, dst,
        ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@ -146,6 +146,8 @@ struct ggml_metal_library {
    id<MTLDevice> device;

    ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
+
+    NSLock * lock;
 };

 ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
@ -299,6 +301,7 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
    res->obj       = library;
    res->device    = device;
    res->pipelines = ggml_metal_pipelines_init();
+    res->lock      = [NSLock new];

    return res;
 }
@ -365,6 +368,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
    res->obj       = library;
    res->device    = device;
    res->pipelines = ggml_metal_pipelines_init();
+    res->lock      = [NSLock new];

    return res;
 }
@ -380,20 +384,27 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {

    ggml_metal_pipelines_free(lib->pipelines);

+    [lib->lock release];
+
    free(lib);
 }

 ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
-    return ggml_metal_pipelines_get(lib->pipelines, name);
+    [lib->lock lock];
+
+    ggml_metal_pipeline_t res = ggml_metal_pipelines_get(lib->pipelines, name);
+
+    [lib->lock unlock];
+
+    return res;
 }

 ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
-    // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
-    ggml_critical_section_start();
+    [lib->lock lock];

-    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
+    ggml_metal_pipeline_t res = ggml_metal_pipelines_get(lib->pipelines, name);
    if (res) {
-        ggml_critical_section_end();
+        [lib->lock unlock];

        return res;
    }
@ -414,7 +425,7 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
            mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
        }
        if (!mtl_function) {
-            ggml_critical_section_end();
+            [lib->lock unlock];

            GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
            if (error) {
@ -433,7 +444,7 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
                (int) res->obj.threadExecutionWidth);

        if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
-            ggml_critical_section_end();
+            [lib->lock unlock];

            GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);

@ -443,7 +454,7 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
        ggml_metal_pipelines_add(lib->pipelines, name, res);
    }

-    ggml_critical_section_end();
+    [lib->lock unlock];

    return res;
 }
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -1227,6 +1227,7 @@ struct vk_op_topk_push_constants {
    uint32_t orig_ncols;
    uint32_t ncols_input;
    uint32_t ncols_output;
+    uint32_t k;
    uint32_t nrows;
    uint32_t first_pass;
    uint32_t last_pass;
@ -1673,6 +1674,14 @@ class vk_perf_logger {
            timings[name.str()].push_back(time);
            return;
        }
+        if (node->op == GGML_OP_TOP_K) {
+            std::stringstream name;
+            name << ggml_op_name(node->op) <<
+                " K=" << node->ne[0] <<
+                " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
+            timings[name.str()].push_back(time);
+            return;
+        }
        timings[ggml_op_name(node->op)].push_back(time);
    }
  private:
@ -10345,17 +10354,8 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
    uint32_t nrows = ggml_nrows(src0);
    uint32_t k = dst->ne[0];

-    vk_op_topk_push_constants pc { ncols, ncols, k, nrows, 0, 0 };
+    vk_op_topk_push_constants pc { ncols, ncols, ncols, k, nrows, 0, 0 };

-    // Reserve space for ivec2 per element, double buffered
-    const size_t dbl_buf_size = size_t{ncols} * nrows * 2 * sizeof(int);
-    const size_t x_sz = dbl_buf_size * 2;
-    uint32_t dbl_buf_index = 0;
-
-    if (ctx->prealloc_size_x < x_sz) {
-        ctx->prealloc_size_x = x_sz;
-        ggml_vk_preallocate_buffers(ctx, subctx);
-    }
    if (ctx->prealloc_x_need_sync) {
        ggml_vk_sync_buffers(ctx, subctx);
    }
@ -10370,8 +10370,9 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
    // largest elements. Repeat until we have the top K elements.
    // Need to do at least one iteration to write out the results.
    bool done_one_iter = false;
+    uint32_t dbl_buf_index = 0;
+    size_t dbl_buf_size;
    while (num_elements > k || !done_one_iter) {
-        done_one_iter = true;

        // Prefer going as small as num_topk_pipelines - 3 for perf reasons.
        // But if K is larger, then we need a larger workgroup
@ -10411,6 +10412,21 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
        // Number of elements remaining after this pass
        uint32_t num_dst_elements = (num_elements / pipeline->wg_denoms[0]) * k + std::min(k, num_elements % pipeline->wg_denoms[0]);

+        pc2.ncols_output = num_dst_elements;
+
+        if (!done_one_iter) {
+            // Reserve space for ivec2 per element, double buffered
+            // K per workgroup per row
+            dbl_buf_size = num_dst_elements * nrows * 2 * sizeof(int);
+            dbl_buf_size = ROUNDUP_POW2(dbl_buf_size, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+            const size_t x_sz = dbl_buf_size * 2;
+
+            if (ctx->prealloc_size_x < x_sz) {
+                ctx->prealloc_size_x = x_sz;
+                ggml_vk_preallocate_buffers(ctx, subctx);
+            }
+        }
+
        vk_subbuffer src_buf;
        vk_subbuffer dst_buf;

@ -10436,6 +10452,7 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
        if (num_elements > k) {
            ggml_vk_sync_buffers(ctx, subctx);
        }
+        done_one_iter = true;
    }
    ctx->prealloc_x_need_sync = true;
 }
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_argsort.comp
@ -19,6 +19,7 @@ layout (push_constant) uniform parameter {
    uint orig_ncols;
    uint ncols_input;
    uint ncols_output;
+    uint k;
    uint nrows;
    uint first_pass;
    uint last_pass;
@ -36,7 +37,7 @@ void topk(bool needs_bounds_check, const uint row) {
            const uint row_offset = row * p.ncols_input;
            dst_row[col] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
        } else {
-            const uint row_offset = row * p.orig_ncols;
+            const uint row_offset = row * p.ncols_input;
            dst_row[col] = data_s[row_offset + gl_GlobalInvocationID.x];
        }
    } else {
@ -44,7 +45,7 @@ void topk(bool needs_bounds_check, const uint row) {
    }
    barrier();

-    if (p.ncols_output == 1) {
+    if (p.k == 1) {
        // Fast path for single output - just do a max reduction
        [[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
            if (col < s) {
@ -84,16 +85,20 @@ void topk(bool needs_bounds_check, const uint row) {
        }
    }

-    if (col < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
+    if (col < p.k) {
        if (p.last_pass != 0) {
-            const uint row_offset = row * p.ncols_output;
+            if (gl_GlobalInvocationID.x < p.ncols_input) {
+                const uint row_offset = row * p.k;
                data_d[row_offset + col] = dst_row[col].x;
+            }
        } else {
-            const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
+            if (gl_WorkGroupID.x * p.k + col < p.ncols_output) {
+                const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
                data_t[row_offset + col] = dst_row[col];
            }
        }
    }
+}

 void main() {
    // Fast path for fully occupied workgroups
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_nary_search.comp
@ -25,6 +25,7 @@ layout (push_constant) uniform parameter {
    uint orig_ncols;
    uint ncols_input;
    uint ncols_output;
+    uint k;
    uint nrows;
    uint first_pass;
    uint last_pass;
@ -60,7 +61,7 @@ void topk(const uint row) {
            const uint row_offset = row * p.ncols_input;
            dst_row[tid] = ivec2(gl_GlobalInvocationID.x, floatBitsToInt(data_a[row_offset + gl_GlobalInvocationID.x]));
        } else {
-            const uint row_offset = row * p.orig_ncols;
+            const uint row_offset = row * p.ncols_input;
            dst_row[tid] = data_s[row_offset + gl_GlobalInvocationID.x];
        }
    } else {
@ -68,7 +69,7 @@ void topk(const uint row) {
    }
    barrier();

-    if (p.ncols_output == 1) {
+    if (p.k == 1) {
        // Fast path for single output - just do a max reduction
        [[unroll]] for (int s = BLOCK_SIZE / 2; s >= 1; s /= 2) {
            if (tid < s) {
@ -98,7 +99,7 @@ void topk(const uint row) {
        uint range_max = 0xFF800000;
        // How many are above the current range, and how many we need to find.
        uint total = 0;
-        uint limit = min(p.ncols_output, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);
+        uint limit = min(p.k, p.ncols_input - gl_WorkGroupID.x * BLOCK_SIZE);

        while (mask != 0) {
            barrier();
@ -139,7 +140,7 @@ void topk(const uint row) {
            range_max = range_min + ((min_idx + 1) << shift);
            range_min = range_min + (min_idx << shift);

-            if (total == p.ncols_output) {
+            if (total == p.k) {
                break;
            }
            total -= counts[min_idx];
@ -179,16 +180,20 @@ void topk(const uint row) {
        barrier();
    }

-    if (tid < p.ncols_output && gl_GlobalInvocationID.x < p.orig_ncols) {
+    if (tid < p.k) {
        if (p.last_pass != 0) {
-            const uint row_offset = row * p.ncols_output;
+            if (gl_GlobalInvocationID.x < p.ncols_input) {
+                const uint row_offset = row * p.k;
                data_d[row_offset + tid] = dst_row[tid].x;
+            }
        } else {
-            const uint row_offset = row * p.orig_ncols + gl_WorkGroupID.x * p.ncols_output;
+            if (gl_WorkGroupID.x * p.k + tid < p.ncols_output) {
+                const uint row_offset = row * p.ncols_output + gl_WorkGroupID.x * p.k;
                data_t[row_offset + tid] = dst_row[tid];
            }
        }
    }
+}

 void main() {
    uint row = gl_WorkGroupID.y;
--- a/ggml/src/ggml-webgpu/CMakeLists.txt
+++ b/ggml/src/ggml-webgpu/CMakeLists.txt
@ -39,8 +39,23 @@ add_dependencies(ggml-webgpu generate_shaders)
 if(EMSCRIPTEN)
    set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg")

+    if(NOT EMDAWNWEBGPU_DIR)
+        # default built-in port
+        target_compile_options(ggml-webgpu PRIVATE "--use-port=emdawnwebgpu")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=emdawnwebgpu")
+    else()
+        # custom port
        target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
-    target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+        target_link_options(ggml-webgpu INTERFACE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py")
+    endif()
+
+    if (GGML_WEBGPU_JSPI)
+        target_compile_options(ggml-webgpu PRIVATE "-fwasm-exceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sJSPI" "-fwasm-exceptions")
+    else()
+        target_compile_options(ggml-webgpu PRIVATE "-fexceptions")
+        target_link_options(ggml-webgpu INTERFACE "-sASYNCIFY" "-exceptions")
+    endif()
 else()
    find_package(Dawn REQUIRED)
    set(DawnWebGPU_TARGET dawn::webgpu_dawn)
@ -48,6 +63,9 @@ endif()

 if (GGML_WEBGPU_DEBUG)
    target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1)
+    if(EMSCRIPTEN)
+        target_link_options(ggml-webgpu INTERFACE "-sASSERTIONS=2")
+    endif()
 endif()

 if (GGML_WEBGPU_CPU_PROFILE)
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@ -9,6 +9,10 @@
 #include "ggml-impl.h"
 #include "ggml-wgsl-shaders.hpp"

+#ifdef __EMSCRIPTEN__
+#    include <emscripten/emscripten.h>
+#endif
+
 #include <webgpu/webgpu_cpp.h>

 #include <atomic>
@ -261,9 +265,12 @@ struct webgpu_context_struct {
    wgpu::Queue    queue;
    wgpu::Limits   limits;

-    bool                       supports_subgroup_matrix = false;
    uint32_t subgroup_size;
+
+#ifndef __EMSCRIPTEN__
+    bool                       supports_subgroup_matrix = false;
    wgpu::SubgroupMatrixConfig subgroup_matrix_config;
+#endif

    // Separate this out from limits since on some Metal systems, the limit returned by
    // querying the limits is higher than the actual allowed maximum.
@ -449,8 +456,8 @@ static void ggml_backend_webgpu_wait(webgpu_context &                         ct
    // If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
    // inflight_max may be 0, meaning that we must wait on all futures.
    uint64_t timeout_ms       = block ? UINT64_MAX : 0;
-    uint     inflight_threads = ctx->inflight_threads;
-    uint     inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
+    uint32_t inflight_threads = ctx->inflight_threads;
+    uint32_t inflight_max     = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
    while (futures.size() >= inflight_max && futures.size() > 0) {
        ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
        futures.erase(futures.begin());
@ -986,6 +993,7 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
            pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
            uint32_t wg_m;
            uint32_t wg_n;
+#ifndef __EMSCRIPTEN__
            if (ctx->supports_subgroup_matrix) {
                // The total number of subgroups/workgroups needed per matrix.
                uint32_t wg_m_sg_tile =
@ -995,11 +1003,15 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
                    WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
                wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
            } else {
+#endif
                uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
                uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
                wg_m              = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
                wg_n              = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
+#ifndef __EMSCRIPTEN__
            }
+#endif
+
            wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
        }
    }
@ -1419,8 +1431,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
            commands.push_back(*cmd);
        }
        // compute the batch size based on the number of inflight threads
-        uint inflight_threads = ctx->inflight_threads;
-        uint batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
+        uint32_t inflight_threads = ctx->inflight_threads;
+        uint32_t batch_size       = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
                                             WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
        if (commands.size() >= batch_size) {
            futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
@ -1758,6 +1770,17 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
                                wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");

+    std::string proc_mul_mat_f32_f32;
+    std::string proc_mul_mat_f32_f32_vec;
+    std::string proc_mul_mat_f16_f32;
+    std::string proc_mul_mat_f16_f32_vec;
+    std::string proc_mul_mat_f16_f16;
+    std::string proc_mul_mat_f16_f16_vec;
+    std::string proc_mul_mat_q4_0_f32;
+    std::string proc_mul_mat_q4_0_f32_vec;
+
+    std::vector<wgpu::ConstantEntry> mul_mat_constants;
+#ifndef __EMSCRIPTEN__
    if (webgpu_ctx->supports_subgroup_matrix) {
        std::map<std::string, std::string> sg_matrix_repls;
        sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
@ -1770,100 +1793,57 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
        sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
        sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"]     = std::to_string(webgpu_ctx->subgroup_matrix_config.K);

-        std::string proc_mul_mat_subgroup_matrix_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
+        proc_mul_mat_f32_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
+        proc_mul_mat_f32_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
+        proc_mul_mat_f16_f32 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
+        proc_mul_mat_f16_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
+        proc_mul_mat_f16_f16 = ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
+        proc_mul_mat_f16_f16_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
+        proc_mul_mat_q4_0_f32 =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
-        std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
+        proc_mul_mat_q4_0_f32_vec =
            ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f32_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f32_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_f16_f16_vec");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
-            webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
-                                         "mul_mat_subgroup_matrix_q4_0_f32_vec");
    } else {
-        std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
-        mul_mat_reg_tile_constants[0].key   = "TILE_K";
-        mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
-        mul_mat_reg_tile_constants[1].key   = "WORKGROUP_SIZE_M";
-        mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
-        mul_mat_reg_tile_constants[2].key   = "WORKGROUP_SIZE_N";
-        mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
+#endif
+        mul_mat_constants.push_back({ .key = "TILE_K", .value = WEBGPU_MUL_MAT_TILE_K });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_M", .value = WEBGPU_MUL_MAT_WG_SIZE_M });
+        mul_mat_constants.push_back({ .key = "WORKGROUP_SIZE_N", .value = WEBGPU_MUL_MAT_WG_SIZE_N });

        std::map<std::string, std::string> reg_repls;
        reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
        reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);

-        // Process each reg-tile shader with tile replacements.
-        // Keep the processed strings in-scope so .c_str() remains valid.
-        std::string proc_mul_mat_reg_tile_f32_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f32_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
-        std::string proc_mul_mat_reg_tile_f16_f16_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32 =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
-        std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
-            ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
-
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
-                                         "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
-                                         "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
-                                         "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
-                                         "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
-        webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
-            ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
-                                         "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
+        proc_mul_mat_f32_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
+        proc_mul_mat_f32_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
+        proc_mul_mat_f16_f32      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
+        proc_mul_mat_f16_f32_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
+        proc_mul_mat_f16_f16      = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
+        proc_mul_mat_f16_f16_vec  = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
+        proc_mul_mat_q4_0_f32     = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
+        proc_mul_mat_q4_0_f32_vec = ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
+#ifndef __EMSCRIPTEN__
    }
+#endif
+
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32.c_str(), "mul_mat_f32_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f32_f32_vec.c_str(), "mul_mat_f32_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32.c_str(), "mul_mat_f16_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f32_vec.c_str(), "mul_mat_f16_f32_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16.c_str(), "mul_mat_f16_f16", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_f16_f16_vec.c_str(), "mul_mat_f16_f16_vec", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32.c_str(), "mul_mat_q4_0_f32", mul_mat_constants);
+    webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
+        webgpu_ctx->device, proc_mul_mat_q4_0_f32_vec.c_str(), "mul_mat_q4_0_f32_vec", mul_mat_constants);

    std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
    mul_mat_vec_constants[0].key   = "WORKGROUP_SIZE";
@ -2384,13 +2364,17 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t

    webgpu_context ctx = reg_ctx->webgpu_ctx;

+    wgpu::RequestAdapterOptions options = {};
+
+#ifndef __EMSCRIPTEN__
    // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
    const char * const          adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
    wgpu::DawnTogglesDescriptor adapterTogglesDesc;
    adapterTogglesDesc.enabledToggles     = adapterEnabledToggles;
    adapterTogglesDesc.enabledToggleCount = 2;
-    wgpu::RequestAdapterOptions options   = {};
    options.nextInChain                   = &adapterTogglesDesc;
+#endif
+
    ctx->instance.WaitAny(ctx->instance.RequestAdapter(
                              &options, wgpu::CallbackMode::AllowSpontaneous,
                              [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
@ -2407,10 +2391,12 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
    ctx->max_wg_size_x = 288;  // default value

    wgpu::AdapterInfo info{};
+#ifndef __EMSCRIPTEN__
    wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
    if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
        info.nextInChain = &subgroup_matrix_configs;
    }
+#endif
    ctx->adapter.GetInfo(&info);

    wgpu::SupportedFeatures features;
@ -2418,6 +2404,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
    // we require f16 support
    GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));

+#ifndef __EMSCRIPTEN__
    // Only support square f16 matrices of size 8 or 16 for now
    bool valid_subgroup_matrix_config = false;
    if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
@ -2433,36 +2420,27 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
        }
    }

+    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+#endif
    // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
    // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
    ctx->subgroup_size = info.subgroupMaxSize;
-    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;

    // Initialize device
-    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
-                                                         wgpu::FeatureName::ImplicitDeviceSynchronization };
+    std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16 };
+
+#ifndef __EMSCRIPTEN__
+    required_features.push_back(wgpu::FeatureName::ImplicitDeviceSynchronization);
    if (ctx->supports_subgroup_matrix) {
        required_features.push_back(wgpu::FeatureName::Subgroups);
        required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
    }
+#endif

 #ifdef GGML_WEBGPU_GPU_PROFILE
    required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif

-    // Enable Dawn-specific toggles to increase native performance
-    // TODO: Don't enable for WASM builds, they won't have an effect anyways
-    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
-    //       only for native performance?
-    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
-                                                   "disable_polyfills_on_integer_div_and_mod" };
-    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
-    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
-    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
-    deviceTogglesDesc.enabledToggleCount  = 4;
-    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
-    deviceTogglesDesc.disabledToggleCount = 1;
-
    wgpu::DeviceDescriptor dev_desc;
    dev_desc.requiredLimits       = &ctx->limits;
    dev_desc.requiredFeatures     = required_features.data();
@ -2480,7 +2458,23 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
            GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
                       std::string(message).c_str());
        });
+
+#ifndef __EMSCRIPTEN__
+    // Enable Dawn-specific toggles to increase native performance
+    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
+    //       only for native performance?
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
+    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
+    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
+    deviceTogglesDesc.enabledToggleCount  = 4;
+    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
+    deviceTogglesDesc.disabledToggleCount = 1;
+
    dev_desc.nextInChain = &deviceTogglesDesc;
+#endif
+
    ctx->instance.WaitAny(ctx->adapter.RequestDevice(
                              &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
                              [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
@ -2578,18 +2572,27 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
    ctx.name         = GGML_WEBGPU_NAME;
    ctx.device_count = 1;

-    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
-
-    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
-    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
-    instanceTogglesDesc.enabledToggleCount = 1;
    wgpu::InstanceDescriptor               instance_descriptor{};
    std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
    instance_descriptor.requiredFeatures                     = instance_features.data();
    instance_descriptor.requiredFeatureCount                 = instance_features.size();
+
+#ifndef __EMSCRIPTEN__
+    const char * const          instanceEnabledToggles[] = { "allow_unsafe_apis" };
+    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
+    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
+    instanceTogglesDesc.enabledToggleCount = 1;
    instance_descriptor.nextInChain        = &instanceTogglesDesc;
+#endif

    webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+
+#ifdef __EMSCRIPTEN__
+    if (webgpu_ctx->instance == nullptr) {
+        GGML_LOG_ERROR("ggml_webgpu: Failed to create WebGPU instance. Make sure either -sASYNCIFY or -sJSPI is set\n");
+        return nullptr;
+    }
+#endif
    GGML_ASSERT(webgpu_ctx->instance != nullptr);

    static ggml_backend_reg reg = {
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@ -1169,7 +1169,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo
 struct gguf_writer_base {
    size_t written_bytes {0u};

-    ~gguf_writer_base(void) {}
+    ~gguf_writer_base(void) = default;

    // we bet on devirtualization
    virtual void write(int8_t val) = 0;
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@ -31,6 +31,14 @@ except ImportError:
 else:
    _mistral_common_installed = True

+try:
+    from mistral_common.tokens.tokenizers.utils import ( # pyright: ignore[reportMissingImports]
+        get_one_valid_tokenizer_file,
+    )
+except ImportError:
+    # We still want the conversion to work with older mistral-common versions.
+    get_one_valid_tokenizer_file = None
+

 import gguf

@ -673,6 +681,10 @@ class MistralVocab(Vocab):

        # Find the tokenizer files
        all_files = [f.as_posix() for f in base_path.glob("**/*") if f.is_file()]
+
+        if get_one_valid_tokenizer_file is not None:
+            tokenizer_file_path = get_one_valid_tokenizer_file(all_files)
+        else:
            valid_tokenizer_files = _filter_valid_tokenizer_files(all_files)

            if len(valid_tokenizer_files) == 0:
@ -689,8 +701,10 @@ class MistralVocab(Vocab):
            else:
                tokenizer_file = valid_tokenizer_files[0]

+            tokenizer_file_path = base_path / tokenizer_file
+
        self.tokenizer = MistralTokenizer.from_file(
-            base_path / tokenizer_file
+            tokenizer_file_path
        ).instruct_tokenizer.tokenizer
        self.tokenizer_type = (
            MistralTokenizerType.tekken
@ -698,7 +712,7 @@ class MistralVocab(Vocab):
            else MistralTokenizerType.spm
        )
        self.vocab_size = self.tokenizer.n_words
-        self.fname_tokenizer = base_path / tokenizer_file
+        self.fname_tokenizer = tokenizer_file_path
        self._name = (
            "mistral-" + self.tokenizer_type.value + "-" + self.tokenizer.version
        )
--- a/scripts/serve-static.js
+++ b/scripts/serve-static.js
@ -0,0 +1,110 @@
+const http = require('http');
+const fs = require('fs').promises;
+const path = require('path');
+
+// This file is used for testing wasm build from emscripten
+// Example build command:
+// emcmake cmake -B build-wasm -DGGML_WEBGPU=ON -DLLAMA_CURL=OFF
+// cmake --build build-wasm --target test-backend-ops -j
+
+const PORT = 8080;
+const STATIC_DIR = path.join(__dirname, '../build-wasm/bin');
+console.log(`Serving static files from: ${STATIC_DIR}`);
+
+const mimeTypes = {
+  '.html': 'text/html',
+  '.js': 'text/javascript',
+  '.css': 'text/css',
+  '.png': 'image/png',
+  '.jpg': 'image/jpeg',
+  '.gif': 'image/gif',
+  '.svg': 'image/svg+xml',
+  '.json': 'application/json',
+  '.woff': 'font/woff',
+  '.woff2': 'font/woff2',
+};
+
+async function generateDirListing(dirPath, reqUrl) {
+  const files = await fs.readdir(dirPath);
+  let html = `
+    <!DOCTYPE html>
+    <html>
+    <head>
+      <title>Directory Listing</title>
+      <style>
+        body { font-family: Arial, sans-serif; padding: 20px; }
+        ul { list-style: none; padding: 0; }
+        li { margin: 5px 0; }
+        a { text-decoration: none; color: #0066cc; }
+        a:hover { text-decoration: underline; }
+      </style>
+    </head>
+    <body>
+      <h1>Directory: ${reqUrl}</h1>
+      <ul>
+  `;
+
+  if (reqUrl !== '/') {
+    html += `<li><a href="../">../ (Parent Directory)</a></li>`;
+  }
+
+  for (const file of files) {
+    const filePath = path.join(dirPath, file);
+    const stats = await fs.stat(filePath);
+    const link = encodeURIComponent(file) + (stats.isDirectory() ? '/' : '');
+    html += `<li><a href="${link}">${file}${stats.isDirectory() ? '/' : ''}</a></li>`;
+  }
+
+  html += `
+      </ul>
+    </body>
+    </html>
+  `;
+  return html;
+}
+
+const server = http.createServer(async (req, res) => {
+  try {
+    // Set COOP and COEP headers
+    res.setHeader('Cross-Origin-Opener-Policy', 'same-origin');
+    res.setHeader('Cross-Origin-Embedder-Policy', 'require-corp');
+    res.setHeader('Cache-Control', 'no-store, no-cache, must-revalidate, proxy-revalidate');
+    res.setHeader('Pragma', 'no-cache');
+    res.setHeader('Expires', '0');
+
+    const filePath = path.join(STATIC_DIR, decodeURIComponent(req.url));
+    const stats = await fs.stat(filePath);
+
+    if (stats.isDirectory()) {
+      const indexPath = path.join(filePath, 'index.html');
+      try {
+        const indexData = await fs.readFile(indexPath);
+        res.writeHeader(200, { 'Content-Type': 'text/html' });
+        res.end(indexData);
+      } catch {
+        // No index.html, generate directory listing
+        const dirListing = await generateDirListing(filePath, req.url);
+        res.writeHeader(200, { 'Content-Type': 'text/html' });
+        res.end(dirListing);
+      }
+    } else {
+      const ext = path.extname(filePath).toLowerCase();
+      const contentType = mimeTypes[ext] || 'application/octet-stream';
+      const data = await fs.readFile(filePath);
+      res.writeHeader(200, { 'Content-Type': contentType });
+      res.end(data);
+    }
+  } catch (err) {
+    if (err.code === 'ENOENT') {
+      res.writeHeader(404, { 'Content-Type': 'text/plain' });
+      res.end('404 Not Found');
+    } else {
+      res.writeHeader(500, { 'Content-Type': 'text/plain' });
+      res.end('500 Internal Server Error');
+    }
+  }
+});
+
+server.listen(PORT, () => {
+  console.log(`Server running at http://localhost:${PORT}/`);
+});
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 template <typename T>
 struct no_init {
    T value;
-    no_init() { /* do nothing */ }
+    no_init() = default;
 };

 struct time_meas {
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@ -485,7 +485,7 @@ struct llama_mlock::impl {
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;
        }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
            suggest = false;
        }
 #endif
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }

 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;

    uint64_t n_elements = 0;

@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }

-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;

 void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }

-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;

 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
--- a/tests/.gitignore
+++ b/tests/.gitignore
@ -3,3 +3,4 @@
 *.o
 ggml-common.h
 **/*.swp
+!peg-parser
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,13 +1,15 @@
 llama_add_compile_flags()

 function(llama_build source)
+    set(TEST_SOURCES ${source} ${ARGN})
+
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_TARGET ${LLAMA_TEST_NAME})
    else()
        get_filename_component(TEST_TARGET ${source} NAME_WE)
    endif()

-    add_executable(${TEST_TARGET} ${source})
+    add_executable(${TEST_TARGET} ${TEST_SOURCES})
    target_link_libraries(${TEST_TARGET} PRIVATE common)
    install(TARGETS ${TEST_TARGET} RUNTIME)
 endfunction()
@ -83,6 +85,8 @@ function(llama_build_and_test source)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+    set(TEST_SOURCES ${source} ${LLAMA_TEST_UNPARSED_ARGUMENTS} get-model.cpp)
+
    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()
@ -95,7 +99,7 @@ function(llama_build_and_test source)
        get_filename_component(TEST_TARGET ${source} NAME_WE)
    endif()

-    add_executable(${TEST_TARGET} ${source} get-model.cpp)
+    add_executable(${TEST_TARGET} ${TEST_SOURCES})
    install(TARGETS ${TEST_TARGET} RUNTIME)
    target_link_libraries(${TEST_TARGET} PRIVATE common)

@ -180,9 +184,21 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS)
 endif()

 llama_build_and_test(test-chat-parser.cpp)
+llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
 llama_build_and_test(test-chat-template.cpp)
 llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
+llama_build_and_test(
+    test-peg-parser.cpp
+    peg-parser/simple-tokenize.cpp
+    peg-parser/test-basic.cpp
+    peg-parser/test-gbnf-generation.cpp
+    peg-parser/test-json-parser.cpp
+    peg-parser/test-json-serialization.cpp
+    peg-parser/test-unicode.cpp
+    peg-parser/testing.h
+    peg-parser/tests.h
+)
 llama_build_and_test(test-regex-partial.cpp)

 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
--- a/tests/peg-parser/simple-tokenize.cpp
+++ b/tests/peg-parser/simple-tokenize.cpp
@ -0,0 +1,37 @@
+#include "simple-tokenize.h"
+
+std::vector<std::string> simple_tokenize(const std::string & input) {
+    std::vector<std::string> result;
+    std::string              current;
+
+    for (size_t i = 0; i < input.size(); i++) {
+        switch (input[i]) {
+            case ' ':
+            case '\n':
+            case '\t':
+            case '{':
+            case '}':
+            case ',':
+            case '[':
+            case '"':
+            case ']':
+            case '.':
+            case '<':
+            case '>':
+            case '=':
+            case '/':
+                if (!current.empty()) {
+                    result.push_back(current);
+                    current.clear();
+                }
+            default:;
+        }
+        current += input[i];
+    }
+
+    if (!current.empty()) {
+        result.push_back(current);
+    }
+
+    return result;
+}
--- a/tests/peg-parser/simple-tokenize.h
+++ b/tests/peg-parser/simple-tokenize.h
@ -0,0 +1,6 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+std::vector<std::string> simple_tokenize(const std::string &);
--- a/tests/peg-parser/test-basic.cpp
+++ b/tests/peg-parser/test-basic.cpp
@ -0,0 +1,454 @@
+#include "tests.h"
+
+void test_basic(testing & t) {
+    t.test("chars", [](testing & t) {
+        // Test common escape sequences - newline
+        t.test("escape_sequence_newline", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\n");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_newline", true, result.success());
+        });
+
+        // Test common escape sequences - tab
+        t.test("escape_sequence_tab", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\t");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_tab", true, result.success());
+        });
+
+        // Test common escape sequences - backslash
+        t.test("escape_sequence_backslash", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("\\");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_backslash", true, result.success());
+        });
+
+        // Test common escape sequences - space (should ())
+        t.test("escape_sequence_space_fail", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context(" ");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escape_sequence_space_fail", true, result.fail());
+        });
+
+        // Test escaped dash - 'a' should succeed
+        t.test("escaped_dash_a", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("a");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_a", true, result.success());
+        });
+
+        // Test escaped dash - '-' should succeed (literal dash)
+        t.test("escaped_dash_literal", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("-");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_literal", true, result.success());
+        });
+
+        // Test escaped dash - 'z' should succeed
+        t.test("escaped_dash_z", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("z");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_z", true, result.success());
+        });
+
+        // Test escaped dash - 'b' should NOT match (since \- is literal dash, not range)
+        t.test("escaped_dash_b_fail", [](testing &t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("b");
+            result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("escaped_dash_b_fail", true, result.fail());
+        });
+    });
+
+
+    t.test("optional", [](testing & t) {
+        // Full match with optional part present
+        t.test("optional_present", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello world");
+            auto result = parser.parse(ctx);
+            t.assert_equal("optional_present", true, result.success());
+            t.assert_equal("optional_present_end", 11u, result.end);
+        });
+
+        // Full match with optional part absent
+        t.test("optional_absent", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("optional_absent", true, result.success());
+            t.assert_equal("optional_absent_end", 5u, result.end);
+        });
+
+        // Partial match - waiting for more input to determine if optional matches
+        t.test("partial_match_need_more", [](testing &t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+                return p.literal("hello") + p.optional(p.literal(" world"));
+            });
+
+            auto ctx    = common_peg_parse_context("hello ", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("partial_match_need_more", true, result.need_more_input());
+        });
+    });
+
+    t.test("partial parsing", [](testing & t) {
+        // Literals - Basic Success
+        t.test("literal_success", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("hello");
+            result = parser.parse(ctx);
+            t.assert_equal("literal_success", true, result.success());
+        });
+
+        // Char Classes - Basic Lowercase Success
+        t.test("char_class_lowercase_success", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("a");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_lowercase_success", true, result.success());
+        });
+
+        // Char Classes - Uppercase Fail
+        t.test("char_class_uppercase_fail", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("A");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_uppercase_fail", true, result.fail());
+        });
+
+        // Char Classes with Dash - Lowercase Success
+        t.test("char_class_with_dash_lowercase", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("f");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_lowercase", true, result.success());
+        });
+
+        // Char Classes with Dash - Literal Dash Success
+        t.test("char_class_with_dash_literal_dash", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("-");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_literal_dash", true, result.success());
+        });
+
+        // Char Classes with Dash - Uppercase Fail
+        t.test("char_class_with_dash_uppercase_fail", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); });
+
+            common_peg_parse_context ctx;
+            common_peg_parse_result  result;
+
+            ctx    = common_peg_parse_context("A");
+            result = parser.parse(ctx);
+            t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail());
+        });
+
+        // Sequences - Partial Match 1
+        t.test("sequence_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<thi", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_1", true, result.need_more_input());
+        });
+
+        // Sequences - Partial Match 2
+        t.test("sequence_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("begin") + p.literal("end"); });
+
+            auto ctx    = common_peg_parse_context("begin", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_2", true, result.need_more_input());
+        });
+
+        // Sequences - Partial Match 3
+        t.test("sequence_partial_match_3", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<think></", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_partial_match_3", true, result.need_more_input());
+        });
+
+        // Sequences - Full Match
+        t.test("sequence_full_match", [&](testing & t) {
+            auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello") + p.literal("world"); });
+
+            auto ctx    = common_peg_parse_context("helloworld", false);
+            auto result = common_chat_combinator_parser.parse(ctx);
+            t.assert_equal("sequence_full_match", true, result.success());
+        });
+
+        // Sequences - No Match
+        t.test("sequence_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); });
+
+            auto ctx    = common_peg_parse_context("<think>I am common_chat_combinator_parser", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("sequence_no_match", true, result.fail());
+        });
+
+        // Choices - Partial Match 1
+        t.test("choices_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); });
+
+            auto ctx    = common_peg_parse_context("opt", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_partial_match_1", true, result.need_more_input());
+        });
+
+        // Choices - Partial Match 2
+        t.test("choices_partial_match_2", [&](testing & t) {
+            auto parser =
+                build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); });
+
+            auto ctx    = common_peg_parse_context("choice", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_partial_match_2", true, result.need_more_input());
+        });
+
+        // Choices - Full Match 1
+        t.test("choices_full_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); });
+
+            auto ctx    = common_peg_parse_context("first", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_full_match_1", true, result.success());
+        });
+
+        // Choices - Full Match 2
+        t.test("choices_full_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); });
+
+            auto ctx    = common_peg_parse_context("beta", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_full_match_2", true, result.success());
+        });
+
+        // Choices - No Match
+        t.test("choices_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); });
+
+            auto ctx    = common_peg_parse_context("best", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("choices_no_match", true, result.fail());
+        });
+
+        // Zero or More - Partial Match 1
+        t.test("zero_or_more_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); });
+
+            auto ctx    = common_peg_parse_context("a", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input());
+        });
+
+        // Zero or More - Partial Match 2
+        t.test("zero_or_more_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); });
+
+            auto ctx    = common_peg_parse_context("xyx", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input());
+        });
+
+        // Zero or More - Full Match
+        t.test("zero_or_more_full_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); });
+
+            auto ctx    = common_peg_parse_context("test", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("zero_or_more_full_match", true, result.success());
+        });
+
+        // One or More - Partial Match 1
+        t.test("one_or_more_partial_match_1", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); });
+
+            auto ctx    = common_peg_parse_context("rep", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input());
+        });
+
+        // One or More - Partial Match 2
+        t.test("one_or_more_partial_match_2", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); });
+
+            auto ctx    = common_peg_parse_context("aba", true);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input());
+        });
+
+        // One or More - Full Match
+        t.test("one_or_more_full_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); });
+
+            auto ctx    = common_peg_parse_context("single", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_full_match", true, result.success());
+        });
+
+        // One or More - No Match
+        t.test("one_or_more_no_match", [&](testing & t) {
+            auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); });
+
+            auto ctx    = common_peg_parse_context("success", false);
+            auto result = parser.parse(ctx);
+            t.assert_equal("one_or_more_no_match", true, result.fail());
+        });
+    });
+
+
+    t.test("recursive rules", [](testing &t) {
+        // Test simple number
+        t.test("simple_number", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("1", false);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test simple list
+        t.test("simple_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[1]", false);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test nested list
+        t.test("nested_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[2]]", false);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test deeply nested list
+        t.test("deeply_nested_list", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[[3]]]", false);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_success", true, result.success());
+        });
+
+        // Test need_more_input match
+        t.test("need_more_input_match", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[[", true);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+        });
+
+        // Test no match
+        t.test("no_match", [](testing &t) {
+            auto value_parser = build_peg_parser([](common_peg_parser_builder & p) {
+                p.rule("number", p.chars("0-9"));
+                p.rule("list", p.literal("[") + p.ref("value") + p.literal("]"));
+                return p.rule("value", p.ref("number") | p.ref("list"));
+            });
+
+            common_peg_parse_context ctx("[a]", false);
+            auto           result = value_parser.parse(ctx);
+
+            t.assert_equal("result_is_fail", true, result.fail());
+        });
+    });
+}
--- a/tests/peg-parser/test-gbnf-generation.cpp
+++ b/tests/peg-parser/test-gbnf-generation.cpp
@ -0,0 +1,250 @@
+#include "tests.h"
+
+#include "json-schema-to-grammar.h"
+
+#include <regex>
+
+static std::string trim_leading_space(const std::string & s) {
+    static const std::regex leading_ws_re = std::regex(R"((^|\n)\s+)");
+    return std::regex_replace(s, leading_ws_re, "$1");
+}
+
+static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
+    t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual));
+}
+
+void test_gbnf_generation(testing &t) {
+    t.test("literal grammar generation", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("char class grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.chars("[a-z]", 1, 1);
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= [a-z]
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("sequence grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.literal(" ") + p.literal("world");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " " "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("choice grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("cat") | p.literal("dog");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "cat" | "dog"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("one_or_more grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("zero_or_more grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.zero_or_more(p.literal("a"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "a"*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("optional grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") + p.optional(p.literal(" world"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" " world"?
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("until grammar", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p)  {
+            return p.until("</tag>");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])*
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("complex expressions with parentheses", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.one_or_more(p.literal("a") | p.literal("b"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= ("a" | "b")+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("rule references", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            auto digit = p.rule("digit", p.chars("[0-9]", 1, 1));
+            return p.one_or_more(digit);
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            digit ::= [0-9]
+            root ::= digit+
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("escaping in literals", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello\nworld\n!");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello\nworld\n!"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("operator<< (whitespace insertion)", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.literal("hello") << p.literal("world");
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= "hello" space "world"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("emit only reachable rules", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            p.rule("orphan", p.literal("orphan"));
+            return p.literal("hello") + p.rule("child", p.literal(" world"));
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            child ::= " world"
+            root ::= "hello" child
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+    });
+
+    t.test("emit only trigger rules (and references)", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2"));
+            p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true);
+            p.rule("rule-3", p.literal("c") + p.ref("rule-4"));
+            p.rule("rule-4", p.literal("d"), true);
+            return rule1;
+        });
+
+        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-1
+            rule-1 ::= "a" rule-2
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf);
+
+        auto gbnf_lazy = build_grammar([&](const common_grammar_builder & builder) {
+            parser.build_grammar(builder, true);
+        });
+
+        assert_gbnf_equal(t, R"""(
+            root ::= rule-2 | rule-4
+            rule-2 ::= "b" rule-3
+            rule-3 ::= "c" rule-4
+            rule-4 ::= "d"
+            space ::= | " " | "\n"{1,2} [ \t]{0,20}
+        )""", gbnf_lazy);
+    });
+}
--- a/tests/peg-parser/test-json-parser.cpp
+++ b/tests/peg-parser/test-json-parser.cpp
@ -0,0 +1,109 @@
+#include "tests.h"
+
+void test_json_parser(testing &t) {
+    // Test parsing a simple JSON object
+    t.test("simple JSON object parsing", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"name": "test", "value": 42, "flag": true})";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing a JSON array with mixed types
+    t.test("JSON array with mixed types", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"([1, "hello", true, null, 3.14])";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test parsing nested JSON with objects and arrays
+    t.test("nested JSON with objects and arrays", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string input =
+            R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})";
+        common_peg_parse_context ctx(input);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_success", true, result.success());
+        t.assert_equal("result_end", input.size(), result.end);
+    });
+
+    // Test need_more_input() parsing - incomplete object
+    t.test("need_more_input() parsing - incomplete object", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"name": "test", "value": )";
+        common_peg_parse_context ctx(input, true);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test need_more_input() parsing - incomplete array
+    t.test("need_more_input() parsing - incomplete array", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"([1, 2, 3, )";
+        common_peg_parse_context ctx(input, true);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    // Test need_more_input() parsing - incomplete nested structure
+    t.test("need_more_input() parsing - incomplete nested structure", [](testing &t) {
+        auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); });
+
+        std::string    input = R"({"data": {"nested": )";
+        common_peg_parse_context ctx(input, true);
+
+        auto result = json.parse(ctx);
+
+        t.assert_equal("result_is_need_more_input", true, result.need_more_input());
+    });
+
+    t.test("object member", [](testing &t) {
+        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
+            return p.json_member("name", "\"" + p.chars("[a-z]") + "\"");
+        });
+
+        t.test("success", [&](testing &t) {
+            std::string input = R"("name": "bob")";
+            common_peg_parse_context ctx(input, false);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("success", result.success());
+        });
+
+        t.test("partial", [&](testing &t) {
+            std::string input = R"("name": "bo)";
+            common_peg_parse_context ctx(input, true);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("need more input", result.need_more_input());
+        });
+
+        t.test("failed", [&](testing &t) {
+            std::string input = R"([])";
+            common_peg_parse_context ctx(input, false);
+
+            auto result = parser.parse(ctx);
+            t.assert_true("fail", result.fail());
+        });
+    });
+}
--- a/tests/peg-parser/test-json-serialization.cpp
+++ b/tests/peg-parser/test-json-serialization.cpp
@ -0,0 +1,28 @@
+#include "tests.h"
+
+void test_json_serialization(testing &t) {
+    auto original = build_peg_parser([](common_peg_parser_builder & p) {
+        return "<tool_call>" + p.json() + "</tool_call>";
+    });
+
+    auto json_serialized = original.to_json().dump();
+
+    t.test("compare before/after", [&](testing &t) {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+
+        // Test complex JSON
+        std::string input = R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})";
+        common_peg_parse_context ctx1(input);
+        common_peg_parse_context ctx2(input);
+
+        auto result1 = original.parse(ctx1);
+        auto result2 = deserialized.parse(ctx2);
+
+        t.assert_equal("both_succeed", result1.success(), result2.success());
+        t.assert_equal("same_end_pos", result1.end, result2.end);
+    });
+
+    t.bench("deserialize", [&]() {
+        auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized));
+    }, 100);
+}
--- a/tests/peg-parser/test-unicode.cpp
+++ b/tests/peg-parser/test-unicode.cpp
@ -0,0 +1,449 @@
+#include "tests.h"
+
+#include "peg-parser.h"
+
+#include <string>
+#include <sstream>
+#include <iomanip>
+#include <cctype>
+
+static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
+    t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
+}
+
+static std::string hex_dump(const std::string& str) {
+    std::ostringstream oss;
+    for (unsigned char c : str) {
+        if (std::isprint(c)) {
+            oss << c;
+        } else {
+            oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
+        }
+    }
+    return oss.str();
+}
+
+void test_unicode(testing &t) {
+    struct test_case {
+        std::string input;
+        std::string expected_text;
+        common_peg_parse_result_type expected_result;
+    };
+
+    t.test("any", [](testing &t) {
+        std::vector<test_case> test_cases {
+            // Valid UTF-8 sequences
+            {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+            // Incomplete UTF-8 sequences (partial bytes at end)
+            {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+            // Invalid/malformed UTF-8 sequences
+            {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
+            {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+        };
+
+        auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+            return p.sequence({p.one_or_more(p.any()), p.end()});
+        });
+
+        for (size_t i = 0; i < test_cases.size(); i++) {
+            const auto & tc = test_cases[i];
+            std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+            t.test(test_name, [&](testing &t) {
+                common_peg_parse_context ctx(tc.input, true);
+                auto result = parser.parse(ctx);
+
+                // Assert result type matches
+                assert_result_equal(t, tc.expected_result, result.type);
+
+                // Assert matched text if success or need_more_input
+                if (result.success() || result.need_more_input()) {
+                    std::string matched = tc.input.substr(result.start, result.end - result.start);
+                    t.assert_equal(tc.expected_text, matched);
+                }
+            });
+        }
+    });
+
+    t.test("char classes", [](testing &t) {
+        t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - CJK Unified Ideographs
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+                {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
+                {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
+
+                // Outside range - should fail
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},                                                     // ASCII
+                {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+4DFF (before range)
+                {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+A000 (after range)
+
+                // Incomplete sequences in range
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+4E00
+                {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+597D
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, true);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Within range - Emoticons (all 4-byte UTF-8)
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+                {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
+                {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
+
+                // Outside range
+                {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
+                {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
+
+                // Incomplete sequences
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
+                {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},     // Very incomplete
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, true);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("mixed unicode ranges", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Match CJK
+                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
+                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
+
+                // Match emoticons
+                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
+
+                // Match ASCII digits
+                {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Don't match outside any range
+                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
+                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
+
+                // Incomplete
+                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, true);
+                    auto result = parser.parse(ctx);
+
+                    // Assert result type matches
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    // Assert matched text if success or need_more_input
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+    });
+
+    t.test("until parser", [](testing &t) {
+        t.test("ASCII delimiter with Unicode content", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // CJK characters before delimiter
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Emoji before delimiter
+                {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, false);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("incomplete UTF-8 at end", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete emoji at end, no delimiter
+                {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete CJK at end, no delimiter
+                {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Complete content, no delimiter (should consume all valid UTF-8)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, true);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success() || result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                return p.until("</tag>");
+            });
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    common_peg_parse_context ctx(tc.input, false);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+                });
+            }
+        });
+    });
+
+    t.test("json_string parser", [](testing &t) {
+        t.test("valid UTF-8 characters", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // ASCII only
+                {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 2-byte UTF-8 (accented characters)
+                {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 3-byte UTF-8 (CJK)
+                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // 4-byte UTF-8 (emoji)
+                {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mixed content
+                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.sequence({p.json_string_content(), p.literal("\"")});
+                    });
+
+                    common_peg_parse_context ctx(tc.input, false);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("incomplete UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Incomplete 2-byte sequence
+                {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 3-byte sequence
+                {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete 4-byte sequence
+                {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+
+                // Incomplete at very start
+                {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.json_string_content();
+                    });
+
+                    common_peg_parse_context ctx(tc.input, true);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.need_more_input()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start);
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+
+        t.test("malformed UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Invalid UTF-8 bytes
+                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Continuation byte without lead byte
+                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Invalid continuation byte
+                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+
+                // Overlong encoding (security issue)
+                {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.json_string_content();
+                    });
+
+                    common_peg_parse_context ctx(tc.input, false);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+                });
+            }
+        });
+
+        t.test("escape sequences with UTF-8", [](testing &t) {
+            std::vector<test_case> test_cases {
+                // Unicode escape sequence
+                {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Mix of UTF-8 and escape sequences
+                {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+
+                // Escaped quote in UTF-8 string
+                {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
+            };
+
+            for (size_t i = 0; i < test_cases.size(); i++) {
+                const auto & tc = test_cases[i];
+                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
+
+                t.test(test_name, [&](testing &t) {
+                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
+                        return p.sequence({p.json_string_content(), p.literal("\"")});
+                    });
+
+                    common_peg_parse_context ctx(tc.input, false);
+                    auto result = parser.parse(ctx);
+
+                    assert_result_equal(t, tc.expected_result, result.type);
+
+                    if (result.success()) {
+                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
+                        t.assert_equal(tc.expected_text, matched);
+                    }
+                });
+            }
+        });
+    });
+}
--- a/tests/peg-parser/testing.h
+++ b/tests/peg-parser/testing.h
@ -0,0 +1,243 @@
+#pragma once
+
+#include "common.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <string>
+#include <regex>
+#include <vector>
+
+struct testing {
+    std::ostream &out;
+    std::vector<std::string> stack;
+    std::regex filter;
+    bool filter_tests = false;
+    bool throw_exception = false;
+    bool verbose = false;
+    int tests = 0;
+    int assertions = 0;
+    int failures = 0;
+    int unnamed = 0;
+    int exceptions = 0;
+
+    static constexpr std::size_t status_column = 80;
+
+    explicit testing(std::ostream &os = std::cout) : out(os) {}
+
+    std::string indent() const {
+        if (stack.empty()) {
+            return "";
+        }
+        return std::string((stack.size() - 1) * 2, ' ');
+    }
+
+    std::string full_name() const {
+        return string_join(stack, ".");
+    }
+
+    void log(const std::string & msg) {
+        if (verbose) {
+            out << indent() << "  " << msg << "\n";
+        }
+    }
+
+    void set_filter(const std::string & re) {
+        filter = std::regex(re);
+        filter_tests = true;
+    }
+
+    bool should_run() const {
+        if (filter_tests) {
+            if (!std::regex_match(full_name(), filter)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    template <typename F>
+    void run_with_exceptions(F &&f, const char *ctx) {
+        try {
+            f();
+        } catch (const std::exception &e) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): " << e.what() << "\n";
+            if (throw_exception) {
+                throw;
+            }
+        } catch (...) {
+            ++failures;
+            ++exceptions;
+            out << indent() << "UNHANDLED EXCEPTION (" << ctx << "): unknown\n";
+            if (throw_exception) {
+                throw;
+            }
+        }
+    }
+
+    void print_result(const std::string &label, int new_failures, int new_assertions, const std::string &extra = "") const {
+        std::string line = indent() + label;
+
+        std::string details;
+        if (new_assertions > 0) {
+            if (new_failures == 0) {
+                details = std::to_string(new_assertions) + " assertion(s)";
+            } else {
+                details = std::to_string(new_failures) + " of " +
+                          std::to_string(new_assertions) + " assertion(s) failed";
+            }
+        }
+        if (!extra.empty()) {
+            if (!details.empty()) {
+                details += ", ";
+            }
+            details += extra;
+        }
+
+        if (!details.empty()) {
+            line += " (" + details + ")";
+        }
+
+        std::string status = (new_failures == 0) ? "[PASS]" : "[FAIL]";
+
+        if (line.size() + 1 < status_column) {
+            line.append(status_column - line.size(), ' ');
+        } else {
+            line.push_back(' ');
+        }
+
+        out << line << status << "\n";
+    }
+
+    template <typename F>
+    void test(const std::string &name, F f) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << name << "\n";
+
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        run_with_exceptions([&] { f(*this); }, "test");
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        print_result(name, new_failures, new_assertions);
+
+        stack.pop_back();
+    }
+
+    template <typename F>
+    void test(F f) {
+        test("test #" + std::to_string(++unnamed), f);
+    }
+
+    template <typename F>
+    void bench(const std::string &name, F f, int iterations = 100) {
+        stack.push_back(name);
+        if (!should_run()) {
+            stack.pop_back();
+            return;
+        }
+
+        ++tests;
+        out << indent() << "[bench] " << name << "\n";
+
+        int before_failures   = failures;
+        int before_assertions = assertions;
+
+        using clock = std::chrono::high_resolution_clock;
+
+        std::chrono::microseconds duration(0);
+
+        run_with_exceptions([&] {
+            for (auto i = 0; i < iterations; i++) {
+                auto start = clock::now();
+                f();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(clock::now() - start);
+            }
+        }, "bench");
+
+        auto avg_elapsed   = duration.count() / iterations;
+        auto avg_elapsed_s = std::chrono::duration_cast<std::chrono::duration<double>>(duration).count() / iterations;
+        auto rate = (avg_elapsed_s > 0.0) ? (1.0 / avg_elapsed_s) : 0.0;
+
+        int new_failures   = failures   - before_failures;
+        int new_assertions = assertions - before_assertions;
+
+        std::string extra =
+            "n=" + std::to_string(iterations) +
+            " avg=" + std::to_string(avg_elapsed) + "us" +
+            " rate=" + std::to_string(int(rate)) + "/s";
+
+        print_result("[bench] " + name, new_failures, new_assertions, extra);
+
+        stack.pop_back();
+    }
+
+    template <typename F>
+    void bench(F f, int iterations = 100) {
+        bench("bench #" + std::to_string(++unnamed), f, iterations);
+    }
+
+    // Assertions
+    bool assert_true(bool cond) {
+        return assert_true("", cond);
+    }
+
+    bool assert_true(const std::string &msg, bool cond) {
+        ++assertions;
+        if (!cond) {
+            ++failures;
+            out << indent() << "ASSERT TRUE FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const A &expected, const B &actual) {
+        return assert_equal("", expected, actual);
+    }
+
+    template <typename A, typename B>
+    bool assert_equal(const std::string &msg, const A &expected, const B &actual) {
+        ++assertions;
+        if (!(actual == expected)) {
+            ++failures;
+            out << indent() << "ASSERT EQUAL FAILED";
+            if (!msg.empty()) {
+                out << " : " << msg;
+            }
+            out << "\n";
+
+            out << indent() << "  expected: " << expected << "\n";
+            out << indent() << "  actual  : " << actual << "\n";
+            return false;
+        }
+        return true;
+    }
+
+    // Print summary and return an exit code
+    int summary() const {
+        out << "\n";
+        out << "tests      : " << tests << "\n";
+        out << "assertions : " << assertions << "\n";
+        out << "failures   : " << failures << "\n";
+        out << "exceptions : " << exceptions << "\n";
+        return failures == 0 ? 0 : 1;
+    }
+};
--- a/tests/peg-parser/tests.h
+++ b/tests/peg-parser/tests.h
@ -0,0 +1,24 @@
+#pragma once
+
+// Common includes for all test files
+#include <nlohmann/json.hpp>
+#include <string>
+#include <vector>
+
+#include "testing.h"
+#include "peg-parser.h"
+#include "chat-peg-parser.h"
+#include "simple-tokenize.h"
+
+struct bench_tool_call {
+    std::string            id;
+    std::string            name;
+    nlohmann::ordered_json args;
+};
+
+// Test function declarations
+void test_basic(testing &t);
+void test_json_parser(testing &t);
+void test_gbnf_generation(testing &t);
+void test_unicode(testing &t);
+void test_json_serialization(testing &t);
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -41,12 +41,18 @@
 #include <vector>
 #include <unordered_map>

+#ifdef __EMSCRIPTEN__
+#   define N_THREADS 1
+#else
+#   define N_THREADS std::thread::hardware_concurrency()
+#endif
+
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
    size_t nels = ggml_nelements(tensor);
    std::vector<float> data(nels);
    {
        // parallel initialization
-        static const size_t n_threads = std::thread::hardware_concurrency();
+        static const size_t n_threads = N_THREADS;
        // static RNG initialization (revisit if n_threads stops being constant)
        static std::vector<std::default_random_engine> generators = []() {
            std::random_device rd;
@ -65,6 +71,9 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
            }
        };

+        if (n_threads == 1) {
+            init_thread(0, 0, nels);
+        } else {
            std::vector<std::future<void>> tasks;
            tasks.reserve(n_threads);
            for (size_t i = 0; i < n_threads; i++) {
@ -76,6 +85,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
                t.get();
            }
        }
+    }

    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
        ggml_backend_tensor_set(tensor, data.data(), 0, nels * sizeof(float));
@ -105,19 +115,25 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
            };

            const size_t min_blocks_per_thread = 1;
-            const size_t n_threads = std::min<size_t>(std::thread::hardware_concurrency()/2,
+            const size_t n_quant_threads = std::min<size_t>(std::max<size_t>(N_THREADS/2, 1),
                                                            std::max<size_t>(1, n_blocks / min_blocks_per_thread));
+
+            if (n_quant_threads == 1) {
+                // single-threaded quantization: do all blocks in the current thread
+                quantize_thread(0, n_blocks);
+            } else {
                std::vector<std::future<void>> tasks;
-            tasks.reserve(n_threads);
-            for (size_t i = 0; i < n_threads; i++) {
-                size_t start =     i*n_blocks/n_threads;
-                size_t end   = (i+1)*n_blocks/n_threads;
+                tasks.reserve(n_quant_threads);
+                for (size_t i = 0; i < n_quant_threads; i++) {
+                    size_t start =     i*n_blocks/n_quant_threads;
+                    size_t end   = (i+1)*n_blocks/n_quant_threads;
                    tasks.push_back(std::async(std::launch::async, quantize_thread, start, end));
                }
                for (auto & t : tasks) {
                    t.get();
                }
            }
+        }
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
        // This is going to create some weird integers though.
@ -8365,7 +8381,7 @@ int main(int argc, char ** argv) {
        auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
        if (ggml_backend_set_n_threads_fn) {
            // TODO: better value for n_threads
-            ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency());
+            ggml_backend_set_n_threads_fn(backend, N_THREADS);
        }

        size_t free, total;  // NOLINT
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@ -0,0 +1,768 @@
+#include <string>
+#include <iostream>
+#include <numeric>
+
+#include "chat-parser.h"
+#include "chat-peg-parser.h"
+#include "chat.h"
+#include "common.h"
+#include "json-schema-to-grammar.h"
+#include "peg-parser.h"
+#include "peg-parser/testing.h"
+#include "peg-parser/simple-tokenize.h"
+#include "nlohmann/json.hpp"
+
+using json = nlohmann::ordered_json;
+
+static json create_tools();
+static void test_example_native(testing & t);
+static void test_example_qwen3_coder(testing & t);
+static void test_command7_parser_compare(testing & t);
+
+int main(int argc, char *argv[]) {
+    testing t(std::cout);
+    if (argc >= 2) {
+        t.set_filter(argv[1]);
+    }
+
+    const char * verbose = getenv("LLAMA_TEST_VERBOSE");
+    if (verbose) {
+        t.verbose = std::string(verbose) == "1";
+    }
+
+    t.test("native", test_example_native);
+    t.test("qwen3 coder", test_example_qwen3_coder);
+    t.test("comparison", test_command7_parser_compare);
+
+    return t.summary();
+}
+
+static json create_tools() {
+    json tools = json::array();
+
+    json tool_weather = {
+        {"type", "function"},
+        {"function", {
+            {"name", "get_current_weather"},
+            {"description", "Get the current weather in a given location"},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"location", {
+                        {"type", "string"},
+                        {"description", "The city and state, e.g. San Francisco, CA"}
+                    }},
+                    {"unit", {
+                        {"type", "string"},
+                        {"enum", {"celsius", "fahrenheit"}},
+                        {"description", "The temperature unit to use. Infer this from the users location."}
+                    }}
+                }},
+                {"required", {"location", "unit"}},
+            }},
+        }}
+    };
+    tools.push_back(tool_weather);
+
+    json tool_forecast = {
+        {"type", "function"},
+        {"function", {
+            {"name", "get_forecast"},
+            {"description", "Get the weather forecast for a given location"},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"location", {
+                        {"type", "string"},
+                        {"description", "The city and state, e.g. San Francisco, CA"}
+                    }},
+                    {"unit", {
+                        {"type", "string"},
+                        {"enum", {"celsius", "fahrenheit"}},
+                        {"description", "The temperature unit to use. Infer this from the users location."}
+                    }},
+                    {"days", {
+                        {"type", "integer"},
+                        {"description", "Number of days to forecast (1-10)"},
+                        {"minimum", 1},
+                        {"maximum", 10}
+                    }}
+                }},
+                {"required", {"location", "unit"}},
+            }},
+        }}
+    };
+    tools.push_back(tool_forecast);
+
+    json tool_search = {
+        {"type", "function"},
+        {"function", {
+            {"name", "search_knowledge_base"},
+            {"description", "Search the internal technical documentation knowledge base."},
+            {"parameters", {
+                {"type", "object"},
+                {"properties", {
+                    {"query", {
+                        {"type", "string"},
+                        {"description", "The search query string."}
+                    }},
+                    {"max_results", {
+                        {"type", "integer"},
+                        {"description", "The maximum number of results to return."},
+                        {"default", 5}
+                    }},
+                    {"category", {
+                        {"type", "string"},
+                        {"enum", {"api", "troubleshooting", "billing", "general"}},
+                        {"description", "Filter search by specific category."}
+                    }}
+                }},
+                {"required", {"query", "category"}},
+                {"additionalProperties", false}
+            }},
+            {"strict", true}
+        }}
+    };
+    tools.push_back(tool_search);
+
+    return tools;
+}
+
+struct tool_argument {
+    std::string name;
+    std::string type;
+    bool is_required;
+    json schema;
+};
+
+struct tool_definition {
+    std::string name;
+    std::vector<tool_argument> arguments;
+    json schema;
+};
+
+// Test fictitious model output that emits arguments as JSON.
+static void test_example_native(testing & t) {
+    struct test_case {
+        // Parameters
+        std::string name;
+        json tools;
+        common_chat_tool_choice tool_choice;
+        common_reasoning_format reasoning_format;
+        json json_schema;
+        bool parallel_tool_calls;
+        bool thinking_forced_open;
+        std::string input;
+
+        // Expect
+        std::string expect_reasoning;
+        std::string expect_content;
+        std::vector<common_chat_tool_call> expect_tool_calls;
+    };
+
+    auto build_parser = [](const test_case & tc) {
+        return build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
+            auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
+            auto reasoning = p.eps();
+            if (tc.thinking_forced_open) {
+                // If thinking is forced open, expect a closing tag
+                reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
+            } else {
+                // Otherwise, optionally accept thinking wrapped in tags
+                reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
+            }
+
+            // tool calling parser
+            if (tc.tools.is_array() && !tc.tools.empty()) {
+                auto tools = p.choice();
+                for (const auto & tool : tc.tools) {
+                    const auto & function = tool.at("function");
+                    std::string name = function.at("name");
+                    const auto & schema = function.at("parameters");
+
+                    auto tool_name = p.json_member("name", "\"" + p.tool_name(p.literal(name)) + "\"");
+                    auto tool_args = p.json_member("arguments", p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema)));
+
+                    tools |= p.rule("tool-" + name, p.tool_open(p.literal("{")) << tool_name << "," << tool_args << "}");
+                };
+
+                auto parallel_calls = p.eps();
+                if (tc.parallel_tool_calls) {
+                    parallel_calls = p.zero_or_more("," << tools);
+                }
+
+                auto tool_call = p.trigger_rule("tool-call",
+                    p.sequence({
+                        p.literal("<tool_call>["),
+                        tools,
+                        parallel_calls,
+                        p.literal("]</tool_call>")
+                    })
+                );
+
+                return p.sequence({
+                    (reasoning_in_content ? p.eps() : reasoning),
+                    p.content(p.until("<tool_call>")),
+                    p.optional(p.space() + tool_call),
+                    p.space(),
+                    p.end()
+                });
+            }
+
+            // response_format parser
+            if (tc.json_schema.is_object() && !tc.json_schema.empty()) {
+                return p.sequence({
+                    (reasoning_in_content ? p.eps() : reasoning),
+                    p.content(p.schema(p.json(), "response-output", tc.json_schema)),
+                    p.space(),
+                    p.end()
+                });
+            }
+
+            // Content-only parser
+            return p.sequence({
+                (reasoning_in_content ? p.eps() : reasoning),
+                p.content(p.rest()),
+                p.end()
+            });
+        });
+    };
+
+    std::vector<test_case> test_cases = std::vector<test_case>{
+        {
+            /* .name =                 */ "content with thinking_forced_open = false",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ false,
+            /* .input =                */ (
+                "<think>The user said hello, I must say hello back</think>\nHello"
+            ),
+            /* .expect_reasoning =     */ "The user said hello, I must say hello back",
+            /* .expect_content =       */ "Hello",
+            /* .expect_tool_calls =    */ {},
+        },
+        {
+            /* .name =                 */ "content with thinking_forced_open = false and no reasoning",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ false,
+            /* .input =                */ (
+                "Hello"
+            ),
+            /* .expect_reasoning =     */ "",
+            /* .expect_content =       */ "Hello",
+            /* .expect_tool_calls =    */ {},
+        },
+        {
+            /* .name =                 */ "content with thinking_forced_open = false and reasoning_format = none",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "<think>The user said hello, I must say hello back</think>\nHello"
+            ),
+            /* .expect_reasoning =     */ "",
+            /* .expect_content =       */ "<think>The user said hello, I must say hello back</think>\nHello",
+            /* .expect_tool_calls =    */ {},
+        },
+        {
+            /* .name =                 */ "content with thinking_forced_open = true",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "The user said hello, I must say hello back</think>\nHello"
+            ),
+            /* .expect_reasoning =     */ "The user said hello, I must say hello back",
+            /* .expect_content =       */ "Hello",
+            /* .expect_tool_calls =    */ {},
+        },
+        {
+            /* .name =                 */ "content with thinking_forced_open = true and reasoning_format = none",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "The user said hello, I must say hello back</think>\nHello"
+            ),
+            /* .expect_reasoning =     */ "",
+            /* .expect_content =       */ "The user said hello, I must say hello back</think>\nHello",
+            /* .expect_tool_calls =    */ {},
+        },
+        {
+            /* .name =                 */ "tools with tool_choice = auto and no parallel_tool_calls",
+            /* .tools =                */ create_tools(),
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "I must get the weather in New York</think>\n"
+                "<tool_call>["
+                R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+                "]</tool_call>"
+            ),
+            /* .expect_reasoning =     */ "I must get the weather in New York",
+            /* .expect_content =       */ "",
+            /* .expect_tool_calls =    */ {{
+                /* .name =      */ "get_current_weather",
+                /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+                /* .id =        */ "",
+            }},
+        },
+        {
+            /* .name =                 */ "tools with tool_choice = auto and parallel_tool_calls",
+            /* .tools =                */ create_tools(),
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {},
+            /* .parallel_tool_calls =  */ true,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me search that for you."
+                "<tool_call>["
+                R"({"name": "get_current_weather", "arguments": {"location": "New York City, NY", "unit": "fahrenheit"}})"
+                ", "
+                R"({"name": "get_current_weather", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit"}})"
+                ", "
+                R"({"name": "get_forecast", "arguments": {"location": "New York City, NY", "unit": "fahrenheit", "days": 3}})"
+                ", "
+                R"({"name": "get_forecast", "arguments": {"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3}})"
+                "]</tool_call>"
+            ),
+            /* .expect_reasoning =     */ "I must get the weather in New York and San Francisco and a 3 day forecast of each.",
+            /* .expect_content =       */ "Let me search that for you.",
+            /* .expect_tool_calls =    */ {{
+                /* .name =      */ "get_current_weather",
+                /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit"})",
+                /* .id =        */ "",
+            }, {
+                /* .name =      */ "get_current_weather",
+                /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit"})",
+                /* .id =        */ "",
+            }, {
+                /* .name =      */ "get_forecast",
+                /* .arguments = */ R"({"location": "New York City, NY", "unit": "fahrenheit", "days": 3})",
+                /* .id =        */ "",
+            }, {
+                /* .name =      */ "get_forecast",
+                /* .arguments = */ R"({"location": "San Francisco, CA", "unit": "fahrenheit", "days": 3})",
+                /* .id =        */ "",
+            }},
+        },
+        {
+            /* .name =                 */ "response_format with thinking_forced_open = true",
+            /* .tools =                */ {},
+            /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+            /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+            /* .json_schema =          */ {
+                {"type", "object"},
+                {"properties", {
+                    {"invoice_number", {{"type", "string"}}},
+                    {"amount", {{"type", "number"}}},
+                    {"due_date", {{"type", "string"}}}
+                }},
+                {"required", {"invoice_number", "amount", "due_date"}}
+            },
+            /* .parallel_tool_calls =  */ false,
+            /* .thinking_forced_open = */ true,
+            /* .input =                */ (
+                "I must produce the invoice in the requested format</think>\n"
+                R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"
+            ),
+            /* .expect_reasoning =     */ "I must produce the invoice in the requested format",
+            /* .expect_content =       */ R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})",
+            /* .expect_tool_calls =    */ {},
+        },
+    };
+
+    for (const auto & tc : test_cases) {
+        t.test(tc.name, [&](testing & t) {
+            auto parser = build_parser(tc);
+            auto lazy = !tc.tools.empty() && tc.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+            auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+                for (auto const & def : tc.tools) {
+                    auto function = def.at("function");
+                    auto parameters = function.at("parameters");
+                    builder.resolve_refs(parameters);
+                };
+                parser.build_grammar(builder, lazy);
+            });
+
+            t.log("Grammar:");
+            for (auto const & line : string_split(grammar, "\n")) {
+                t.log(line);
+            }
+
+            common_peg_parse_context ctx(tc.input, false);
+            auto result = parser.parse(ctx);
+
+            t.assert_true("success", result.success());
+
+            common_chat_msg msg;
+            auto mapper = common_chat_peg_native_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            t.assert_equal("content equal", tc.expect_content, msg.content);
+            t.assert_equal("reasoning equal", tc.expect_reasoning, msg.reasoning_content);
+            t.assert_equal("number of tool calls", tc.expect_tool_calls.size(), msg.tool_calls.size());
+            for (auto i = 0u; i < std::min(tc.expect_tool_calls.size(), msg.tool_calls.size()); i++) {
+                t.assert_equal("tool name", tc.expect_tool_calls[i].name, msg.tool_calls[i].name);
+                t.assert_equal("tool args", tc.expect_tool_calls[i].arguments, msg.tool_calls[i].arguments);
+            }
+        });
+    }
+}
+
+static void test_example_qwen3_coder(testing & t) {
+    auto tools = create_tools();
+    auto parser = build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
+        auto content = p.rule("content", p.content(p.until("<tool_call>")));
+
+        std::vector<common_peg_parser> tool_parsers;
+        for (auto const & def : tools) {
+            auto function = def.at("function");
+            std::string name = function.at("name");
+            auto parameters = function.at("parameters");
+            auto properties = parameters.at("properties");
+
+            std::set<std::string> required_properties;
+            if (function.contains("required")) {
+                function.at("required").get_to(required_properties);
+            }
+
+            std::vector<common_peg_parser> arg_parsers;
+            for (const auto & [param_name, param_schema] : properties.items()) {
+                bool is_required = required_properties.find(param_name) != required_properties.end();
+                auto type = param_schema.value("type", "object");
+
+                auto arg = p.tool_arg(p.sequence({
+                    p.tool_arg_open("<parameter=" + p.tool_arg_name(p.literal(param_name)) + ">"),
+                    (type == "string" ?
+                        p.tool_arg_string_value(
+                            p.schema(
+                                p.until_one_of({
+                                    "</parameter>\n<parameter=",
+                                    "</parameter>\n</function>"
+                                }),
+                                "tool-" + name + "-arg-" + param_name + "-schema",
+                                param_schema,
+                                true
+                            )
+                        ) : p.tool_arg_json_value(
+                            p.schema(
+                                p.json(),
+                                "tool-" + name + "-arg-" + param_name + "-schema",
+                                param_schema
+                            )
+                        )
+                    ),
+                    p.tool_arg_close(
+                        "</parameter>\n" +
+                        p.peek(p.literal("<parameter=") | p.literal("</function>"))
+                    )
+                }));
+
+                arg_parsers.push_back(is_required ?
+                    p.rule("tool-" + name + "-arg-" + param_name, arg) :
+                    p.optional(p.rule("tool-" + name + "-arg-" + param_name, arg)));
+            }
+
+            tool_parsers.push_back(p.rule("tool-" + name,
+                p.tool_open("<function=" + p.tool_name(p.literal(name)) + ">")
+                << p.sequence(arg_parsers)
+                << p.tool_close(p.literal("</function>"))
+            ));
+        };
+
+        auto tool_call = p.trigger_rule("tool-call",
+            "<tool_call>"
+            << p.choice(tool_parsers)
+            << "</tool_call>"
+        );
+
+        return content + p.zero_or_more(p.space() + tool_call) + p.end();
+    });
+
+    auto grammar = build_grammar([&](const common_grammar_builder & builder) {
+        for (auto const & def : tools) {
+            auto function = def.at("function");
+            auto parameters = function.at("parameters");
+            builder.resolve_refs(parameters);
+        };
+        parser.build_grammar(builder);
+    });
+
+    t.log("Grammar:");
+    for (auto const & line : string_split(grammar, "\n")) {
+        t.log(line);
+    }
+
+    t.test("incremental parsing", [&](testing &t) {
+        std::string input =
+            "Let me search the knowledge base for cat pictures."
+            "<tool_call>\n"
+            "<function=search_knowledge_base>\n"
+            "<parameter=query>cat pictures</parameter>\n"
+            "<parameter=category>general</parameter>\n"
+            "</function>\n"
+            "</tool_call>";
+
+        std::vector<std::string> tokens = simple_tokenize(input);
+
+        common_chat_msg prev;
+        for (auto it = tokens.begin(); it != tokens.end(); it++) {
+            std::string in = std::accumulate(tokens.begin(), it + 1, std::string());
+
+            common_peg_parse_context ctx(in, it + 1 < tokens.end());
+
+            auto result = parser.parse(ctx);
+            if (!t.assert_equal("not fail", false, result.fail())) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+            }
+
+            common_chat_msg msg;
+            auto mapper = common_chat_peg_constructed_mapper(msg);
+            mapper.from_ast(ctx.ast, result);
+
+            //t.log("Input: " + input);
+            t.log("===========================================");
+            t.log("Iteration " + std::to_string(in.size()));
+            t.log("Reasoning: " + msg.reasoning_content);
+            t.log("Content  : " + msg.content);
+            for (const auto & tc : msg.tool_calls) {
+                t.log("Tool name: " + tc.name);
+                t.log("Tool args: " + tc.arguments);
+            }
+
+            try {
+                // This shouldn't emit any runtime errors
+                auto diffs = common_chat_msg_diff::compute_diffs(prev, msg);
+            } catch(const std::exception & e) {
+                t.log(in.substr(0, result.end) + "[failed->]" + in.substr(result.end));
+                t.assert_true(std::string("failed with ") + e.what(), false);
+            }
+
+            prev = msg;
+        }
+    });
+}
+
+void test_command7_parser_compare(testing & t) {
+    auto parser = build_chat_peg_native_parser([](common_chat_peg_native_builder & p) {
+        auto thinking = p.reasoning_block(
+            "<|START_THINKING|>" << p.reasoning(p.until("<|END_THINKING|>")) << "<|END_THINKING|>");
+
+        auto response = "<|START_RESPONSE|>" << p.content(p.until("<|END_RESPONSE|>")) << "<|END_RESPONSE|>";
+
+        auto tool_call_id = p.atomic("\"tool_call_id\"" << (":" << ("\"" + p.tool_id(p.json_string_content()) + "\"")));
+        auto tool_call_name = p.atomic("\"tool_name\"" << (":" << ("\"" + p.tool_name(p.json_string_content()) + "\"")));
+        auto tool_call_args = "\"parameters\"" << (":" << p.tool_args(p.json()));
+
+        auto tool_call_fields = p.rule("tool-call-fields", tool_call_id | tool_call_name | tool_call_args);
+        auto tool_call = p.rule("tool-call", p.tool(
+            p.tool_open(p.literal("{"))
+            << tool_call_fields
+            << p.zero_or_more( p.literal(",") << tool_call_fields)
+            << p.tool_close(p.literal("}"))
+        ));
+
+        auto tool_calls = p.rule("tool-calls",
+            "<|START_ACTION|>"
+            << ("[" << tool_call << p.zero_or_more(p.literal(",") << tool_call) << "]")
+            << "<|END_ACTION|>");
+
+        return p.optional(thinking) << (tool_calls | response) + p.end();
+    });
+
+    auto test_current = [&](const common_peg_arena & p, const std::string & input, bool is_partial, bool print_results) {
+        common_peg_parse_context ctx(input, is_partial);
+        auto result = p.parse(ctx);
+
+        common_chat_msg msg;
+        auto mapper = common_chat_peg_native_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+
+        if (print_results) {
+            std::cout << "== Parsed (new) ==\n";
+            std::cout << "=== Reasoning ===\n";
+            std::cout << msg.reasoning_content << "\n";
+            std::cout << "\n\n=== Content ===\n";
+            std::cout << msg.content << "\n";
+            std::cout << "\n\n=== Tool Calls ===\n";
+            for (const auto & tc : msg.tool_calls) {
+                std::cout << "id: " << tc.id << "\n";
+                std::cout << "name: " << tc.name << "\n";
+                std::cout << "args: " << tc.arguments << "\n";
+            }
+        }
+    };
+
+    auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) {
+        // Original common_chat_combinator_parser taken from chat.cpp
+        common_chat_msg_parser builder(
+            input,
+            /* .is_partial = */ need_more_input,
+            {
+                /* .format = */ COMMON_CHAT_FORMAT_GENERIC,
+                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+                /* .reasoning_in_content = */ false,
+                /* .thinking_forced_open = */ false,
+            }
+        );
+
+        builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
+
+        static const common_regex start_action_regex("<\\|START_ACTION\\|>");
+        static const common_regex end_action_regex("<\\|END_ACTION\\|>");
+        static const common_regex start_response_regex("<\\|START_RESPONSE\\|>");
+        static const common_regex end_response_regex("<\\|END_RESPONSE\\|>");
+
+        if (auto res = builder.try_find_regex(start_action_regex)) {
+            // If we didn't extract thoughts, prelude includes them.
+            auto tool_calls = builder.consume_json_with_dumped_args({ { "parameters" } });
+            for (const auto & tool_call : tool_calls.value) {
+                std::string name      = tool_call.contains("tool_name") ? tool_call.at("tool_name") : "";
+                std::string id        = tool_call.contains("tool_call_id") ? tool_call.at("tool_call_id") : "";
+                std::string arguments = tool_call.contains("parameters") ? tool_call.at("parameters") : "";
+                if (!builder.add_tool_call(name, id, arguments) || tool_calls.is_partial) {
+                    throw common_chat_msg_partial_exception("incomplete tool call");
+                }
+            }
+            if (tool_calls.is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
+            }
+            builder.consume_regex(end_action_regex);
+        } else if (auto res = builder.try_find_regex(start_response_regex)) {
+            if (!builder.try_find_regex(end_response_regex)) {
+                builder.add_content(builder.consume_rest());
+                throw common_chat_msg_partial_exception(end_response_regex.str());
+            }
+        } else {
+            builder.add_content(builder.consume_rest());
+        }
+
+        if (print_results) {
+            std::cout << "== Parsed (legacy) ==\n";
+            std::cout << "=== Reasoning ===\n";
+            std::cout << builder.result().reasoning_content << "\n";
+            std::cout << "\n\n=== Content ===\n";
+            std::cout << builder.result().content << "\n";
+            std::cout << "\n\n=== Tool Calls ===\n";
+            for (const auto & tc : builder.result().tool_calls) {
+                std::cout << "id: " << tc.id << "\n";
+                std::cout << "name: " << tc.name << "\n";
+                std::cout << "args: " << tc.arguments << "\n";
+            }
+        }
+    };
+
+    std::string reasoning = "To plan an effective trip to Japan that includes both historical sites and modern attractions within a "
+            "budget of $4000 for a two-week stay, we need to:\n\n"
+            "1. Identify key historical sites and modern attractions in Japan.\n"
+            "2. Find affordable accommodation options that provide a balance between comfort and cost.\n"
+            "3. Determine the best modes of transportation for getting around Japan.\n"
+            "4. Create a day-by-day itinerary that ensures the user gets to see a variety of attractions without "
+            "overspending.\n"
+            "5. Provide a detailed cost breakdown that includes accommodation, transportation, meals, and entry fees "
+            "to attractions.";
+
+    std::vector<std::tuple<std::string, std::string, nlohmann::json>> tool_calls = {{
+        "call_0",
+        "plan_trip",
+        nlohmann::json::parse(R"({
+            "destination": "Japan",
+            "duration": 14,
+            "budget": 4000,
+            "interests": ["historical sites", "modern attractions"],
+            "accommodation_preferences": "affordable",
+            "transportation_preferences": "efficient",
+            "meal_preferences": "local cuisine"
+        })")
+    }};
+
+    std::vector<std::string> tokens;
+
+    // Build tokens
+    if (!reasoning.empty()) {
+        auto tokenized = simple_tokenize(reasoning);
+        tokens.emplace_back("<|START_THINKING|>");
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+        tokens.emplace_back("<|END_THINKING|>");
+    }
+
+    if (!tool_calls.empty()) {
+        tokens.emplace_back("<|START_ACTION|>");
+
+        auto json = nlohmann::json::array();
+        for (const auto & tc : tool_calls) {
+            auto tc_json = nlohmann::json::object();
+            tc_json["tool_call_id"] = std::get<0>(tc);
+            tc_json["tool_name"] = std::get<1>(tc);
+            tc_json["parameters"] = std::get<2>(tc);
+            json.push_back(tc_json);
+        }
+
+        auto tokenized = simple_tokenize(json.dump(-1, ' ', true));
+        tokens.insert(tokens.end(), tokenized.begin(), tokenized.end());
+
+        tokens.emplace_back("<|END_ACTION|>");
+    }
+
+    std::string input = std::accumulate(tokens.begin(), tokens.end(), std::string());
+
+    // Run tests
+    t.test("legacy_parse", [&](testing & /* t */) {
+        test_legacy(input, false, false);
+    });
+
+    t.test("current_parse", [&](testing & /* t */) {
+        test_current(parser, input, false, false);
+    });
+
+    // Run benchmarks
+    t.bench("legacy_parse_benchmark complete", [&]() {
+        test_legacy(input, false, false);
+    });
+
+    t.bench("legacy_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+
+            try {
+                test_legacy(in, i + 1 < tokens.size(), false);
+            } catch (common_chat_msg_partial_exception & /* e */) {
+                // Do nothing, this is expected
+            }
+        }
+    }, 20);
+
+    t.bench("current_parse_benchmark complete", [&]() {
+        test_current(parser, input, false, false);
+    }, 100);
+
+    t.bench("current_parse_benchmark incremental", [&]() {
+        std::string in;
+        for (auto i = 0u; i < tokens.size(); i++) {
+            in += tokens[i];
+            test_current(parser, in, i + 1 < tokens.size(), false);
+        }
+    }, 20);
+}
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@ -1375,7 +1375,7 @@ int main() {
        try {
            tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema), true));
            tc.verify_status(SUCCESS);
-        } catch (const std::runtime_error & ex) {
+        } catch (const std::invalid_argument & ex) {
            fprintf(stderr, "Error: %s\n", ex.what());
            tc.verify_status(FAILURE);
        }
--- a/tests/test-peg-parser.cpp
+++ b/tests/test-peg-parser.cpp
@ -0,0 +1,25 @@
+#include <cstdlib>
+#include <string>
+#include <iostream>
+
+#include "peg-parser/tests.h"
+
+int main(int argc, char *argv[]) {
+    testing t(std::cout);
+    if (argc >= 2) {
+        t.set_filter(argv[1]);
+    }
+
+    const char * verbose = getenv("LLAMA_TEST_VERBOSE");
+    if (verbose) {
+        t.verbose = std::string(verbose) == "1";
+    }
+
+    t.test("basic", test_basic);
+    t.test("unicode", test_unicode);
+    t.test("json", test_json_parser);
+    t.test("gbnf", test_gbnf_generation);
+    t.test("serialization", test_json_serialization);
+
+    return t.summary();
+}
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -428,6 +428,7 @@ struct clip_ctx {
    int max_nodes = 8192;
    ggml_backend_sched_ptr sched;
    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
+    bool is_allocated = false;

    // for debugging
    bool debug_graph = false;
@ -3305,12 +3306,30 @@ struct clip_model_loader {
    };

    static void warmup(clip_ctx & ctx_clip) {
+        // create a fake batch
+        const auto & hparams = ctx_clip.model.hparams;
+        clip_image_f32_batch batch;
+        clip_image_f32_ptr img(clip_image_f32_init());
+        if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
+            img->nx = hparams.warmup_image_size;
+            img->ny = hparams.warmup_image_size;
+            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
+        } else {
+            img->nx = hparams.warmup_audio_size;
+            img->ny = hparams.n_mel_bins;
+            LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
+        }
+        batch.entries.push_back(std::move(img));
+        warmup(ctx_clip, batch);
+    }
+
+    static void warmup(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
        support_info_graph info;

        if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
            // try to enable flash attention to see if it's supported
            ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
-            info = alloc_compute_meta(ctx_clip);
+            info = alloc_compute_meta(ctx_clip, batch);
            if (!info.fattn && info.fattn_op) {
                auto op = info.fattn_op;
                LOG_WRN("%s: *****************************************************************\n", __func__);
@ -3329,15 +3348,17 @@ struct clip_model_loader {
                LOG_WRN("%s: please report this on github as an issue\n", __func__);
                LOG_WRN("%s: *****************************************************************\n", __func__);
                ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
-                alloc_compute_meta(ctx_clip);
+                alloc_compute_meta(ctx_clip, batch);
            }
        } else {
-            info = alloc_compute_meta(ctx_clip);
+            info = alloc_compute_meta(ctx_clip, batch);
            if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
                LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
            }
        }

+        ctx_clip.is_allocated = true; // mark buffers as allocated
+
        LOG_INF("%s: flash attention is %s\n", __func__,
            (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) ? "enabled" : "disabled");

@ -3369,24 +3390,9 @@ struct clip_model_loader {
        }
    }

-    static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip) {
-        const auto & hparams = ctx_clip.model.hparams;
+    static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
        ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());

-        // create a fake batch
-        clip_image_f32_batch batch;
-        clip_image_f32_ptr img(clip_image_f32_init());
-        if (ctx_clip.model.modality == CLIP_MODALITY_VISION) {
-            img->nx = hparams.warmup_image_size;
-            img->ny = hparams.warmup_image_size;
-            LOG_INF("%s: warmup with image size = %d x %d\n", __func__, img->nx, img->ny);
-        } else {
-            img->nx = hparams.warmup_audio_size;
-            img->ny = hparams.n_mel_bins;
-            LOG_INF("%s: warmup with audio size = %d\n", __func__, img->nx);
-        }
-        batch.entries.push_back(std::move(img));
-
        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);

@ -4630,6 +4636,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        return false; // only support batch size of 1
    }

+    // if buffers are not allocated, we need to do a warmup run to allocate them
+    if (!ctx->is_allocated) {
+        clip_model_loader::warmup(*ctx, *imgs_c_ptr);
+    }
+
    // build the inference graph
    ctx->debug_print_tensors.clear();
    ggml_backend_sched_reset(ctx->sched.get());
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@ -2,11 +2,6 @@ set(TARGET llama-server)

 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})

-if (MINGW)
-    # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 if (NOT LLAMA_HTTPLIB)
    message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
 endif()
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -11,6 +11,7 @@

 #include <random>
 #include <sstream>
+#include <fstream>

 json format_error_response(const std::string & message, const enum error_type type) {
    std::string type_str;
@ -774,6 +775,65 @@ json oaicompat_completion_params_parse(const json & body) {
    return llama_params;
 }

+// media_path always end with '/', see arg.cpp
+static void handle_media(
+        std::vector<raw_buffer> & out_files,
+        json & media_obj,
+        const std::string & media_path) {
+    std::string url = json_value(media_obj, "url", std::string());
+    if (string_starts_with(url, "http")) {
+        // download remote image
+        // TODO @ngxson : maybe make these params configurable
+        common_remote_params params;
+        params.headers.push_back("User-Agent: llama.cpp/" + build_info);
+        params.max_size = 1024 * 1024 * 10; // 10MB
+        params.timeout  = 10; // seconds
+        SRV_INF("downloading image from '%s'\n", url.c_str());
+        auto res = common_remote_get_content(url, params);
+        if (200 <= res.first && res.first < 300) {
+            SRV_INF("downloaded %zu bytes\n", res.second.size());
+            raw_buffer data;
+            data.insert(data.end(), res.second.begin(), res.second.end());
+            out_files.push_back(data);
+        } else {
+            throw std::runtime_error("Failed to download image");
+        }
+
+    } else if (string_starts_with(url, "file://")) {
+        if (media_path.empty()) {
+            throw std::invalid_argument("file:// URLs are not allowed unless --media-path is specified");
+        }
+        // load local image file
+        std::string file_path = url.substr(7); // remove "file://"
+        raw_buffer data;
+        if (!fs_validate_filename(file_path, true)) {
+            throw std::invalid_argument("file path is not allowed: " + file_path);
+        }
+        SRV_INF("loading image from local file '%s'\n", (media_path + file_path).c_str());
+        std::ifstream file(media_path + file_path, std::ios::binary);
+        if (!file) {
+            throw std::invalid_argument("file does not exist or cannot be opened: " + file_path);
+        }
+        data.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+        out_files.push_back(data);
+
+    } else {
+        // try to decode base64 image
+        std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
+        if (parts.size() != 2) {
+            throw std::runtime_error("Invalid url value");
+        } else if (!string_starts_with(parts[0], "data:image/")) {
+            throw std::runtime_error("Invalid url format: " + parts[0]);
+        } else if (!string_ends_with(parts[0], "base64")) {
+            throw std::runtime_error("url must be base64 encoded");
+        } else {
+            auto base64_data = parts[1];
+            auto decoded_data = base64_decode(base64_data);
+            out_files.push_back(decoded_data);
+        }
+    }
+}
+
 // used by /chat/completions endpoint
 json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
@ -819,26 +879,26 @@ json oaicompat_chat_params_parse(
            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
            json_schema = json_value(schema_wrapper, "schema", json::object());
        } else if (!response_type.empty() && response_type != "text") {
-            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+            throw std::invalid_argument("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
        }
    }

    // get input files
    if (!body.contains("messages")) {
-        throw std::runtime_error("'messages' is required");
+        throw std::invalid_argument("'messages' is required");
    }
    json & messages = body.at("messages");
    if (!messages.is_array()) {
-        throw std::runtime_error("Expected 'messages' to be an array");
+        throw std::invalid_argument("Expected 'messages' to be an array");
    }
    for (auto & msg : messages) {
        std::string role = json_value(msg, "role", std::string());
        if (role != "assistant" && !msg.contains("content")) {
-            throw std::runtime_error("All non-assistant messages must contain 'content'");
+            throw std::invalid_argument("All non-assistant messages must contain 'content'");
        }
        if (role == "assistant") {
            if (!msg.contains("content") && !msg.contains("tool_calls")) {
-                throw std::runtime_error("Assistant message must contain either 'content' or 'tool_calls'!");
+                throw std::invalid_argument("Assistant message must contain either 'content' or 'tool_calls'!");
            }
            if (!msg.contains("content")) {
                continue; // avoid errors with no content
@ -850,7 +910,7 @@ json oaicompat_chat_params_parse(
        }

        if (!content.is_array()) {
-            throw std::runtime_error("Expected 'content' to be a string or an array");
+            throw std::invalid_argument("Expected 'content' to be a string or an array");
        }

        for (auto & p : content) {
@ -861,40 +921,7 @@ json oaicompat_chat_params_parse(
                }

                json image_url = json_value(p, "image_url", json::object());
-                std::string url = json_value(image_url, "url", std::string());
-                if (string_starts_with(url, "http")) {
-                    // download remote image
-                    // TODO @ngxson : maybe make these params configurable
-                    common_remote_params params;
-                    params.headers.push_back("User-Agent: llama.cpp/" + build_info);
-                    params.max_size = 1024 * 1024 * 10; // 10MB
-                    params.timeout  = 10; // seconds
-                    SRV_INF("downloading image from '%s'\n", url.c_str());
-                    auto res = common_remote_get_content(url, params);
-                    if (200 <= res.first && res.first < 300) {
-                        SRV_INF("downloaded %ld bytes\n", res.second.size());
-                        raw_buffer data;
-                        data.insert(data.end(), res.second.begin(), res.second.end());
-                        out_files.push_back(data);
-                    } else {
-                        throw std::runtime_error("Failed to download image");
-                    }
-
-                } else {
-                    // try to decode base64 image
-                    std::vector<std::string> parts = string_split<std::string>(url, /*separator*/ ',');
-                    if (parts.size() != 2) {
-                        throw std::runtime_error("Invalid image_url.url value");
-                    } else if (!string_starts_with(parts[0], "data:image/")) {
-                        throw std::runtime_error("Invalid image_url.url format: " + parts[0]);
-                    } else if (!string_ends_with(parts[0], "base64")) {
-                        throw std::runtime_error("image_url.url must be base64 encoded");
-                    } else {
-                        auto base64_data = parts[1];
-                        auto decoded_data = base64_decode(base64_data);
-                        out_files.push_back(decoded_data);
-                    }
-                }
+                handle_media(out_files, image_url, opt.media_path);

                // replace this chunk with a marker
                p["type"] = "text";
@ -911,18 +938,20 @@ json oaicompat_chat_params_parse(
                std::string format = json_value(input_audio, "format", std::string());
                // while we also support flac, we don't allow it here so we matches the OAI spec
                if (format != "wav" && format != "mp3") {
-                    throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
+                    throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'");
                }
                auto decoded_data = base64_decode(data); // expected to be base64 encoded
                out_files.push_back(decoded_data);

+                // TODO: add audio_url support by reusing handle_media()
+
                // replace this chunk with a marker
                p["type"] = "text";
                p["text"] = mtmd_default_marker();
                p.erase("input_audio");

            } else if (type != "text") {
-                throw std::runtime_error("unsupported content[].type");
+                throw std::invalid_argument("unsupported content[].type");
            }
        }
    }
@ -940,7 +969,7 @@ json oaicompat_chat_params_parse(
    inputs.enable_thinking       = opt.enable_thinking;
    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
        if (body.contains("grammar")) {
-            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+            throw std::invalid_argument("Cannot use custom grammar constraints with tools.");
        }
        llama_params["parse_tool_calls"] = true;
    }
@ -959,7 +988,7 @@ json oaicompat_chat_params_parse(
    } else if (enable_thinking_kwarg == "false") {
        inputs.enable_thinking = false;
    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
-        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+        throw std::invalid_argument("invalid type for \"enable_thinking\" (expected boolean, got string)");
    }

    // if the assistant message appears at the end of list, we do not add end-of-turn token
@ -972,14 +1001,14 @@ json oaicompat_chat_params_parse(

        /* sanity check, max one assistant message at the end of the list */
        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){
-            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
+            throw std::invalid_argument("Cannot have 2 or more assistant messages at the end of the list.");
        }

        /* TODO: test this properly */
        inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;

        if ( inputs.enable_thinking ) {
-            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+            throw std::invalid_argument("Assistant response prefill is incompatible with enable_thinking.");
        }

        inputs.add_generation_prompt = true;
@ -1016,22 +1045,25 @@ json oaicompat_chat_params_parse(
    for (const auto & stop : chat_params.additional_stops) {
        llama_params["stop"].push_back(stop);
    }
+    if (!chat_params.parser.empty()) {
+        llama_params["chat_parser"] = chat_params.parser;
+    }

    // Handle "n" field
    int n_choices = json_value(body, "n", 1);
    if (n_choices != 1) {
-        throw std::runtime_error("Only one completion choice is allowed");
+        throw std::invalid_argument("Only one completion choice is allowed");
    }

    // Handle "logprobs" field
    // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
    if (json_value(body, "logprobs", false)) {
        if (has_tools && stream) {
-            throw std::runtime_error("logprobs is not supported with tools + stream");
+            throw std::invalid_argument("logprobs is not supported with tools + stream");
        }
        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
    } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
-        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+        throw std::invalid_argument("top_logprobs requires logprobs to be set to true");
    }

    // Copy remaining properties to llama_params
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@ -284,6 +284,7 @@ struct oaicompat_parser_options {
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
+    std::string media_path;
 };

 // used by /chat/completions endpoint
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -785,6 +785,7 @@ struct server_context_impl {
            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
            /* enable_thinking       */ enable_thinking,
+            /* media_path            */ params_base.media_path,
        };

        // print sample chat example to make it clear which template is used
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@ -7,6 +7,7 @@
 #include <sheredom/subprocess.h>

 #include <functional>
+#include <algorithm>
 #include <thread>
 #include <mutex>
 #include <condition_variable>
@ -889,6 +890,28 @@ struct pipe_t {
    }
 };

+static std::string to_lower_copy(const std::string & value) {
+    std::string lowered(value.size(), '\0');
+    std::transform(value.begin(), value.end(), lowered.begin(), [](unsigned char c) { return std::tolower(c); });
+    return lowered;
+}
+
+static bool should_strip_proxy_header(const std::string & header_name) {
+    // Headers that get duplicated when router forwards child responses
+    if (header_name == "server" ||
+        header_name == "transfer-encoding" ||
+        header_name == "keep-alive") {
+        return true;
+    }
+
+    // Router injects CORS, child also sends them: duplicate
+    if (header_name.rfind("access-control-", 0) == 0) {
+        return true;
+    }
+
+    return false;
+}
+
 server_http_proxy::server_http_proxy(
        const std::string & method,
        const std::string & host,
@ -925,6 +948,14 @@ server_http_proxy::server_http_proxy(
        msg_t msg;
        msg.status = response.status;
        for (const auto & [key, value] : response.headers) {
+            const auto lowered = to_lower_copy(key);
+            if (should_strip_proxy_header(lowered)) {
+                continue;
+            }
+            if (lowered == "content-type") {
+                msg.content_type = value;
+                continue;
+            }
            msg.headers[key] = value;
        }
        return pipe->write(std::move(msg)); // send headers first
@ -932,7 +963,7 @@ server_http_proxy::server_http_proxy(
    httplib::ContentReceiverWithProgress content_receiver = [pipe](const char * data, size_t data_length, size_t, size_t) {
        // send data chunks
        // returns false if pipe is closed / broken (signal to stop receiving)
-        return pipe->write({{}, 0, std::string(data, data_length)});
+        return pipe->write({{}, 0, std::string(data, data_length), ""});
    };

    // prepare the request to destination server
@ -955,8 +986,8 @@ server_http_proxy::server_http_proxy(
        if (result.error() != httplib::Error::Success) {
            auto err_str = httplib::to_string(result.error());
            SRV_ERR("http client error: %s\n", err_str.c_str());
-            pipe->write({{}, 500, ""}); // header
-            pipe->write({{}, 0, "proxy error: " + err_str}); // body
+            pipe->write({{}, 500, "", ""}); // header
+            pipe->write({{}, 0, "proxy error: " + err_str, ""}); // body
        }
        pipe->close_write(); // signal EOF to reader
        SRV_DBG("%s", "client request thread ended\n");
@ -964,12 +995,17 @@ server_http_proxy::server_http_proxy(
    this->thread.detach();

    // wait for the first chunk (headers)
+    {
        msg_t header;
        if (pipe->read(header, should_stop)) {
            SRV_DBG("%s", "received response headers\n");
            this->status  = header.status;
-        this->headers = header.headers;
+            this->headers = std::move(header.headers);
+            if (!header.content_type.empty()) {
+                this->content_type = std::move(header.content_type);
+            }
        } else {
            SRV_DBG("%s", "no response headers received (request cancelled?)\n");
        }
    }
+}
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@ -170,5 +170,6 @@ private:
        std::map<std::string, std::string> headers;
        int status = 0;
        std::string data;
+        std::string content_type;
    };
 };
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -302,6 +302,9 @@ task_params server_task::params_from_json_cmpl(
        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
        params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
        params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
+        if (data.contains("chat_parser")) {
+            params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get<std::string>());
+        }
    }

    {
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -34,18 +34,26 @@ static inline void signal_handler(int signal) {
 static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) {
    return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr {
        std::string message;
+        error_type error;
        try {
            return func(req);
+        } catch (const std::invalid_argument & e) {
+            // treat invalid_argument as invalid request (400)
+            error = ERROR_TYPE_INVALID_REQUEST;
+            message = e.what();
        } catch (const std::exception & e) {
+            // treat other exceptions as server error (500)
+            error = ERROR_TYPE_SERVER;
            message = e.what();
        } catch (...) {
+            error = ERROR_TYPE_SERVER;
            message = "unknown error";
        }

        auto res = std::make_unique<server_http_res>();
        res->status = 500;
        try {
-            json error_data = format_error_response(message, ERROR_TYPE_SERVER);
+            json error_data = format_error_response(message, error);
            res->status = json_value(error_data, "code", 500);
            res->data = safe_json_to_str({{ "error", error_data }});
            SRV_WRN("got exception: %s\n", res->data.c_str());
--- a/tools/server/tests/unit/test_basic.py
+++ b/tools/server/tests/unit/test_basic.py
@ -65,6 +65,7 @@ def test_server_slots():

 def test_load_split_model():
    global server
+    server.offline = False
    server.model_hf_repo = "ggml-org/models"
    server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
    server.model_alias = "tinyllama-split"
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@ -199,7 +199,7 @@ def test_completion_with_response_format(response_format: dict, n_predicted: int
        choice = res.body["choices"][0]
        assert match_regex(re_content, choice["message"]["content"])
    else:
-        assert res.status_code != 200
+        assert res.status_code == 400
        assert "error" in res.body


--- a/tools/server/tests/unit/test_router.py
+++ b/tools/server/tests/unit/test_router.py
@ -17,7 +17,6 @@ def create_server():
    ]
 )
 def test_router_chat_completion_stream(model: str, success: bool):
-    # TODO: make sure the model is in cache (ie. ServerProcess.load_all()) before starting the router server
    global server
    server.start()
    content = ""
@ -48,3 +47,148 @@ def test_router_chat_completion_stream(model: str, success: bool):
    else:
        assert ex is not None
        assert content == ""
+
+
+def _get_model_status(model_id: str) -> str:
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    for item in res.body.get("data", []):
+        if item.get("id") == model_id or item.get("model") == model_id:
+            return item["status"]["value"]
+    raise AssertionError(f"Model {model_id} not found in /models response")
+
+
+def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
+    deadline = time.time() + timeout
+    last_status = None
+    while time.time() < deadline:
+        last_status = _get_model_status(model_id)
+        if last_status in desired:
+            return last_status
+        time.sleep(1)
+    raise AssertionError(
+        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
+    )
+
+
+def _load_model_and_wait(
+    model_id: str, timeout: int = 60, headers: dict | None = None
+) -> None:
+    load_res = server.make_request(
+        "POST", "/models/load", data={"model": model_id}, headers=headers
+    )
+    assert load_res.status_code == 200
+    assert isinstance(load_res.body, dict)
+    assert load_res.body.get("success") is True
+    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)
+
+
+def test_router_unload_model():
+    global server
+    server.start()
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+
+    _load_model_and_wait(model_id)
+
+    unload_res = server.make_request("POST", "/models/unload", data={"model": model_id})
+    assert unload_res.status_code == 200
+    assert unload_res.body.get("success") is True
+    _wait_for_model_status(model_id, {"unloaded"})
+
+
+def test_router_models_max_evicts_lru():
+    global server
+    server.models_max = 2
+    server.start()
+
+    candidate_models = [
+        "ggml-org/tinygemma3-GGUF:Q8_0",
+        "ggml-org/test-model-stories260K",
+        "ggml-org/test-model-stories260K-infill",
+    ]
+
+    # Load only the first 2 models to fill the cache
+    first, second, third = candidate_models[:3]
+
+    _load_model_and_wait(first, timeout=120)
+    _load_model_and_wait(second, timeout=120)
+
+    # Verify both models are loaded
+    assert _get_model_status(first) == "loaded"
+    assert _get_model_status(second) == "loaded"
+
+    # Load the third model - this should trigger LRU eviction of the first model
+    _load_model_and_wait(third, timeout=120)
+
+    # Verify eviction: third is loaded, first was evicted
+    assert _get_model_status(third) == "loaded"
+    assert _get_model_status(first) == "unloaded"
+
+
+def test_router_no_models_autoload():
+    global server
+    server.no_models_autoload = True
+    server.start()
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+
+    res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert res.status_code == 400
+    assert "error" in res.body
+
+    _load_model_and_wait(model_id)
+
+    success_res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert success_res.status_code == 200
+    assert "error" not in success_res.body
+
+
+def test_router_api_key_required():
+    global server
+    server.api_key = "sk-router-secret"
+    server.start()
+
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+    auth_headers = {"Authorization": f"Bearer {server.api_key}"}
+
+    res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert res.status_code == 401
+    assert res.body.get("error", {}).get("type") == "authentication_error"
+
+    _load_model_and_wait(model_id, headers=auth_headers)
+
+    authed = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        headers=auth_headers,
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert authed.status_code == 200
+    assert "error" not in authed.body
--- a/tools/server/tests/unit/test_security.py
+++ b/tools/server/tests/unit/test_security.py
@ -94,3 +94,34 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str):
    assert res.status_code == 200
    assert cors_header in res.headers
    assert res.headers[cors_header] == cors_header_value
+
+
+@pytest.mark.parametrize(
+    "media_path, image_url, success",
+    [
+        (None,             "file://mtmd/test-1.jpeg",    False), # disabled media path, should fail
+        ("../../../tools", "file://mtmd/test-1.jpeg",    True),
+        ("../../../tools", "file:////mtmd//test-1.jpeg", True),  # should be the same file as above
+        ("../../../tools", "file://mtmd/notfound.jpeg",  False), # non-existent file
+        ("../../../tools", "file://../mtmd/test-1.jpeg", False), # no directory traversal
+    ]
+)
+def test_local_media_file(media_path, image_url, success,):
+    server = ServerPreset.tinygemma3()
+    server.media_path = media_path
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 1,
+        "messages": [
+            {"role": "user", "content": [
+                {"type": "text", "text": "test"},
+                {"type": "image_url", "image_url": {
+                    "url": image_url,
+                }},
+            ]},
+        ],
+    })
+    if success:
+        assert res.status_code == 200
+    else:
+        assert res.status_code == 400
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@ -7,6 +7,7 @@ import subprocess
 import os
 import re
 import json
+from json import JSONDecodeError
 import sys
 import requests
 import time
@ -83,6 +84,9 @@ class ServerProcess:
    pooling: str | None = None
    draft: int | None = None
    api_key: str | None = None
+    models_dir: str | None = None
+    models_max: int | None = None
+    no_models_autoload: bool | None = None
    lora_files: List[str] | None = None
    enable_ctx_shift: int | None = False
    draft_min: int | None = None
@ -95,6 +99,7 @@ class ServerProcess:
    chat_template_file: str | None = None
    server_path: str | None = None
    mmproj_url: str | None = None
+    media_path: str | None = None

    # session variables
    process: subprocess.Popen | None = None
@ -142,6 +147,10 @@ class ServerProcess:
            server_args.extend(["--hf-repo", self.model_hf_repo])
        if self.model_hf_file:
            server_args.extend(["--hf-file", self.model_hf_file])
+        if self.models_dir:
+            server_args.extend(["--models-dir", self.models_dir])
+        if self.models_max is not None:
+            server_args.extend(["--models-max", self.models_max])
        if self.n_batch:
            server_args.extend(["--batch-size", self.n_batch])
        if self.n_ubatch:
@ -203,6 +212,8 @@ class ServerProcess:
            server_args.extend(["--draft-min", self.draft_min])
        if self.no_webui:
            server_args.append("--no-webui")
+        if self.no_models_autoload:
+            server_args.append("--no-models-autoload")
        if self.jinja:
            server_args.append("--jinja")
        else:
@ -217,6 +228,8 @@ class ServerProcess:
            server_args.extend(["--chat-template-file", self.chat_template_file])
        if self.mmproj_url:
            server_args.extend(["--mmproj-url", self.mmproj_url])
+        if self.media_path:
+            server_args.extend(["--media-path", self.media_path])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"tests: starting server with: {' '.join(args)}")
@ -292,7 +305,13 @@ class ServerProcess:
        result = ServerResponse()
        result.headers = dict(response.headers)
        result.status_code = response.status_code
-        result.body = response.json() if parse_body else None
+        if parse_body:
+            try:
+                result.body = response.json()
+            except JSONDecodeError:
+                result.body = response.text
+        else:
+            result.body = None
        print("Response from server", json.dumps(result.body, indent=2))
        return result

@ -431,8 +450,9 @@ class ServerPreset:
    @staticmethod
    def tinyllama2() -> ServerProcess:
        server = ServerProcess()
-        server.model_hf_repo = "ggml-org/models"
-        server.model_hf_file = "tinyllamas/stories260K.gguf"
+        server.offline = True # will be downloaded by load_all()
+        server.model_hf_repo = "ggml-org/test-model-stories260K"
+        server.model_hf_file = None
        server.model_alias = "tinyllama-2"
        server.n_ctx = 512
        server.n_batch = 32
@ -476,8 +496,8 @@ class ServerPreset:
    def tinyllama_infill() -> ServerProcess:
        server = ServerProcess()
        server.offline = True # will be downloaded by load_all()
-        server.model_hf_repo = "ggml-org/models"
-        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
+        server.model_hf_repo = "ggml-org/test-model-stories260K-infill"
+        server.model_hf_file = None
        server.model_alias = "tinyllama-infill"
        server.n_ctx = 2048
        server.n_batch = 1024
@ -534,6 +554,7 @@ class ServerPreset:
    @staticmethod
    def router() -> ServerProcess:
        server = ServerProcess()
+        server.offline = True # will be downloaded by load_all()
        # router server has no models
        server.model_file = None
        server.model_alias = None
--- a/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md
+++ b/tools/server/webui/docs/flows/data-flow-simplified-router-mode.md
@ -15,7 +15,7 @@ sequenceDiagram
    Stores->>DB: load conversations
    Stores->>API: GET /props
    API-->>Stores: {role: "router"}
-    Stores->>API: GET /models
+    Stores->>API: GET /v1/models
    API-->>Stores: models[] with status (loaded/available)
    loop each loaded model
        Stores->>API: GET /props?model=X
@ -28,7 +28,7 @@ sequenceDiagram
    alt model not loaded
        Stores->>API: POST /models/load
        loop poll status
-            Stores->>API: GET /models
+            Stores->>API: GET /v1/models
            API-->>Stores: check if loaded
        end
        Stores->>API: GET /props?model=X
--- a/tools/server/webui/docs/flows/models-flow.md
+++ b/tools/server/webui/docs/flows/models-flow.md
@ -56,7 +56,7 @@ sequenceDiagram
    UI->>modelsStore: fetchRouterModels()
    activate modelsStore
    modelsStore->>ModelsSvc: listRouter()
-    ModelsSvc->>API: GET /models
+    ModelsSvc->>API: GET /v1/models
    API-->>ModelsSvc: ApiRouterModelsListResponse
    Note right of API: {data: [{id, status, path, in_cache}]}
    modelsStore->>modelsStore: routerModels = $state(data)
@ -132,7 +132,7 @@ sequenceDiagram
    loop poll every 500ms (max 60 attempts)
        modelsStore->>modelsStore: fetchRouterModels()
        modelsStore->>ModelsSvc: listRouter()
-        ModelsSvc->>API: GET /models
+        ModelsSvc->>API: GET /v1/models
        API-->>ModelsSvc: models[]
        modelsStore->>modelsStore: getModelStatus(modelId)
        alt status === LOADED
@ -165,7 +165,7 @@ sequenceDiagram
    modelsStore->>modelsStore: pollForModelStatus(modelId, UNLOADED)
    loop poll until unloaded
        modelsStore->>ModelsSvc: listRouter()
-        ModelsSvc->>API: GET /models
+        ModelsSvc->>API: GET /v1/models
    end

    modelsStore->>modelsStore: modelLoadingStates.set(modelId, false)
--- a/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatForm/ChatForm.svelte
@ -64,7 +64,10 @@
 	let fileInputRef: ChatFormFileInputInvisible | undefined = $state(undefined);
 	let isRecording = $state(false);
 	let message = $state('');
-	let pasteLongTextToFileLength = $derived(Number(currentConfig.pasteLongTextToFileLen) || 2500);
+	let pasteLongTextToFileLength = $derived.by(() => {
+		const n = Number(currentConfig.pasteLongTextToFileLen);
+		return Number.isNaN(n) ? 2500 : n;
+	});
 	let previousIsLoading = $state(isLoading);
 	let recordingSupported = $state(false);
 	let textareaRef: ChatFormTextarea | undefined = $state(undefined);
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessages.svelte
@ -1,6 +1,5 @@
 <script lang="ts">
 	import { ChatMessage } from '$lib/components/app';
-	import { DatabaseService } from '$lib/services/database';
 	import { chatStore } from '$lib/stores/chat.svelte';
 	import { conversationsStore, activeConversation } from '$lib/stores/conversations.svelte';
 	import { getMessageSiblings } from '$lib/utils';
@ -19,7 +18,7 @@
 		const conversation = activeConversation();

 		if (conversation) {
-			DatabaseService.getConversationMessages(conversation.id).then((messages) => {
+			conversationsStore.getConversationMessages(conversation.id).then((messages) => {
 				allConversationMessages = messages;
 			});
 		} else {
--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsFields.svelte
@ -7,7 +7,6 @@
 	import { Textarea } from '$lib/components/ui/textarea';
 	import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
 	import { settingsStore } from '$lib/stores/settings.svelte';
-	import { ParameterSyncService } from '$lib/services/parameter-sync';
 	import { ChatSettingsParameterSourceIndicator } from '$lib/components/app';
 	import type { Component } from 'svelte';

@ -22,7 +21,7 @@

 	// Helper function to get parameter source info for syncable parameters
 	function getParameterSourceInfo(key: string) {
-		if (!ParameterSyncService.canSyncParameter(key)) {
+		if (!settingsStore.canSyncParameter(key)) {
 			return null;
 		}

--- a/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSettings/ChatSettingsImportExportTab.svelte
@ -2,9 +2,8 @@
 	import { Download, Upload } from '@lucide/svelte';
 	import { Button } from '$lib/components/ui/button';
 	import { DialogConversationSelection } from '$lib/components/app';
-	import { DatabaseService } from '$lib/services/database';
 	import { createMessageCountMap } from '$lib/utils';
-	import { conversationsStore } from '$lib/stores/conversations.svelte';
+	import { conversationsStore, conversations } from '$lib/stores/conversations.svelte';

 	let exportedConversations = $state<DatabaseConversation[]>([]);
 	let importedConversations = $state<DatabaseConversation[]>([]);
@ -21,15 +20,15 @@

 	async function handleExportClick() {
 		try {
-			const allConversations = await DatabaseService.getAllConversations();
+			const allConversations = conversations();
 			if (allConversations.length === 0) {
 				alert('No conversations to export');
 				return;
 			}

 			const conversationsWithMessages = await Promise.all(
-				allConversations.map(async (conv) => {
-					const messages = await DatabaseService.getConversationMessages(conv.id);
+				allConversations.map(async (conv: DatabaseConversation) => {
+					const messages = await conversationsStore.getConversationMessages(conv.id);
 					return { conv, messages };
 				})
 			);
@ -47,7 +46,7 @@
 		try {
 			const allData: ExportedConversations = await Promise.all(
 				selectedConversations.map(async (conv) => {
-					const messages = await DatabaseService.getConversationMessages(conv.id);
+					const messages = await conversationsStore.getConversationMessages(conv.id);
 					return { conv: $state.snapshot(conv), messages: $state.snapshot(messages) };
 				})
 			);
@ -135,9 +134,7 @@
 				.snapshot(fullImportData)
 				.filter((item) => selectedIds.has(item.conv.id));

-			await DatabaseService.importConversations(selectedData);
-
-			await conversationsStore.loadConversations();
+			await conversationsStore.importConversationsData(selectedData);

 			importedConversations = selectedConversations;
 			showImportSummary = true;
--- a/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte
+++ b/tools/server/webui/src/lib/components/app/dialogs/DialogModelInformation.svelte
@ -3,8 +3,7 @@
 	import * as Table from '$lib/components/ui/table';
 	import { BadgeModality, CopyToClipboardIcon } from '$lib/components/app';
 	import { serverStore } from '$lib/stores/server.svelte';
-	import { modelsStore } from '$lib/stores/models.svelte';
-	import { ChatService } from '$lib/services/chat';
+	import { modelsStore, modelOptions, modelsLoading } from '$lib/stores/models.svelte';
 	import { formatFileSize, formatParameters, formatNumber } from '$lib/utils';

 	interface Props {
@ -16,38 +15,24 @@

 	let serverProps = $derived(serverStore.props);
 	let modelName = $derived(modelsStore.singleModelName);
+	let models = $derived(modelOptions());
+	let isLoadingModels = $derived(modelsLoading());
+
+	// Get the first model for single-model mode display
+	let firstModel = $derived(models[0] ?? null);

 	// Get modalities from modelStore using the model ID from the first model
-	// For now it supports only for single-model mode, will be extended with further improvements for multi-model functioanlities
 	let modalities = $derived.by(() => {
-		if (!modelsData?.data?.[0]?.id) return [];
-
-		return modelsStore.getModelModalitiesArray(modelsData.data[0].id);
+		if (!firstModel?.id) return [];
+		return modelsStore.getModelModalitiesArray(firstModel.id);
 	});

-	let modelsData = $state<ApiModelListResponse | null>(null);
-	let isLoadingModels = $state(false);
-
-	// Fetch models data when dialog opens
+	// Ensure models are fetched when dialog opens
 	$effect(() => {
-		if (open && !modelsData) {
-			loadModelsData();
+		if (open && models.length === 0) {
+			modelsStore.fetch();
 		}
 	});
-
-	async function loadModelsData() {
-		isLoadingModels = true;
-
-		try {
-			modelsData = await ChatService.getModels();
-		} catch (error) {
-			console.error('Failed to load models data:', error);
-			// Set empty data to prevent infinite loading
-			modelsData = { object: 'list', data: [] };
-		} finally {
-			isLoadingModels = false;
-		}
-	}
 </script>

 <Dialog.Root bind:open {onOpenChange}>
@ -70,8 +55,8 @@
 				<div class="flex items-center justify-center py-8">
 					<div class="text-sm text-muted-foreground">Loading model information...</div>
 				</div>
-			{:else if modelsData && modelsData.data.length > 0}
-				{@const modelMeta = modelsData.data[0].meta}
+			{:else if firstModel}
+				{@const modelMeta = firstModel.meta}

 				{#if serverProps}
 					<Table.Root>
--- a/tools/server/webui/src/lib/constants/supported-file-types.ts
+++ b/tools/server/webui/src/lib/constants/supported-file-types.ts
@ -126,8 +126,13 @@ export const TEXT_FILE_TYPES = {
 		mimeTypes: [MimeTypeText.JAVA]
 	},
 	[FileTypeText.CPP]: {
-		extensions: [FileExtensionText.CPP, FileExtensionText.C, FileExtensionText.H],
-		mimeTypes: [MimeTypeText.CPP_SRC, MimeTypeText.C_SRC, MimeTypeText.C_HDR]
+		extensions: [
+			FileExtensionText.CPP,
+			FileExtensionText.C,
+			FileExtensionText.H,
+			FileExtensionText.HPP
+		],
+		mimeTypes: [MimeTypeText.CPP_SRC, MimeTypeText.CPP_HDR, MimeTypeText.C_SRC, MimeTypeText.C_HDR]
 	},
 	[FileTypeText.PHP]: {
 		extensions: [FileExtensionText.PHP],
@ -183,10 +188,30 @@ export const TEXT_FILE_TYPES = {
 	},
 	[FileTypeText.LATEX]: {
 		extensions: [FileExtensionText.TEX],
-		mimeTypes: [MimeTypeText.LATEX]
+		mimeTypes: [MimeTypeText.LATEX, MimeTypeText.TEX, MimeTypeText.TEX_APP]
 	},
 	[FileTypeText.BIBTEX]: {
 		extensions: [FileExtensionText.BIB],
 		mimeTypes: [MimeTypeText.BIBTEX]
+	},
+	[FileTypeText.CUDA]: {
+		extensions: [FileExtensionText.CU, FileExtensionText.CUH],
+		mimeTypes: [MimeTypeText.CUDA]
+	},
+	[FileTypeText.VULKAN]: {
+		extensions: [FileExtensionText.COMP],
+		mimeTypes: [MimeTypeText.PLAIN]
+	},
+	[FileTypeText.HASKELL]: {
+		extensions: [FileExtensionText.HS],
+		mimeTypes: [MimeTypeText.HASKELL]
+	},
+	[FileTypeText.CSHARP]: {
+		extensions: [FileExtensionText.CS],
+		mimeTypes: [MimeTypeText.CSHARP]
+	},
+	[FileTypeText.PROPERTIES]: {
+		extensions: [FileExtensionText.PROPERTIES],
+		mimeTypes: [MimeTypeText.PROPERTIES]
 	}
 } as const;
--- a/tools/server/webui/src/lib/enums/files.ts
+++ b/tools/server/webui/src/lib/enums/files.ts
@ -62,7 +62,12 @@ export enum FileTypeText {
 	VUE = 'vue',
 	SVELTE = 'svelte',
 	LATEX = 'latex',
-	BIBTEX = 'bibtex'
+	BIBTEX = 'bibtex',
+	CUDA = 'cuda',
+	VULKAN = 'vulkan',
+	HASKELL = 'haskell',
+	CSHARP = 'csharp',
+	PROPERTIES = 'properties'
 }

 // File extension enums
@ -121,7 +126,14 @@ export enum FileExtensionText {
 	VUE = '.vue',
 	SVELTE = '.svelte',
 	TEX = '.tex',
-	BIB = '.bib'
+	BIB = '.bib',
+	CU = '.cu',
+	CUH = '.cuh',
+	COMP = '.comp',
+	HPP = '.hpp',
+	HS = '.hs',
+	PROPERTIES = '.properties',
+	CS = '.cs'
 }

 // MIME type enums
@ -165,7 +177,10 @@ export enum MimeTypeText {
 	CSV = 'text/csv',
 	PYTHON = 'text/x-python',
 	JAVA = 'text/x-java-source',
+	CPP_HDR = 'text/x-c++hdr',
 	CPP_SRC = 'text/x-c++src',
+	CSHARP = 'text/x-csharp',
+	HASKELL = 'text/x-haskell',
 	C_SRC = 'text/x-csrc',
 	C_HDR = 'text/x-chdr',
 	PHP = 'text/x-php',
@ -182,6 +197,10 @@ export enum MimeTypeText {
 	DART = 'text/x-dart',
 	VUE = 'text/x-vue',
 	SVELTE = 'text/x-svelte',
-	LATEX = 'text/x-tex',
-	BIBTEX = 'text/x-bibtex'
+	TEX = 'text/x-tex',
+	TEX_APP = 'application/x-tex',
+	LATEX = 'application/x-latex',
+	BIBTEX = 'text/x-bibtex',
+	CUDA = 'text/x-cuda',
+	PROPERTIES = 'text/properties'
 }
--- a/tools/server/webui/src/lib/services/chat.ts
+++ b/tools/server/webui/src/lib/services/chat.ts
@ -680,48 +680,6 @@ export class ChatService {
 	// Utilities
 	// ─────────────────────────────────────────────────────────────────────────────

-	/**
-	 * Get server properties - static method for API compatibility (to be refactored)
-	 */
-	static async getServerProps(): Promise<ApiLlamaCppServerProps> {
-		try {
-			const response = await fetch(`./props`, {
-				headers: getJsonHeaders()
-			});
-
-			if (!response.ok) {
-				throw new Error(`Failed to fetch server props: ${response.status}`);
-			}
-
-			const data = await response.json();
-			return data;
-		} catch (error) {
-			console.error('Error fetching server props:', error);
-			throw error;
-		}
-	}
-
-	/**
-	 * Get model information from /models endpoint (to be refactored)
-	 */
-	static async getModels(): Promise<ApiModelListResponse> {
-		try {
-			const response = await fetch(`./models`, {
-				headers: getJsonHeaders()
-			});
-
-			if (!response.ok) {
-				throw new Error(`Failed to fetch models: ${response.status} ${response.statusText}`);
-			}
-
-			const data = await response.json();
-			return data;
-		} catch (error) {
-			console.error('Error fetching models:', error);
-			throw error;
-		}
-	}
-
 	/**
 	 * Injects a system message at the beginning of the conversation if provided.
 	 * Checks for existing system messages to avoid duplication.
--- a/tools/server/webui/src/lib/services/models.ts
+++ b/tools/server/webui/src/lib/services/models.ts
@ -7,7 +7,7 @@ import { getJsonHeaders } from '$lib/utils';
 *
 * This service handles communication with model-related endpoints:
 * - `/v1/models` - OpenAI-compatible model list (MODEL + ROUTER mode)
- * - `/models` - Router-specific model management (ROUTER mode only)
+ * - `/models/load`, `/models/unload` - Router-specific model management (ROUTER mode only)
 *
 * **Responsibilities:**
 * - List available models
@ -43,7 +43,7 @@ export class ModelsService {
 	 * Returns models with load status, paths, and other metadata
 	 */
 	static async listRouter(): Promise<ApiRouterModelsListResponse> {
-		const response = await fetch(`${base}/models`, {
+		const response = await fetch(`${base}/v1/models`, {
 			headers: getJsonHeaders()
 		});

--- a/tools/server/webui/src/lib/stores/conversations.svelte.ts
+++ b/tools/server/webui/src/lib/stores/conversations.svelte.ts
@ -519,6 +519,19 @@ class ConversationsStore {
 		return await DatabaseService.getConversationMessages(convId);
 	}

+	/**
+	 * Imports conversations from provided data (without file picker)
+	 * @param data - Array of conversation data with messages
+	 * @returns Import result with counts
+	 */
+	async importConversationsData(
+		data: ExportedConversations
+	): Promise<{ imported: number; skipped: number }> {
+		const result = await DatabaseService.importConversations(data);
+		await this.loadConversations();
+		return result;
+	}
+
 	/**
 	 * Adds a message to the active messages array
 	 * Used by chatStore when creating new messages
--- a/tools/server/webui/src/lib/stores/settings.svelte.ts
+++ b/tools/server/webui/src/lib/stores/settings.svelte.ts
@ -370,6 +370,10 @@ class SettingsStore {
 		return { ...this.config };
 	}

+	canSyncParameter(key: string): boolean {
+		return ParameterSyncService.canSyncParameter(key);
+	}
+
 	/**
 	 * Get parameter information including source for a specific parameter
 	 */
--- a/tools/server/webui/src/lib/utils/file-type.ts
+++ b/tools/server/webui/src/lib/utils/file-type.ts
@ -77,6 +77,13 @@ export function getFileTypeCategory(mimeType: string): FileTypeCategory | null {
 		case MimeTypeText.SVELTE:
 		case MimeTypeText.LATEX:
 		case MimeTypeText.BIBTEX:
+		case MimeTypeText.CUDA:
+		case MimeTypeText.CPP_HDR:
+		case MimeTypeText.CSHARP:
+		case MimeTypeText.HASKELL:
+		case MimeTypeText.PROPERTIES:
+		case MimeTypeText.TEX:
+		case MimeTypeText.TEX_APP:
 			return FileTypeCategory.TEXT;

 		default:
@ -144,6 +151,12 @@ export function getFileTypeCategoryByExtension(filename: string): FileTypeCatego
 		case FileExtensionText.SVELTE:
 		case FileExtensionText.TEX:
 		case FileExtensionText.BIB:
+		case FileExtensionText.COMP:
+		case FileExtensionText.CU:
+		case FileExtensionText.CUH:
+		case FileExtensionText.HPP:
+		case FileExtensionText.HS:
+		case FileExtensionText.PROPERTIES:
 			return FileTypeCategory.TEXT;

 		default:
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@ -144,4 +144,7 @@ if (CPPHTTPLIB_OPENSSL_SUPPORT)
        find_library(SECURITY_FRAMEWORK Security REQUIRED)
        target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
    endif()
+    if (WIN32 AND NOT MSVC)
+        target_link_libraries(${TARGET} PUBLIC crypt32)
+    endif()
 endif()