diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 1904e31fdc..e1bd08ddd2 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -86,6 +86,7 @@ body: description: > If applicable, please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. + If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well. render: shell validations: required: false diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index af4c60be64..de3ad06065 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -70,6 +70,7 @@ jobs: with: key: macOS-latest-cmake-arm64 evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -106,6 +107,7 @@ jobs: with: key: macOS-latest-cmake-x64 evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -142,6 +144,7 @@ jobs: with: key: macOS-latest-cmake-arm64-webgpu evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dawn Dependency id: dawn-depends @@ -195,6 +198,7 @@ jobs: with: key: ubuntu-cpu-cmake-${{ matrix.build }} evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build Dependencies id: build_depends @@ -276,6 +280,7 @@ jobs: with: key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }} evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -396,6 +401,7 @@ jobs: with: key: ubuntu-24-cmake-vulkan-deb evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -431,6 +437,7 @@ jobs: with: key: ubuntu-24-cmake-vulkan evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -490,6 +497,7 @@ jobs: with: key: ubuntu-24-cmake-webgpu evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -562,6 +570,7 @@ jobs: with: key: ubuntu-latest-wasm-webgpu evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Install Emscripten run: | @@ -609,6 +618,7 @@ jobs: with: key: ubuntu-22-cmake-hip evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build with native CMake HIP support id: cmake_build @@ -641,6 +651,7 @@ jobs: with: key: ubuntu-22-cmake-musa evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build with native CMake MUSA support id: cmake_build @@ -688,6 +699,7 @@ jobs: with: key: ubuntu-22-cmake-sycl evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -738,6 +750,7 @@ jobs: with: key: ubuntu-22-cmake-sycl-fp16 evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -771,6 +784,7 @@ jobs: with: key: macOS-latest-cmake-ios evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 
'refs/heads/master' }} - name: Build id: cmake_build @@ -802,6 +816,7 @@ jobs: with: key: macOS-latest-cmake-tvos evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -863,6 +878,7 @@ jobs: with: key: macOS-latest-swift evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Download xcframework artifact uses: actions/download-artifact@v4 @@ -905,6 +921,7 @@ jobs: key: windows-msys2 variant: ccache evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Setup ${{ matrix.sys }} uses: msys2/setup-msys2@v2 @@ -973,6 +990,7 @@ jobs: key: windows-latest-cmake-${{ matrix.build }} variant: ccache evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Download OpenBLAS id: get_openblas @@ -1077,6 +1095,7 @@ jobs: with: key: ubuntu-latest-cmake-cuda evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build with CMake run: | @@ -1109,6 +1128,7 @@ jobs: key: windows-cuda-${{ matrix.cuda }} variant: ccache evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Install Cuda Toolkit uses: ./.github/actions/windows-setup-cuda @@ -1160,6 +1180,7 @@ jobs: key: windows-latest-cmake-sycl variant: ccache evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Install run: | @@ -1221,6 +1242,7 @@ jobs: with: key: ${{ github.job }} evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Build id: cmake_build @@ -1466,6 +1488,7 @@ jobs: with: key: ggml-ci-x64-cpu-low-perf evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -1491,6 +1514,7 @@ jobs: with: key: ggml-ci-arm64-cpu-low-perf evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -1516,6 +1540,7 @@ jobs: with: key: ggml-ci-x64-cpu-high-perf evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -1541,6 +1566,7 @@ jobs: with: key: ggml-ci-arm64-cpu-high-perf evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -1566,6 +1592,7 @@ jobs: with: key: ggml-ci-arm64-cpu-high-perf-sve evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -1701,6 +1728,7 @@ jobs: with: key: ggml-ci-arm64-cpu-kleidiai evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Dependencies id: depends @@ -2084,6 +2112,7 @@ jobs: with: key: ggml-ci-arm64-graviton4-kleidiai evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - name: Test id: ggml-ci diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 446cae9f84..11f850511f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -66,16 +66,9 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* tar -czvf llama-${{ steps.tag.outputs.name 
}}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - name: Upload artifacts (zip) - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip - name: llama-bin-macos-arm64.zip - - - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz @@ -127,16 +120,9 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - name: Upload artifacts (zip) - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip - name: llama-bin-macos-x64.zip - - - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz @@ -196,16 +182,9 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - name: Upload artifacts (zip) - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip - name: llama-bin-ubuntu-${{ matrix.build }}.zip - - - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz @@ -256,16 +235,9 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/* tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - - name: Upload artifacts (zip) - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip - name: llama-bin-ubuntu-vulkan-x64.zip - - - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz @@ -716,16 +688,9 @@ jobs: - name: Pack artifacts id: pack_artifacts run: | - zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework - - name: Upload artifacts (zip) - uses: actions/upload-artifact@v4 - with: - path: llama-${{ steps.tag.outputs.name }}-xcframework.zip - name: llama-${{ steps.tag.outputs.name }}-xcframework.zip - - - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz @@ -797,7 +762,7 @@ jobs: cp LICENSE ./build/bin/ tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . 
- - name: Upload artifacts (tar) + - name: Upload artifacts uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.tar.gz @@ -889,9 +854,6 @@ jobs: with: tag_name: ${{ steps.tag.outputs.name }} body: | - > [!WARNING] - > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts. -
${{ github.event.head_commit.message }} @@ -911,8 +873,8 @@ jobs: **Windows:** - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip) - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip) - - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) + - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip) + - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip) - [CUDA 13.1 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.1-x64.zip) - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip) - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index f8a261eefa..544c4ad408 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -31,9 +31,10 @@ concurrency: cancel-in-progress: true jobs: - webui-setup: - name: WebUI Setup + webui-check: + name: WebUI Checks runs-on: ubuntu-latest + continue-on-error: true steps: - name: Checkout code uses: actions/checkout@v4 @@ -42,137 +43,66 @@ jobs: ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - name: Setup Node.js + id: node uses: actions/setup-node@v4 with: node-version: "22" cache: "npm" cache-dependency-path: "tools/server/webui/package-lock.json" - - name: Cache node_modules - uses: actions/cache@v4 - id: cache-node-modules - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Install dependencies - if: steps.cache-node-modules.outputs.cache-hit != 'true' + id: setup + if: ${{ steps.node.conclusion == 'success' }} run: npm ci working-directory: tools/server/webui - webui-check: - needs: webui-setup - name: WebUI Check - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore 
node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Run type checking + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check working-directory: tools/server/webui - name: Run linting + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run lint working-directory: tools/server/webui - webui-build: - needs: webui-check - name: WebUI Build - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Build application + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build working-directory: tools/server/webui - webui-tests: - needs: webui-build - name: Run WebUI tests - permissions: - contents: read - - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Install Playwright browsers + id: playwright + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps working-directory: tools/server/webui - name: Build Storybook + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run build-storybook working-directory: tools/server/webui - name: Run Client tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:client working-directory: tools/server/webui - - name: Run Server tests - run: npm run test:server + - name: Run Unit tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} + run: npm run test:unit working-directory: tools/server/webui - name: Run UI tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 working-directory: tools/server/webui - name: Run E2E tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/server/webui server-build: - needs: [webui-tests] runs-on: ubuntu-latest strategy: diff --git a/.github/copilot-instructions.md b/AGENTS.md similarity index 88% rename from .github/copilot-instructions.md rename to AGENTS.md index ad13c6ea8d..e0a65c0ab7 100644 --- a/.github/copilot-instructions.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ -# Copilot Instructions for llama.cpp +# Instructions for llama.cpp ## Repository Overview @@ -12,6 +12,24 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model - **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA - **License**: MIT +## Disclose AI Usage + +It is crucial to remind contributors that the 
project mandates disclosure of any AI usage in pull requests. This requirement stems from the potential for AI-generated code to include suboptimal optimizations and hidden bugs, owing to the inherent overconfidence in AI outputs. + +When generating significant portions of code, address this by: +- Informing the user that AI-generated content may be rejected by maintainers. +- Clearly marking AI-generated code in commit messages and comments. + - Example of commit message: `[AI] Fix a race condition in ...` + - Example of code comment: `// [AI] spawn a new thread ...` + +These measures apply to: +- Changes resulting in large portions of code or complex logic. +- Modifications or additions to public APIs in `llama.h`, `ggml.h`, or `mtmd.h`. +- Backend-related changes, such as those involving CPU, CUDA, Metal, Vulkan, etc. +- Modifications to `tools/server`. + +Note: These measures can be omitted for small fixes or trivial changes. + ## Build Instructions ### Prerequisites @@ -251,6 +269,7 @@ Primary tools: - **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible - **Performance focus**: This is a performance-critical inference library - **API stability**: Changes to `include/llama.h` require careful consideration +- **Disclose AI Usage**: Refer to the "Disclose AI Usage" earlier in this document ### Git Workflow - Always create feature branches from `master` diff --git a/CODEOWNERS b/CODEOWNERS index 8a0c98c968..750096d9a1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -32,7 +32,7 @@ /examples/export-docs/ @ggerganov /examples/gen-docs/ @ggerganov /examples/gguf/ @ggerganov -/examples/llama.android/ @ggerganov +/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren /examples/llama.swiftui/ @ggerganov /examples/llama.vim @ggerganov /examples/lookahead/ @ggerganov diff --git a/README.md b/README.md index 5f2076d0a3..ed956bb02e 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi) - Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma) +- Android: [llama.android](/examples/llama.android)
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 0182767c2b..f7b99159e3 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -85,6 +85,9 @@ add_library(${TARGET} STATIC unicode.h ) +target_include_directories(${TARGET} PUBLIC . ../vendor) +target_compile_features (${TARGET} PUBLIC cxx_std_17) + if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() @@ -151,9 +154,7 @@ if (LLAMA_LLGUIDANCE) set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS}) endif () -target_include_directories(${TARGET} PUBLIC . ../vendor) -target_compile_features (${TARGET} PUBLIC cxx_std_17) -target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) # diff --git a/common/arg.cpp b/common/arg.cpp index f2aec895ba..1302065498 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() { return *this; } +common_arg & common_arg::set_preset_only() { + is_preset_only = true; + return *this; +} + bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } @@ -420,6 +425,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } }; + std::set seen_args; + for (int i = 1; i < argc; i++) { const std::string arg_prefix = "--"; @@ -430,6 +437,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } + if (!seen_args.insert(arg).second) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; bool is_positive = tmp.second; @@ -750,6 +760,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map seen_args; + for (int i = 1; i < argc; i++) { const std::string arg_prefix = "--"; @@ -760,8 +772,16 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map(value, ',')) { + std::ifstream file(item); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str())); + } + params.in_files.push_back(item); } - params.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( @@ -1401,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", + {"--sampler-seq", "--sampling-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); @@ -1969,9 +1993,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( {"--image", "--audio"}, "FILE", - "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", + "path to an image or audio file. 
use with multimodal models, use comma-separated values for multiple files\n", [](common_params & params, const std::string & value) { - params.image.emplace_back(value); + for (const auto & item : string_split(value, ',')) { + params.image.emplace_back(item); + } } ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -2057,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( - {"--override-tensor", "-ot"}, "=,...", + {"-ot", "--override-tensor"}, "=,...", "override tensor buffer type", [](common_params & params, const std::string & value) { parse_tensor_buffer_overrides(value, params.tensor_buft_overrides); } )); add_opt(common_arg( - {"--override-tensor-draft", "-otd"}, "=,...", + {"-otd", "--override-tensor-draft"}, "=,...", "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--cpu-moe", "-cmoe"}, + {"-cmoe", "--cpu-moe"}, "keep all Mixture of Experts (MoE) weights in the CPU", [](common_params & params) { params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_env("LLAMA_ARG_CPU_MOE")); add_opt(common_arg( - {"--n-cpu-moe", "-ncmoe"}, "N", + {"-ncmoe", "--n-cpu-moe"}, "N", "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU", [](common_params & params, int value) { if (value < 0) { @@ -2091,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_N_CPU_MOE")); add_opt(common_arg( - {"--cpu-moe-draft", "-cmoed"}, + {"-cmoed", "--cpu-moe-draft"}, "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", [](common_params & params) { params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( - {"--n-cpu-moe-draft", "-ncmoed"}, "N", + {"-ncmoed", "--n-cpu-moe-draft"}, "N", "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model", [](common_params & params, int value) { if (value < 0) { @@ -2218,12 +2244,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + {"--override-kv"}, "KEY=TYPE:VALUE,...", + "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false", [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); + std::vector kv_overrides; + + std::string current; + bool escaping = false; + + for (const char c : value) { + if (escaping) { + current.push_back(c); + escaping = false; + } else if (c == '\\') { + escaping = true; + } else if (c == ',') { + kv_overrides.push_back(current); + current.clear(); + } else { + current.push_back(c); + } + } + + if (escaping) { + current.push_back('\\'); + } + + kv_overrides.push_back(current); + + for (const auto & kv_override : kv_overrides) { + if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) { + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str())); + } } } )); @@ -2237,33 +2290,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", + "path to LoRA adapter (use comma-separated values to load multiple adapters)", [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr }); + for (const auto & item : string_split(value, ',')) { + params.lora_adapters.push_back({ item, 1.0, "", "", nullptr }); + } } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr }); + {"--lora-scaled"}, "FNAME:SCALE,...", + "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n" + "note: use comma-separated values", + [](common_params & params, const std::string & value) { + for (const auto & item : string_split(value, ',')) { + auto parts = string_split(item, ':'); + if (parts.size() != 2) { + throw std::invalid_argument("lora-scaled format: FNAME:SCALE"); + } + params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr }); + } } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + "add a control vector\nnote: use comma-separated values to add multiple control vectors", [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); + for (const auto & item : string_split(value, ',')) { + params.control_vectors.push_back({ 1.0f, item, }); + } } )); add_opt(common_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", + {"--control-vector-scaled"}, "FNAME:SCALE,...", "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - 
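
For readers skimming the hunk above: the new `--override-kv` handler splits its argument on unescaped commas, with `\` escaping the next character, before handing each piece to `string_parse_kv_override`. A self-contained sketch of that splitting behaviour (the helper name is invented for this example; in the patch the loop lives inline in the option's lambda):

```cpp
// Illustrative stand-alone version of the escape-aware comma splitter above.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_unescaped_commas(const std::string & value) {
    std::vector<std::string> out;
    std::string current;
    bool escaping = false;

    for (const char c : value) {
        if (escaping) {
            current.push_back(c);   // the escaped character is kept verbatim
            escaping = false;
        } else if (c == '\\') {
            escaping = true;        // backslash escapes the next character
        } else if (c == ',') {
            out.push_back(current); // an unescaped comma ends the current item
            current.clear();
        } else {
            current.push_back(c);
        }
    }
    if (escaping) {
        current.push_back('\\');    // a trailing backslash is kept literally
    }
    out.push_back(current);
    return out;
}

int main() {
    // two overrides; the second value contains an escaped comma
    const std::string input = "tokenizer.ggml.add_bos_token=bool:false,general.name=str:foo\\,bar";
    for (const auto & item : split_unescaped_commas(input)) {
        std::printf("%s\n", item.c_str());
    }
    // prints:
    //   tokenizer.ggml.add_bos_token=bool:false
    //   general.name=str:foo,bar
    return 0;
}
```
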
[](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); + "note: use comma-separated values (format: FNAME:SCALE,...)", + [](common_params & params, const std::string & value) { + for (const auto & item : string_split(value, ',')) { + auto parts = string_split(item, ':'); + if (parts.size() != 2) { + throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE"); + } + params.control_vectors.push_back({ std::stof(parts[1]), parts[0] }); + } } )); add_opt(common_arg( @@ -2353,13 +2423,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("HF_TOKEN")); add_opt(common_arg( {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", + "file to load context from (use comma-separated values to specify multiple files)", [](common_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + for (const auto & item : string_split(value, ',')) { + std::ifstream file(item, std::ios::binary); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str())); + } + params.context_files.push_back(item); } - params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( @@ -2550,6 +2622,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + add_opt(common_arg( + {"--webui-config"}, "JSON", + "JSON that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + add_opt(common_arg( + {"--webui-config-file"}, "PATH", + "JSON file that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = read_file(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); add_opt(common_arg( {"--webui"}, {"--no-webui"}, @@ -2566,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( - {"--reranking", "--rerank"}, + {"--rerank", "--reranking"}, string_format("enable reranking endpoint on server (default: %s)", "disabled"), [](common_params & params) { params.embedding = true; @@ -2801,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--sleep-idle-seconds"}, "SECONDS", + string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds), + [](common_params & params, int value) { + if (value == 0 || value < -1) { + throw std::invalid_argument("invalid value: cannot be 0 or less than -1"); + } + params.sleep_idle_seconds = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", @@ -3037,7 +3133,7 @@ common_params_context 
common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( - {"--draft-max", "--draft", "--draft-n"}, "N", + {"--draft", "--draft-n", "--draft-max"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), [](common_params & params, int value) { params.speculative.n_max = value; @@ -3413,3 +3509,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex return ctx_arg; } + +void common_params_add_preset_options(std::vector & args) { + // arguments below won't be treated as CLI args, only preset options + args.push_back(common_arg( + {"load-on-startup"}, "NAME", + "in server router mode, autoload this model on startup", + [](common_params &, const std::string &) { /* unused */ } + ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only()); + + // args.push_back(common_arg( + // {"pin"}, + // "in server router mode, do not unload this model if models_max is exceeded", + // [](common_params &) { /* unused */ } + // ).set_preset_only()); + + // args.push_back(common_arg( + // {"unload-idle-seconds"}, "SECONDS", + // "in server router mode, unload models idle for more than this many seconds", + // [](common_params &, int) { /* unused */ } + // ).set_preset_only()); +} diff --git a/common/arg.h b/common/arg.h index 1321595c1a..f5111c658f 100644 --- a/common/arg.h +++ b/common/arg.h @@ -8,6 +8,9 @@ #include #include +// pseudo-env variable to identify preset-only arguments +#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP" + // // CLI argument parsing // @@ -22,6 +25,7 @@ struct common_arg { const char * env = nullptr; std::string help; bool is_sparam = false; // is current arg a sampling param? 
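
The `is_preset_only` flag, `set_preset_only()` and the pseudo-env `COMMON_ARG_PRESET_LOAD_ON_STARTUP` introduced here mark options such as `load-on-startup` that are only meaningful inside a preset and are not treated as command-line arguments. A toy model of that intent, using simplified stand-ins rather than the real `common_arg`/`common_preset` types:

```cpp
// Toy model of "preset-only" options: they can live in a preset's option map,
// but are skipped when the preset is converted back into an argv list.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

struct toy_arg {
    std::string flag;                 // e.g. "--ctx-size" or "load-on-startup"
    bool        is_preset_only = false;
};

int main() {
    std::vector<std::pair<toy_arg, std::string>> options;
    options.push_back({ toy_arg{ "--ctx-size",      false }, "4096"     });
    options.push_back({ toy_arg{ "load-on-startup", true  }, "my-model" });

    // roughly what a to_args()-style conversion does with such entries
    std::vector<std::string> args = { "llama-server" };   // optional bin_path
    for (const auto & [opt, value] : options) {
        if (opt.is_preset_only) {
            continue;  // meaningful to the preset loader only, not a CLI argument
        }
        args.push_back(opt.flag);
        args.push_back(value);
    }

    for (const auto & a : args) {
        std::printf("%s ", a.c_str());  // prints: llama-server --ctx-size 4096
    }
    std::printf("\n");
    return 0;
}
```

In the actual patch the skip happens in `common_preset::to_args()` (further down in this diff), so a preset loaded from an INI file can carry `load-on-startup` without it leaking back into argv.
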
+ bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg) void (*handler_void) (common_params & params) = nullptr; void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; @@ -70,6 +74,7 @@ struct common_arg { common_arg & set_excludes(std::initializer_list excludes); common_arg & set_env(const char * env); common_arg & set_sparam(); + common_arg & set_preset_only(); bool in_example(enum llama_example ex); bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & output) const; @@ -114,9 +119,13 @@ struct common_params_context { bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); // parse input arguments from CLI into a map -// TODO: support repeated args in the future bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map & out_map); +// populate preset-only arguments +// these arguments are not treated as command line arguments +// see: https://github.com/ggml-org/llama.cpp/issues/18163 +void common_params_add_preset_options(std::vector & args); + // initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 5a8cf52485..d4e8c7405e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) : auto cparams = common_context_params_to_llama(params); if (params.fit_params) { - LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__); + LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__); llama_params_fit(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx, params.verbosity >= 4 ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); diff --git a/common/common.h b/common/common.h index d70744840f..334372073a 100644 --- a/common/common.h +++ b/common/common.h @@ -475,7 +475,8 @@ struct common_params { bool enable_chat_template = true; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; int reasoning_budget = -1; - bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response + bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response + int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time std::vector api_keys; @@ -484,8 +485,11 @@ struct common_params { std::map default_template_kwargs; + // webui configs + bool webui = true; + std::string webui_config_json; + // "advanced" endpoints are disabled by default for better security - bool webui = true; bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; diff --git a/common/preset.cpp b/common/preset.cpp index 60746aad58..e2fc18c5da 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -2,6 +2,7 @@ #include "preset.h" #include "peg-parser.h" #include "log.h" +#include "download.h" #include #include @@ -15,11 +16,22 @@ static std::string rm_leading_dashes(const std::string & str) { return str.substr(pos); } -std::vector common_preset::to_args() const { +std::vector common_preset::to_args(const std::string & bin_path) const { std::vector args; + if (!bin_path.empty()) { + args.push_back(bin_path); + } + for (const auto & [opt, value] : options) { - args.push_back(opt.args.back()); // use the last arg as the main arg + if (opt.is_preset_only) { + continue; // skip preset-only options (they are not CLI args) + } + + // use the last arg as the main arg (i.e. 
--long-form) + args.push_back(opt.args.back()); + + // handle value(s) if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { // flag option, no value if (common_arg_utils::is_falsey(value)) { @@ -63,6 +75,52 @@ std::string common_preset::to_ini() const { return ss.str(); } +void common_preset::set_option(const common_preset_context & ctx, const std::string & env, const std::string & value) { + // try if option exists, update it + for (auto & [opt, val] : options) { + if (opt.env && env == opt.env) { + val = value; + return; + } + } + // if option does not exist, we need to add it + if (ctx.key_to_opt.find(env) == ctx.key_to_opt.end()) { + throw std::runtime_error(string_format( + "%s: option with env '%s' not found in ctx_params", + __func__, env.c_str() + )); + } + options[ctx.key_to_opt.at(env)] = value; +} + +void common_preset::unset_option(const std::string & env) { + for (auto it = options.begin(); it != options.end(); ) { + const common_arg & opt = it->first; + if (opt.env && env == opt.env) { + it = options.erase(it); + return; + } else { + ++it; + } + } +} + +bool common_preset::get_option(const std::string & env, std::string & value) const { + for (const auto & [opt, val] : options) { + if (opt.env && env == opt.env) { + value = val; + return true; + } + } + return false; +} + +void common_preset::merge(const common_preset & other) { + for (const auto & [opt, val] : other.options) { + options[opt] = val; // overwrite existing options + } +} + static std::map> parse_ini_from_file(const std::string & path) { std::map> parsed; @@ -172,9 +230,14 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke return value; } -common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { +common_preset_context::common_preset_context(llama_example ex) + : ctx_params(common_params_parser_init(default_params, ex)) { + common_params_add_preset_options(ctx_params.options); + key_to_opt = get_map_key_opt(ctx_params); +} + +common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const { common_presets out; - auto key_to_opt = get_map_key_opt(ctx_params); auto ini_data = parse_ini_from_file(path); for (auto section : ini_data) { @@ -188,7 +251,7 @@ common_presets common_presets_load(const std::string & path, common_params_conte for (const auto & [key, value] : section.second) { LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); if (key_to_opt.find(key) != key_to_opt.end()) { - auto & opt = key_to_opt[key]; + const auto & opt = key_to_opt.at(key); if (is_bool_arg(opt)) { preset.options[opt] = parse_bool_arg(opt, key, value); } else { @@ -199,8 +262,137 @@ common_presets common_presets_load(const std::string & path, common_params_conte // TODO: maybe warn about unknown key? 
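
`common_preset::merge()` above is a plain last-writer-wins overlay on the option map, and the `cascade()` helpers added later in this file are built on top of it. A compilable sketch of that overwrite behaviour, with ordinary strings standing in for the `common_arg` keys of the real map:

```cpp
// Sketch of the overwrite-on-merge behaviour of common_preset::merge():
// every option present in `other` replaces (or adds to) the base preset.
#include <cstdio>
#include <map>
#include <string>

using toy_preset = std::map<std::string, std::string>;

static void merge(toy_preset & base, const toy_preset & other) {
    for (const auto & [opt, val] : other) {
        base[opt] = val;  // overwrite existing options, insert new ones
    }
}

int main() {
    toy_preset global = { { "--ctx-size", "4096" }, { "--threads", "8" } };   // e.g. a global preset
    toy_preset model  = { { "--ctx-size", "8192" }, { "--model", "a.gguf" } };

    merge(global, model);  // model-specific values win over the global ones

    for (const auto & [opt, val] : global) {
        std::printf("%s = %s\n", opt.c_str(), val.c_str());
    }
    // --ctx-size = 8192, --model = a.gguf, --threads = 8
    return 0;
}
```
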
} } + + if (preset.name == "*") { + // handle global preset + global = preset; + } else { + out[preset.name] = preset; + } + } + + return out; +} + +common_presets common_preset_context::load_from_cache() const { + common_presets out; + + auto cached_models = common_list_cached_models(); + for (const auto & model : cached_models) { + common_preset preset; + preset.name = model.to_string(); + preset.set_option(*this, "LLAMA_ARG_HF_REPO", model.to_string()); out[preset.name] = preset; } return out; } + +struct local_model { + std::string name; + std::string path; + std::string path_mmproj; +}; + +common_presets common_preset_context::load_from_models_dir(const std::string & models_dir) const { + if (!std::filesystem::exists(models_dir) || !std::filesystem::is_directory(models_dir)) { + throw std::runtime_error(string_format("error: '%s' does not exist or is not a directory\n", models_dir.c_str())); + } + + std::vector models; + auto scan_subdir = [&models](const std::string & subdir_path, const std::string & name) { + auto files = fs_list(subdir_path, false); + common_file_info model_file; + common_file_info first_shard_file; + common_file_info mmproj_file; + for (const auto & file : files) { + if (string_ends_with(file.name, ".gguf")) { + if (file.name.find("mmproj") != std::string::npos) { + mmproj_file = file; + } else if (file.name.find("-00001-of-") != std::string::npos) { + first_shard_file = file; + } else { + model_file = file; + } + } + } + // single file model + local_model model{ + /* name */ name, + /* path */ first_shard_file.path.empty() ? model_file.path : first_shard_file.path, + /* path_mmproj */ mmproj_file.path // can be empty + }; + if (!model.path.empty()) { + models.push_back(model); + } + }; + + auto files = fs_list(models_dir, true); + for (const auto & file : files) { + if (file.is_dir) { + scan_subdir(file.path, file.name); + } else if (string_ends_with(file.name, ".gguf")) { + // single file model + std::string name = file.name; + string_replace_all(name, ".gguf", ""); + local_model model{ + /* name */ name, + /* path */ file.path, + /* path_mmproj */ "" + }; + models.push_back(model); + } + } + + // convert local models to presets + common_presets out; + for (const auto & model : models) { + common_preset preset; + preset.name = model.name; + preset.set_option(*this, "LLAMA_ARG_MODEL", model.path); + if (!model.path_mmproj.empty()) { + preset.set_option(*this, "LLAMA_ARG_MMPROJ", model.path_mmproj); + } + out[preset.name] = preset; + } + + return out; +} + +common_preset common_preset_context::load_from_args(int argc, char ** argv) const { + common_preset preset; + preset.name = COMMON_PRESET_DEFAULT_NAME; + + bool ok = common_params_to_map(argc, argv, ctx_params.ex, preset.options); + if (!ok) { + throw std::runtime_error("failed to parse CLI arguments into preset"); + } + + return preset; +} + +common_presets common_preset_context::cascade(const common_presets & base, const common_presets & added) const { + common_presets out = base; // copy + for (const auto & [name, preset_added] : added) { + if (out.find(name) != out.end()) { + // if exists, merge + common_preset & target = out[name]; + target.merge(preset_added); + } else { + // otherwise, add directly + out[name] = preset_added; + } + } + return out; +} + +common_presets common_preset_context::cascade(const common_preset & base, const common_presets & presets) const { + common_presets out; + for (const auto & [name, preset] : presets) { + common_preset tmp = base; // copy + tmp.name = name; + 
tmp.merge(preset); + out[name] = std::move(tmp); + } + return out; +} diff --git a/common/preset.h b/common/preset.h index dceb849eb8..3a84d1be29 100644 --- a/common/preset.h +++ b/common/preset.h @@ -13,20 +13,62 @@ constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default"; +struct common_preset_context; + struct common_preset { std::string name; - // TODO: support repeated args in the future + + // options are stored as common_arg to string mapping, representing CLI arg and its value std::map options; // convert preset to CLI argument list - std::vector to_args() const; + std::vector to_args(const std::string & bin_path = "") const; // convert preset to INI format string std::string to_ini() const; // TODO: maybe implement to_env() if needed + + // modify preset options where argument is identified by its env variable + void set_option(const common_preset_context & ctx, const std::string & env, const std::string & value); + + // unset option by its env variable + void unset_option(const std::string & env); + + // get option value by its env variable, return false if not found + bool get_option(const std::string & env, std::string & value) const; + + // merge another preset into this one, overwriting existing options + void merge(const common_preset & other); }; // interface for multiple presets in one file using common_presets = std::map; -common_presets common_presets_load(const std::string & path, common_params_context & ctx_params); + +// context for loading and editing presets +struct common_preset_context { + common_params default_params; // unused for now + common_params_context ctx_params; + std::map key_to_opt; + common_preset_context(llama_example ex); + + // load presets from INI file + common_presets load_from_ini(const std::string & path, common_preset & global) const; + + // generate presets from cached models + common_presets load_from_cache() const; + + // generate presets from local models directory + // for the directory structure, see "Using multiple models" in server/README.md + common_presets load_from_models_dir(const std::string & models_dir) const; + + // generate one preset from CLI arguments + common_preset load_from_args(int argc, char ** argv) const; + + // cascade multiple presets if exist on both: base < added + // if preset does not exist in base, it will be added without modification + common_presets cascade(const common_presets & base, const common_presets & added) const; + + // apply presets over a base preset (same idea as CSS cascading) + common_presets cascade(const common_preset & base, const common_presets & presets) const; +}; diff --git a/common/sampling.cpp b/common/sampling.cpp index 6935d84e22..c66f935c65 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -104,10 +104,9 @@ struct ring_buffer { struct common_sampler { common_params_sampling params; + struct llama_sampler * grmr; struct llama_sampler * chain; - bool grammar; - ring_buffer prev; std::vector cur; @@ -167,15 +166,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co lparams.no_perf = params.no_perf; + llama_sampler * grmr = nullptr; llama_sampler * chain = llama_sampler_chain_init(lparams); - bool grammar = false; std::vector samplers; if (params.grammar.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE - samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str())); - grammar = true; + grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); #else GGML_ABORT("llguidance (cmake 
-DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE @@ -224,15 +222,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (!params.grammar.empty()) { if (params.grammar_lazy) { - samplers.push_back( - llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", - trigger_patterns_c.data(), trigger_patterns_c.size(), - trigger_tokens.data(), trigger_tokens.size())); + grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size()); } else { - samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root")); + grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); } - - grammar = true; } } @@ -303,8 +298,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co auto * result = new common_sampler { /* .params = */ params, + /* .grmr = */ grmr, /* .chain = */ chain, - /* .grammar = */ grammar, /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, /* .cur_p = */ {}, @@ -315,6 +310,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { + llama_sampler_free(gsmpl->grmr); llama_sampler_free(gsmpl->chain); delete gsmpl; @@ -324,25 +320,12 @@ void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { const auto tm = gsmpl->tm(); - if (gsmpl->grammar) { - const int n_smpl = llama_sampler_chain_n(gsmpl->chain); - - for (int i = 0; i < n_smpl; i++) { - auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - - // the grammar sampler is always the first one - if (i == 0) { - if (accept_grammar) { - llama_sampler_accept(smpl, token); - } - } else { - llama_sampler_accept(smpl, token); - } - } - } else { - llama_sampler_accept(gsmpl->chain, token); + if (gsmpl->grmr && accept_grammar) { + llama_sampler_accept(gsmpl->grmr, token); } + llama_sampler_accept(gsmpl->chain, token); + gsmpl->prev.push_back(token); } @@ -353,8 +336,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { /* .params = */ gsmpl->params, + /* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .chain = */ llama_sampler_clone(gsmpl->chain), - /* .grammar = */ gsmpl->grammar, /* .prev = */ gsmpl->prev, /* .cur = */ gsmpl->cur, /* .cur_p = */ gsmpl->cur_p, @@ -410,7 +393,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) { return gsmpl->chain; } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) { +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { llama_synchronize(ctx); // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations @@ -418,11 +401,42 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co llama_token id = LLAMA_TOKEN_NULL; + auto & grmr = gsmpl->grmr; auto & chain = gsmpl->chain; auto & cur_p = gsmpl->cur_p; // initialized by set_logits gsmpl->set_logits(ctx, idx); + if (grammar_first) { + llama_sampler_apply(grmr, &cur_p); + } + + llama_sampler_apply(chain, &cur_p); + + id = 
cur_p.data[cur_p.selected].id; + + if (grammar_first) { + return id; + } + + // check if it the sampled token fits the grammar (grammar-based rejection sampling) + { + llama_token_data single_token_data = { id, 1.0f, 0.0f }; + llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; + + llama_sampler_apply(grmr, &single_token_data_array); + + const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + if (is_valid) { + return id; + } + } + + // resampling: + // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain + gsmpl->set_logits(ctx, idx); + + llama_sampler_apply(grmr, &cur_p); llama_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); @@ -432,7 +446,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return id; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); std::vector result; @@ -440,7 +454,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample size_t i = 0; for (; i < draft.size(); i++) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); common_sampler_accept(gsmpl, id, true); @@ -452,7 +466,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample } if (i == draft.size()) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); common_sampler_accept(gsmpl, id, true); @@ -462,13 +476,13 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample return result; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { std::vector idxs(draft.size() + 1); for (size_t i = 0; i < idxs.size(); ++i) { idxs[i] = i; } - return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft); + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { diff --git a/common/sampling.h b/common/sampling.h index ace5d3d020..c7101032f2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl); // - check if the token fits the grammar (if any) // - if not: resample by first applying the grammar constraints and then sampling again (slower path) // -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx); +// if grammar_first is true, the grammar is applied before the samplers (slower) +// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar +// +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool 
grammar_first = false); // generalized version of common_sampler_sample // @@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // // returns at least 1 token, up to idxs.size() // -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); // assume idxs == [ 0, 1, 2, ..., draft.size() ] -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); diff --git a/common/speculative.cpp b/common/speculative.cpp index 1e12383ae6..3e83b0964c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx_dft, 0); + common_sampler_sample(smpl, ctx_dft, 0, true); const auto * cur_p = common_sampler_get_candidates(smpl, true); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bd16ba312f..432be59946 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -189,10 +189,10 @@ class ModelBase: return tensors prefix = "model" if not self.is_mistral_format else "consolidated" - part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")) + part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") is_safetensors: bool = len(part_names) > 0 if not is_safetensors: - part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")) + part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") tensor_names_from_index: set[str] = set() @@ -209,7 +209,8 @@ class ModelBase: if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") tensor_names_from_index.update(weight_map.keys()) - part_names |= set(weight_map.values()) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) + part_names = sorted(part_dict.keys()) else: weight_map = {} else: @@ -711,6 +712,9 @@ class ModelBase: if "thinker_config" in config: # rename for Qwen2.5-Omni config["text_config"] = config["thinker_config"]["text_config"] + if "lfm" in config: + # rename for LFM2-Audio + config["text_config"] = config["lfm"] return config @classmethod @@ -1838,7 +1842,7 @@ class MmprojModel(ModelBase): def tensor_force_quant(self, name, new_name, bid, n_dims): del bid, name, n_dims # unused - if ".patch_embd.weight" in new_name: + if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return False @@ -9712,12 +9716,12 @@ class LFM2Model(TextModel): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name - if is_vision_tensor: - # 
skip vision tensors + if self._is_vision_tensor(name) or self._is_audio_tensor(name): + # skip multimodal tensors return [] - name = name.replace("language_model.", "") + name = name.replace("language_model.", "") # vision + name = name.replace("lfm.", "model.") # audio # conv op requires 2d tensor if 'conv.conv' in name: @@ -9725,6 +9729,12 @@ class LFM2Model(TextModel): return [(self.map_tensor_name(name), data_torch)] + def _is_vision_tensor(self, name: str) -> bool: + return "vision_tower" in name or "multi_modal_projector" in name + + def _is_audio_tensor(self, name: str): + return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"]) + @ModelBase.register("Lfm2MoeForCausalLM") class LFM2MoeModel(TextModel): @@ -9830,6 +9840,81 @@ class LFM2VLModel(MmprojModel): return [] # skip other tensors +@ModelBase.register("Lfm2AudioForConditionalGeneration") +class LFM2AudioModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + model_name = "Lfm2AudioEncoder" + + _batch_norm_tensors: list[dict[str, Tensor]] | None = None + + def get_audio_config(self) -> dict[str, Any] | None: + return self.global_config.get("encoder") + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"] + self.hparams_audio["intermediate_size"] = self.hparams_audio["d_model"] + self.hparams_audio["num_attention_heads"] = self.hparams_audio["n_heads"] + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2A) + self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"]) + self.gguf_writer.add_audio_attention_layernorm_eps(1e-5) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F32 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # skip language model tensors + if name.startswith("lfm."): + return [] + + # for training only + if any(p in name for p in ["audio_loss_weight"]): + return [] + + # for audio output + if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): + return [] + + # fold running_mean, running_var and eps into weight and bias for batch_norm + if "batch_norm" in name: + if self._batch_norm_tensors is None: + self._batch_norm_tensors = [{} for _ in range(self.block_count)] + assert bid is not None + self._batch_norm_tensors[bid][name] = data_torch + + if len(self._batch_norm_tensors[bid]) < 5: + return [] + + weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] + bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] + running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"] + running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"] + eps = 1e-5 # default value + + a = weight / torch.sqrt(running_var + eps) + b = bias - running_mean * a + return [ + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), + (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), + ] + + # reshape conv weights + if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): + data_torch = data_torch[:, None, None] + if "conv.depthwise_conv" in name 
and name.endswith(".weight"): + assert data_torch.shape[1] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2]) + if "conv.pointwise_conv" in name and name.endswith(".weight"): + assert data_torch.shape[2] == 1 + data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) + + return [(self.map_tensor_name(name), data_torch)] + + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): model_arch = gguf.MODEL_ARCH.SMALLTHINKER diff --git a/docs/android.md b/docs/android.md index d2a835653f..964ce8a1f0 100644 --- a/docs/android.md +++ b/docs/android.md @@ -1,7 +1,27 @@ # Android -## Build on Android using Termux +## Build GUI binding using Android Studio + +Import the `examples/llama.android` directory into Android Studio, then perform a Gradle sync and build the project. +![Project imported into Android Studio](./android/imported-into-android-studio.jpg) + +This Android binding supports hardware acceleration up to `SME2` for **Arm** and `AMX` for **x86-64** CPUs on Android and ChromeOS devices. +It automatically detects the host's hardware to load compatible kernels. As a result, it runs seamlessly on both the latest premium devices and older devices that may lack modern CPU features or have limited RAM, without requiring any manual configuration. + +A minimal Android app frontend is included to showcase the binding’s core functionalities: +1. **Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver`-provided `Uri` from shared storage, or a local `File` from your app's private storage. +2. **Obtain an `InferenceEngine`** instance through the `AiChat` facade and load your selected model via its app-private file path. +3. **Send a raw user prompt** for automatic template formatting, prefill, and batch decoding. Then collect the generated tokens in a Kotlin `Flow`. + +For a production-ready experience that leverages advanced features such as system prompts and benchmarks, plus friendly UI features such as model management and an Arm feature visualizer, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play. +This project is made possible through a collaborative effort by Arm's **CT-ML**, **CE-ML** and **STE** groups: + +| ![Home screen](https://naco-siren.github.io/ai-chat/policy/index/1-llm-starter-pack.png) | ![System prompt](https://naco-siren.github.io/ai-chat/policy/index/5-system-prompt.png) | !["Haiku"](https://naco-siren.github.io/ai-chat/policy/index/4-metrics.png) | +|:------------------------------------------------------:|:----------------------------------------------------:|:--------------------------------------------------------:| +| Home screen | System prompt | "Haiku" | + +## Build CLI on Android using Termux [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. @@ -32,7 +52,7 @@ To see what it might look like visually, here's an old demo of an interactive se https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 -## Cross-compile using Android NDK +## Cross-compile CLI using Android NDK It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. 
If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis.) Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory: diff --git a/docs/android/imported-into-android-studio.jpg b/docs/android/imported-into-android-studio.jpg new file mode 100644 index 0000000000..bbe6867c6c Binary files /dev/null and b/docs/android/imported-into-android-studio.jpg differ diff --git a/docs/backend/hexagon/CMakeUserPresets.json b/docs/backend/hexagon/CMakeUserPresets.json index e0b19db0f5..98d7221b3a 100644 --- a/docs/backend/hexagon/CMakeUserPresets.json +++ b/docs/backend/hexagon/CMakeUserPresets.json @@ -22,6 +22,7 @@ "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", + "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", "LLAMA_CURL": "OFF" } }, @@ -36,6 +37,7 @@ "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", + "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", "LLAMA_CURL": "OFF" } }, diff --git a/docs/development/parsing.md b/docs/development/parsing.md index 113ab2e2ee..dbb989bf08 100644 --- a/docs/development/parsing.md +++ b/docs/development/parsing.md @@ -55,7 +55,7 @@ auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & ``` For a more complete example, see `test_example_native()` in -[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp). +[tests/test-chat-peg-parser.cpp](/tests/test-chat-peg-parser.cpp). ## Parsers/Combinators @@ -175,7 +175,7 @@ Most model output can be placed in one of the following categories: (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2) To provide broad coverage, -[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and +[`common/chat-peg-parser.h`](/common/chat-peg-parser.h) contains builders and mappers that help create parsers and visitors/extractors for these types. They require parsers to tag nodes to conform to an AST "shape". This normalization makes it easy to extract information and generalize parsing. 
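The new `docs/android.md` section above describes a three-step flow for the `com.arm.aichat` binding: parse GGUF metadata, load a model through the `AiChat` facade, then stream tokens from a Kotlin `Flow`. The sketch below puts those steps into one helper. It is a minimal sketch only: `chatOnce` is a hypothetical name, and the exact signatures (suspend modifiers, nullability, and the `Flow` element type) are inferred from the sample `MainActivity` later in this diff rather than from documented library API.

```kotlin
import android.content.Context
import android.util.Log
import com.arm.aichat.AiChat
import com.arm.aichat.gguf.GgufMetadataReader
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.withContext
import java.io.File

// Hypothetical helper; call names mirror the sample MainActivity, signatures are assumptions.
suspend fun chatOnce(context: Context, modelFile: File, prompt: String): Flow<String> =
    withContext(Dispatchers.IO) {
        // 1. Parse GGUF metadata from an app-private model file
        //    (the sample reads from a ContentResolver stream instead).
        modelFile.inputStream().use { input ->
            val metadata = GgufMetadataReader.create().readStructuredMetadata(input)
            Log.i("AiChatSketch", "GGUF metadata: $metadata")
        }

        // 2. Obtain the InferenceEngine through the AiChat facade and load the model by path.
        val engine = AiChat.getInferenceEngine(context)
        engine.loadModel(modelFile.path)

        // 3. Send the raw user prompt; generated tokens arrive on the returned Flow.
        engine.sendUserPrompt(prompt)
    }
```

A caller would collect the flow from a coroutine, e.g. `chatOnce(context, modelFile, "Write a haiku").collect { token -> print(token) }`, which mirrors how the sample appends each token to the last assistant message.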
diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index 8d1b37195e..3524fe39c4 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -1,16 +1,18 @@ plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") + alias(libs.plugins.android.application) + alias(libs.plugins.jetbrains.kotlin.android) } android { namespace = "com.example.llama" - compileSdk = 34 + compileSdk = 36 defaultConfig { - applicationId = "com.example.llama" + applicationId = "com.example.llama.aichat" + minSdk = 33 - targetSdk = 34 + targetSdk = 36 + versionCode = 1 versionName = "1.0" @@ -21,8 +23,17 @@ android { } buildTypes { + debug { + isMinifyEnabled = true + isShrinkResources = true + proguardFiles( + getDefaultProguardFile("proguard-android.txt"), + "proguard-rules.pro" + ) + } release { - isMinifyEnabled = false + isMinifyEnabled = true + isShrinkResources = true proguardFiles( getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro" @@ -36,30 +47,15 @@ android { kotlinOptions { jvmTarget = "1.8" } - buildFeatures { - compose = true - } - composeOptions { - kotlinCompilerExtensionVersion = "1.5.1" - } } dependencies { + implementation(libs.bundles.androidx) + implementation(libs.material) - implementation("androidx.core:core-ktx:1.12.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") - implementation("androidx.activity:activity-compose:1.8.2") - implementation(platform("androidx.compose:compose-bom:2023.08.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation(project(":llama")) - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") + implementation(project(":lib")) + + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) } diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro index f1b424510d..358020d2d2 100644 --- a/examples/llama.android/app/proguard-rules.pro +++ b/examples/llama.android/app/proguard-rules.pro @@ -19,3 +19,11 @@ # If you keep the line number information, uncomment this to # hide the original source file name. 
#-renamesourcefileattribute SourceFile + +-keep class com.arm.aichat.* { *; } +-keep class com.arm.aichat.gguf.* { *; } + +-assumenosideeffects class android.util.Log { + public static int v(...); + public static int d(...); +} diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml index 41a358a299..8f7c606b41 100644 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -1,24 +1,21 @@ - - - + + android:exported="true"> diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt deleted file mode 100644 index 78c231ae55..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ /dev/null @@ -1,119 +0,0 @@ -package com.example.llama - -import android.app.DownloadManager -import android.net.Uri -import android.util.Log -import androidx.compose.material3.Button -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableDoubleStateOf -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.remember -import androidx.compose.runtime.rememberCoroutineScope -import androidx.compose.runtime.setValue -import androidx.core.database.getLongOrNull -import androidx.core.net.toUri -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import java.io.File - -data class Downloadable(val name: String, val source: Uri, val destination: File) { - companion object { - @JvmStatic - private val tag: String? = this::class.qualifiedName - - sealed interface State - data object Ready: State - data class Downloading(val id: Long): State - data class Downloaded(val downloadable: Downloadable): State - data class Error(val message: String): State - - @JvmStatic - @Composable - fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { - var status: State by remember { - mutableStateOf( - if (item.destination.exists()) Downloaded(item) - else Ready - ) - } - var progress by remember { mutableDoubleStateOf(0.0) } - - val coroutineScope = rememberCoroutineScope() - - suspend fun waitForDownload(result: Downloading, item: Downloadable): State { - while (true) { - val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) - - if (cursor == null) { - Log.e(tag, "dm.query() returned null") - return Error("dm.query() returned null") - } - - if (!cursor.moveToFirst() || cursor.count < 1) { - cursor.close() - Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") - return Ready - } - - val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) - val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) - val sofar = cursor.getLongOrNull(pix) ?: 0 - val total = cursor.getLongOrNull(tix) ?: 1 - cursor.close() - - if (sofar == total) { - return Downloaded(item) - } - - progress = (sofar * 1.0) / total - - delay(1000L) - } - } - - fun onClick() { - when (val s = status) { - is Downloaded -> { - viewModel.load(item.destination.path) - } - - is Downloading -> { - coroutineScope.launch { - status = waitForDownload(s, item) - } - } - - else -> { - item.destination.delete() - - val request = DownloadManager.Request(item.source).apply { - setTitle("Downloading model") - setDescription("Downloading model: 
${item.name}") - setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) - setDestinationUri(item.destination.toUri()) - } - - viewModel.log("Saving ${item.name} to ${item.destination.path}") - Log.i(tag, "Saving ${item.name} to ${item.destination.path}") - - val id = dm.enqueue(request) - status = Downloading(id) - onClick() - } - } - } - - Button(onClick = { onClick() }, enabled = status !is Downloading) { - when (status) { - is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") - is Downloaded -> Text("Load ${item.name}") - is Ready -> Text("Download ${item.name}") - is Error -> Text("Download ${item.name}") - } - } - } - - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt index 9da04f7d3c..52c5dc2154 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -1,154 +1,257 @@ package com.example.llama -import android.app.ActivityManager -import android.app.DownloadManager -import android.content.ClipData -import android.content.ClipboardManager import android.net.Uri import android.os.Bundle -import android.os.StrictMode -import android.os.StrictMode.VmPolicy -import android.text.format.Formatter -import androidx.activity.ComponentActivity -import androidx.activity.compose.setContent -import androidx.activity.viewModels -import androidx.compose.foundation.layout.Box -import androidx.compose.foundation.layout.Column -import androidx.compose.foundation.layout.Row -import androidx.compose.foundation.layout.fillMaxSize -import androidx.compose.foundation.layout.padding -import androidx.compose.foundation.lazy.LazyColumn -import androidx.compose.foundation.lazy.items -import androidx.compose.foundation.lazy.rememberLazyListState -import androidx.compose.material3.Button -import androidx.compose.material3.LocalContentColor -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.OutlinedTextField -import androidx.compose.material3.Surface -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.ui.Modifier -import androidx.compose.ui.unit.dp -import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme +import android.util.Log +import android.widget.EditText +import android.widget.TextView +import android.widget.Toast +import androidx.activity.enableEdgeToEdge +import androidx.activity.result.contract.ActivityResultContracts +import androidx.appcompat.app.AppCompatActivity +import androidx.lifecycle.lifecycleScope +import androidx.recyclerview.widget.LinearLayoutManager +import androidx.recyclerview.widget.RecyclerView +import com.arm.aichat.AiChat +import com.arm.aichat.InferenceEngine +import com.arm.aichat.gguf.GgufMetadata +import com.arm.aichat.gguf.GgufMetadataReader +import com.google.android.material.floatingactionbutton.FloatingActionButton +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.flow.onCompletion +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext import java.io.File +import java.io.FileOutputStream +import java.io.InputStream +import java.util.UUID -class MainActivity( - activityManager: ActivityManager? = null, - downloadManager: DownloadManager? = null, - clipboardManager: ClipboardManager? = null, -): ComponentActivity() { - private val tag: String? 
= this::class.simpleName +class MainActivity : AppCompatActivity() { - private val activityManager by lazy { activityManager ?: getSystemService()!! } - private val downloadManager by lazy { downloadManager ?: getSystemService()!! } - private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + // Android views + private lateinit var ggufTv: TextView + private lateinit var messagesRv: RecyclerView + private lateinit var userInputEt: EditText + private lateinit var userActionFab: FloatingActionButton - private val viewModel: MainViewModel by viewModels() + // Arm AI Chat inference engine + private lateinit var engine: InferenceEngine - // Get a MemoryInfo object for the device's current memory status. - private fun availableMemory(): ActivityManager.MemoryInfo { - return ActivityManager.MemoryInfo().also { memoryInfo -> - activityManager.getMemoryInfo(memoryInfo) - } - } + // Conversation states + private var isModelReady = false + private val messages = mutableListOf() + private val lastAssistantMsg = StringBuilder() + private val messageAdapter = MessageAdapter(messages) override fun onCreate(savedInstanceState: Bundle?) { super.onCreate(savedInstanceState) + enableEdgeToEdge() + setContentView(R.layout.activity_main) - StrictMode.setVmPolicy( - VmPolicy.Builder(StrictMode.getVmPolicy()) - .detectLeakedClosableObjects() - .build() - ) + // Find views + ggufTv = findViewById(R.id.gguf) + messagesRv = findViewById(R.id.messages) + messagesRv.layoutManager = LinearLayoutManager(this) + messagesRv.adapter = messageAdapter + userInputEt = findViewById(R.id.user_input) + userActionFab = findViewById(R.id.fab) - val free = Formatter.formatFileSize(this, availableMemory().availMem) - val total = Formatter.formatFileSize(this, availableMemory().totalMem) - - viewModel.log("Current memory: $free / $total") - viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") - - val extFilesDir = getExternalFilesDir(null) - - val models = listOf( - Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), - ), - Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") - ), - ) - - setContent { - LlamaAndroidTheme { - // A surface container using the 'background' color from the theme - Surface( - modifier = Modifier.fillMaxSize(), - color = MaterialTheme.colorScheme.background - ) { - MainCompose( - viewModel, - clipboardManager, - downloadManager, - models, - ) - } + // Arm AI Chat initialization + lifecycleScope.launch(Dispatchers.Default) { + engine = AiChat.getInferenceEngine(applicationContext) + } + // Upon CTA button tapped + userActionFab.setOnClickListener { + if (isModelReady) { + // If model is ready, validate input and send to engine + handleUserInput() + } else { + // Otherwise, prompt user to select a GGUF metadata on the device + getContent.launch(arrayOf("*/*")) } } } -} -@Composable -fun MainCompose( - viewModel: MainViewModel, - clipboard: ClipboardManager, - dm: DownloadManager, - models: List -) { - Column { - val scrollState = rememberLazyListState() + private val 
getContent = registerForActivityResult( + ActivityResultContracts.OpenDocument() + ) { uri -> + Log.i(TAG, "Selected file uri:\n $uri") + uri?.let { handleSelectedModel(it) } + } - Box(modifier = Modifier.weight(1f)) { - LazyColumn(state = scrollState) { - items(viewModel.messages) { - Text( - it, - style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), - modifier = Modifier.padding(16.dp) - ) + /** + * Handles the file Uri from [getContent] result + */ + private fun handleSelectedModel(uri: Uri) { + // Update UI states + userActionFab.isEnabled = false + userInputEt.hint = "Parsing GGUF..." + ggufTv.text = "Parsing metadata from selected file \n$uri" + + lifecycleScope.launch(Dispatchers.IO) { + // Parse GGUF metadata + Log.i(TAG, "Parsing GGUF metadata...") + contentResolver.openInputStream(uri)?.use { + GgufMetadataReader.create().readStructuredMetadata(it) + }?.let { metadata -> + // Update UI to show GGUF metadata to user + Log.i(TAG, "GGUF parsed: \n$metadata") + withContext(Dispatchers.Main) { + ggufTv.text = metadata.toString() } - } - } - OutlinedTextField( - value = viewModel.message, - onValueChange = { viewModel.updateMessage(it) }, - label = { Text("Message") }, - ) - Row { - Button({ viewModel.send() }) { Text("Send") } - Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } - Button({ viewModel.clear() }) { Text("Clear") } - Button({ - viewModel.messages.joinToString("\n").let { - clipboard.setPrimaryClip(ClipData.newPlainText("", it)) - } - }) { Text("Copy") } - } - Column { - for (model in models) { - Downloadable.Button(viewModel, dm, model) + // Ensure the model file is available + val modelName = metadata.filename() + FILE_EXTENSION_GGUF + contentResolver.openInputStream(uri)?.use { input -> + ensureModelFile(modelName, input) + }?.let { modelFile -> + loadModel(modelName, modelFile) + + withContext(Dispatchers.Main) { + isModelReady = true + userInputEt.hint = "Type and send a message!" + userInputEt.isEnabled = true + userActionFab.setImageResource(R.drawable.outline_send_24) + userActionFab.isEnabled = true + } + } } } } + + /** + * Prepare the model file within app's private storage + */ + private suspend fun ensureModelFile(modelName: String, input: InputStream) = + withContext(Dispatchers.IO) { + File(ensureModelsDirectory(), modelName).also { file -> + // Copy the file into local storage if not yet done + if (!file.exists()) { + Log.i(TAG, "Start copying file to $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Copying file..." + } + + FileOutputStream(file).use { input.copyTo(it) } + Log.i(TAG, "Finished copying file to $modelName") + } else { + Log.i(TAG, "File already exists $modelName") + } + } + } + + /** + * Load the model file from the app private storage + */ + private suspend fun loadModel(modelName: String, modelFile: File) = + withContext(Dispatchers.IO) { + Log.i(TAG, "Loading model $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Loading model..." 
+ } + engine.loadModel(modelFile.path) + } + + /** + * Validate and send the user message into [InferenceEngine] + */ + private fun handleUserInput() { + userInputEt.text.toString().also { userSsg -> + if (userSsg.isEmpty()) { + Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show() + } else { + userInputEt.text = null + userActionFab.isEnabled = false + + // Update message states + messages.add(Message(UUID.randomUUID().toString(), userSsg, true)) + lastAssistantMsg.clear() + messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false)) + + lifecycleScope.launch(Dispatchers.Default) { + engine.sendUserPrompt(userSsg) + .onCompletion { + withContext(Dispatchers.Main) { + userActionFab.isEnabled = true + } + }.collect { token -> + val messageCount = messages.size + check(messageCount > 0 && !messages[messageCount - 1].isUser) + + messages.removeAt(messageCount - 1).copy( + content = lastAssistantMsg.append(token).toString() + ).let { messages.add(it) } + + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + } + } + } + + /** + * Run a benchmark with the model file + */ + private suspend fun runBenchmark(modelName: String, modelFile: File) = + withContext(Dispatchers.Default) { + Log.i(TAG, "Starts benchmarking $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Running benchmark..." + } + engine.bench( + pp=BENCH_PROMPT_PROCESSING_TOKENS, + tg=BENCH_TOKEN_GENERATION_TOKENS, + pl=BENCH_SEQUENCE, + nr=BENCH_REPETITION + ).let { result -> + messages.add(Message(UUID.randomUUID().toString(), result, false)) + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + + /** + * Create the `models` directory if not exist. 
+ */ + private fun ensureModelsDirectory() = + File(filesDir, DIRECTORY_MODELS).also { + if (it.exists() && !it.isDirectory) { it.delete() } + if (!it.exists()) { it.mkdir() } + } + + companion object { + private val TAG = MainActivity::class.java.simpleName + + private const val DIRECTORY_MODELS = "models" + private const val FILE_EXTENSION_GGUF = ".gguf" + + private const val BENCH_PROMPT_PROCESSING_TOKENS = 512 + private const val BENCH_TOKEN_GENERATION_TOKENS = 128 + private const val BENCH_SEQUENCE = 1 + private const val BENCH_REPETITION = 3 + } +} + +fun GgufMetadata.filename() = when { + basic.name != null -> { + basic.name?.let { name -> + basic.sizeLabel?.let { size -> + "$name-$size" + } ?: name + } + } + architecture?.architecture != null -> { + architecture?.architecture?.let { arch -> + basic.uuid?.let { uuid -> + "$arch-$uuid" + } ?: "$arch-${System.currentTimeMillis()}" + } + } + else -> { + "model-${System.currentTimeMillis().toHexString()}" + } } diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt deleted file mode 100644 index 45ac29938f..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ /dev/null @@ -1,105 +0,0 @@ -package com.example.llama - -import android.llama.cpp.LLamaAndroid -import android.util.Log -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.setValue -import androidx.lifecycle.ViewModel -import androidx.lifecycle.viewModelScope -import kotlinx.coroutines.flow.catch -import kotlinx.coroutines.launch - -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { - companion object { - @JvmStatic - private val NanosPerSecond = 1_000_000_000.0 - } - - private val tag: String? = this::class.simpleName - - var messages by mutableStateOf(listOf("Initializing...")) - private set - - var message by mutableStateOf("") - private set - - override fun onCleared() { - super.onCleared() - - viewModelScope.launch { - try { - llamaAndroid.unload() - } catch (exc: IllegalStateException) { - messages += exc.message!! - } - } - } - - fun send() { - val text = message - message = "" - - // Add to messages console. - messages += text - messages += "" - - viewModelScope.launch { - llamaAndroid.send(text) - .catch { - Log.e(tag, "send() failed", it) - messages += it.message!! - } - .collect { messages = messages.dropLast(1) + (messages.last() + it) } - } - } - - fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { - viewModelScope.launch { - try { - val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) - val end = System.nanoTime() - - messages += warmupResult - - val warmup = (end - start).toDouble() / NanosPerSecond - messages += "Warm up time: $warmup seconds, please wait..." - - if (warmup > 5.0) { - messages += "Warm up took too long, aborting benchmark" - return@launch - } - - messages += llamaAndroid.bench(512, 128, 1, 3) - } catch (exc: IllegalStateException) { - Log.e(tag, "bench() failed", exc) - messages += exc.message!! - } - } - } - - fun load(pathToModel: String) { - viewModelScope.launch { - try { - llamaAndroid.load(pathToModel) - messages += "Loaded $pathToModel" - } catch (exc: IllegalStateException) { - Log.e(tag, "load() failed", exc) - messages += exc.message!! 
- } - } - } - - fun updateMessage(newMessage: String) { - message = newMessage - } - - fun clear() { - messages = listOf() - } - - fun log(message: String) { - messages += message - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt new file mode 100644 index 0000000000..0439f96441 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt @@ -0,0 +1,51 @@ +package com.example.llama + +import android.view.LayoutInflater +import android.view.View +import android.view.ViewGroup +import android.widget.TextView +import androidx.recyclerview.widget.RecyclerView + +data class Message( + val id: String, + val content: String, + val isUser: Boolean +) + +class MessageAdapter( + private val messages: List +) : RecyclerView.Adapter() { + + companion object { + private const val VIEW_TYPE_USER = 1 + private const val VIEW_TYPE_ASSISTANT = 2 + } + + override fun getItemViewType(position: Int): Int { + return if (messages[position].isUser) VIEW_TYPE_USER else VIEW_TYPE_ASSISTANT + } + + override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): RecyclerView.ViewHolder { + val layoutInflater = LayoutInflater.from(parent.context) + return if (viewType == VIEW_TYPE_USER) { + val view = layoutInflater.inflate(R.layout.item_message_user, parent, false) + UserMessageViewHolder(view) + } else { + val view = layoutInflater.inflate(R.layout.item_message_assistant, parent, false) + AssistantMessageViewHolder(view) + } + } + + override fun onBindViewHolder(holder: RecyclerView.ViewHolder, position: Int) { + val message = messages[position] + if (holder is UserMessageViewHolder || holder is AssistantMessageViewHolder) { + val textView = holder.itemView.findViewById(R.id.msg_content) + textView.text = message.content + } + } + + override fun getItemCount(): Int = messages.size + + class UserMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) + class AssistantMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt deleted file mode 100644 index 40c30e8d97..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ /dev/null @@ -1,11 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.ui.graphics.Color - -val Purple80 = Color(0xFFD0BCFF) -val PurpleGrey80 = Color(0xFFCCC2DC) -val Pink80 = Color(0xFFEFB8C8) - -val Purple40 = Color(0xFF6650a4) -val PurpleGrey40 = Color(0xFF625b71) -val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt deleted file mode 100644 index e742220a8d..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ /dev/null @@ -1,70 +0,0 @@ -package com.example.llama.ui.theme - -import android.app.Activity -import android.os.Build -import androidx.compose.foundation.isSystemInDarkTheme -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.darkColorScheme -import androidx.compose.material3.dynamicDarkColorScheme -import androidx.compose.material3.dynamicLightColorScheme -import androidx.compose.material3.lightColorScheme -import androidx.compose.runtime.Composable -import 
androidx.compose.runtime.SideEffect -import androidx.compose.ui.graphics.toArgb -import androidx.compose.ui.platform.LocalContext -import androidx.compose.ui.platform.LocalView -import androidx.core.view.WindowCompat - -private val DarkColorScheme = darkColorScheme( - primary = Purple80, - secondary = PurpleGrey80, - tertiary = Pink80 -) - -private val LightColorScheme = lightColorScheme( - primary = Purple40, - secondary = PurpleGrey40, - tertiary = Pink40 - - /* Other default colors to override - background = Color(0xFFFFFBFE), - surface = Color(0xFFFFFBFE), - onPrimary = Color.White, - onSecondary = Color.White, - onTertiary = Color.White, - onBackground = Color(0xFF1C1B1F), - onSurface = Color(0xFF1C1B1F), - */ -) - -@Composable -fun LlamaAndroidTheme( - darkTheme: Boolean = isSystemInDarkTheme(), - // Dynamic color is available on Android 12+ - dynamicColor: Boolean = true, - content: @Composable () -> Unit -) { - val colorScheme = when { - dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { - val context = LocalContext.current - if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) - } - - darkTheme -> DarkColorScheme - else -> LightColorScheme - } - val view = LocalView.current - if (!view.isInEditMode) { - SideEffect { - val window = (view.context as Activity).window - window.statusBarColor = colorScheme.primary.toArgb() - WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme - } - } - - MaterialTheme( - colorScheme = colorScheme, - typography = Typography, - content = content - ) -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt deleted file mode 100644 index 0b87946ca3..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ /dev/null @@ -1,34 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.material3.Typography -import androidx.compose.ui.text.TextStyle -import androidx.compose.ui.text.font.FontFamily -import androidx.compose.ui.text.font.FontWeight -import androidx.compose.ui.unit.sp - -// Set of Material typography styles to start with -val Typography = Typography( - bodyLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 16.sp, - lineHeight = 24.sp, - letterSpacing = 0.5.sp - ) - /* Other default text styles to override - titleLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 22.sp, - lineHeight = 28.sp, - letterSpacing = 0.sp - ), - labelSmall = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Medium, - fontSize = 11.sp, - lineHeight = 16.sp, - letterSpacing = 0.5.sp - ) - */ -) diff --git a/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml new file mode 100644 index 0000000000..f90c3db458 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml @@ -0,0 +1,4 @@ + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml new file mode 100644 index 0000000000..3ca7daefec --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml @@ -0,0 +1,4 @@ + + + + diff --git 
a/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml new file mode 100644 index 0000000000..f58b501e3b --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml @@ -0,0 +1,10 @@ + + + diff --git a/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml new file mode 100644 index 0000000000..712adc00c4 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml @@ -0,0 +1,11 @@ + + + diff --git a/examples/llama.android/app/src/main/res/layout/activity_main.xml b/examples/llama.android/app/src/main/res/layout/activity_main.xml new file mode 100644 index 0000000000..ad805a674e --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml new file mode 100644 index 0000000000..2c8e4bc2e1 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml @@ -0,0 +1,16 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_user.xml b/examples/llama.android/app/src/main/res/layout/item_message_user.xml new file mode 100644 index 0000000000..5aa79f2df3 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_user.xml @@ -0,0 +1,16 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml index 7a9d314e29..36059fc799 100644 --- a/examples/llama.android/app/src/main/res/values/strings.xml +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -1,3 +1,3 @@ - LlamaAndroid + AI Chat basic sample diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml index 8a24fda566..2e4fdad72e 100644 --- a/examples/llama.android/app/src/main/res/values/themes.xml +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -1,5 +1,10 @@ - + +