Merge remote-tracking branch 'origin/master' into addition
This commit is contained in:
commit
1b16a91183
|
|
@ -22,7 +22,7 @@ AllowShortIfStatementsOnASingleLine: Never
|
||||||
AllowShortLambdasOnASingleLine: Inline
|
AllowShortLambdasOnASingleLine: Inline
|
||||||
AllowShortLoopsOnASingleLine: false
|
AllowShortLoopsOnASingleLine: false
|
||||||
AlwaysBreakBeforeMultilineStrings: true
|
AlwaysBreakBeforeMultilineStrings: true
|
||||||
BinPackArguments: false
|
BinPackArguments: true
|
||||||
BinPackParameters: false # OnePerLine
|
BinPackParameters: false # OnePerLine
|
||||||
BitFieldColonSpacing: Both
|
BitFieldColonSpacing: Both
|
||||||
BreakBeforeBraces: Custom # Attach
|
BreakBeforeBraces: Custom # Attach
|
||||||
|
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
|
|
||||||
stage('Cleanup'){
|
|
||||||
cleanWs() // Cleaning previous CI build in workspace
|
|
||||||
}
|
|
||||||
stage('checkout repo'){
|
|
||||||
retry(5){ // Retry if the cloning fails due to some reason
|
|
||||||
checkout scm // Clone the repo on Runner
|
|
||||||
}
|
|
||||||
}
|
|
||||||
stage('Compiling llama.cpp'){
|
|
||||||
sh'''#!/bin/bash
|
|
||||||
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
|
|
||||||
'''
|
|
||||||
}
|
|
||||||
stage('Running llama.cpp'){
|
|
||||||
sh'''#!/bin/bash
|
|
||||||
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
|
|
||||||
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
|
|
||||||
cat llama_log.txt # Printing results
|
|
||||||
'''
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -4,8 +4,6 @@ FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
ARG TARGETARCH
|
ARG TARGETARCH
|
||||||
|
|
||||||
ARG GGML_CPU_ARM_ARCH=armv8-a
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||||
|
|
||||||
|
|
@ -13,10 +11,8 @@ WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN if [ "$TARGETARCH" = "amd64" ]; then \
|
RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
|
||||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
|
||||||
elif [ "$TARGETARCH" = "arm64" ]; then \
|
|
||||||
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
|
|
||||||
else \
|
else \
|
||||||
echo "Unsupported architecture"; \
|
echo "Unsupported architecture"; \
|
||||||
exit 1; \
|
exit 1; \
|
||||||
|
|
|
||||||
|
|
@ -61,7 +61,7 @@ RUN apt-get update \
|
||||||
python3 \
|
python3 \
|
||||||
python3-pip \
|
python3-pip \
|
||||||
&& pip install --upgrade pip setuptools wheel \
|
&& pip install --upgrade pip setuptools wheel \
|
||||||
&& pip install -r requirements.txt \
|
&& pip install --break-system-packages -r requirements.txt \
|
||||||
&& apt autoremove -y \
|
&& apt autoremove -y \
|
||||||
&& apt clean -y \
|
&& apt clean -y \
|
||||||
&& rm -rf /tmp/* /var/tmp/* \
|
&& rm -rf /tmp/* /var/tmp/* \
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,30 @@ ARG UBUNTU_VERSION=24.04
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
# Install build tools
|
# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
|
||||||
RUN apt update && apt install -y git build-essential cmake wget
|
|
||||||
|
|
||||||
# Install Vulkan SDK and cURL
|
# Install build tools
|
||||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
RUN apt update && apt install -y git build-essential cmake wget xz-utils
|
||||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
|
|
||||||
apt update -y && \
|
# Install Vulkan SDK
|
||||||
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
|
ARG VULKAN_VERSION=1.4.321.1
|
||||||
|
RUN ARCH=$(uname -m) && \
|
||||||
|
wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
|
||||||
|
mkdir -p /opt/vulkan && \
|
||||||
|
tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
|
||||||
|
mv /tmp/${ARCH}/* /opt/vulkan/ && \
|
||||||
|
rm -rf /tmp/*
|
||||||
|
|
||||||
|
# Install cURL and Vulkan SDK dependencies
|
||||||
|
RUN apt install -y libcurl4-openssl-dev curl \
|
||||||
|
libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
|
||||||
|
|
||||||
|
# Set environment variables
|
||||||
|
ENV VULKAN_SDK=/opt/vulkan
|
||||||
|
ENV PATH=$VULKAN_SDK/bin:$PATH
|
||||||
|
ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
|
||||||
|
ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
|
||||||
|
ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
|
||||||
|
|
||||||
# Build it
|
# Build it
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ body:
|
||||||
attributes:
|
attributes:
|
||||||
label: GGML backends
|
label: GGML backends
|
||||||
description: Which GGML backends do you know to be affected?
|
description: Which GGML backends do you know to be affected?
|
||||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||||
multiple: true
|
multiple: true
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
|
|
|
||||||
|
|
@ -42,7 +42,7 @@ body:
|
||||||
attributes:
|
attributes:
|
||||||
label: GGML backends
|
label: GGML backends
|
||||||
description: Which GGML backends do you know to be affected?
|
description: Which GGML backends do you know to be affected?
|
||||||
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL]
|
options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL, zDNN]
|
||||||
multiple: true
|
multiple: true
|
||||||
validations:
|
validations:
|
||||||
required: true
|
required: true
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,262 @@
|
||||||
|
# Copilot Instructions for llama.cpp
|
||||||
|
|
||||||
|
## Repository Overview
|
||||||
|
|
||||||
|
llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model) inference with minimal setup and dependencies. The project enables running language models on diverse hardware with state-of-the-art performance.
|
||||||
|
|
||||||
|
**Key Facts:**
|
||||||
|
- **Primary language**: C/C++ with Python utility scripts
|
||||||
|
- **Size**: ~200k+ lines of code across 1000+ files
|
||||||
|
- **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
|
||||||
|
- **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
|
||||||
|
- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
|
||||||
|
- **License**: MIT
|
||||||
|
|
||||||
|
## Build Instructions
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
- CMake 3.14+ (primary build system)
|
||||||
|
- C++17 compatible compiler (GCC 13.3+, Clang, MSVC)
|
||||||
|
- Optional: ccache for faster compilation
|
||||||
|
|
||||||
|
### Basic Build (CPU-only)
|
||||||
|
**ALWAYS run these commands in sequence:**
|
||||||
|
```bash
|
||||||
|
cmake -B build
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Build time**: ~10 minutes on 4-core system with ccache enabled, ~25 minutes without ccache.
|
||||||
|
|
||||||
|
**Important Notes:**
|
||||||
|
- The Makefile is deprecated - always use CMake
|
||||||
|
- ccache is automatically detected and used if available
|
||||||
|
- Built binaries are placed in `build/bin/`
|
||||||
|
- Parallel builds (`-j`) significantly reduce build time
|
||||||
|
|
||||||
|
### Backend-Specific Builds
|
||||||
|
For CUDA support:
|
||||||
|
```bash
|
||||||
|
cmake -B build -DGGML_CUDA=ON
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
For Metal (macOS):
|
||||||
|
```bash
|
||||||
|
cmake -B build -DGGML_METAL=ON
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important Note**: While all backends can be built as long as the correct requirements for that backend are installed, you will not be able to run them without the correct hardware. The only backend that can be run for testing and validation is the CPU backend.
|
||||||
|
|
||||||
|
### Debug Builds
|
||||||
|
Single-config generators:
|
||||||
|
```bash
|
||||||
|
cmake -B build -DCMAKE_BUILD_TYPE=Debug
|
||||||
|
cmake --build build
|
||||||
|
```
|
||||||
|
|
||||||
|
Multi-config generators:
|
||||||
|
```bash
|
||||||
|
cmake -B build -G "Xcode"
|
||||||
|
cmake --build build --config Debug
|
||||||
|
```
|
||||||
|
|
||||||
|
### Common Build Issues
|
||||||
|
- **Issue**: Network tests fail in isolated environments
|
||||||
|
**Solution**: Expected behavior - core functionality tests will still pass
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
### Running Tests
|
||||||
|
```bash
|
||||||
|
ctest --test-dir build --output-on-failure -j $(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Test suite**: 38 tests covering tokenizers, grammar parsing, sampling, backends, and integration
|
||||||
|
**Expected failures**: 2-3 tests may fail if network access is unavailable (they download models)
|
||||||
|
**Test time**: ~30 seconds for passing tests
|
||||||
|
|
||||||
|
### Server Unit Tests
|
||||||
|
Run server-specific unit tests after building the server:
|
||||||
|
```bash
|
||||||
|
# Build the server first
|
||||||
|
cmake --build build --target llama-server
|
||||||
|
|
||||||
|
# Navigate to server tests and run
|
||||||
|
cd tools/server/tests
|
||||||
|
source ../../../.venv/bin/activate
|
||||||
|
./tests.sh
|
||||||
|
```
|
||||||
|
**Server test dependencies**: The `.venv` environment includes the required dependencies for server unit tests (pytest, aiohttp, etc.). Tests can be run individually or with various options as documented in `tools/server/tests/README.md`.
|
||||||
|
|
||||||
|
### Test Categories
|
||||||
|
- Tokenizer tests: Various model tokenizers (BERT, GPT-2, LLaMA, etc.)
|
||||||
|
- Grammar tests: GBNF parsing and validation
|
||||||
|
- Backend tests: Core ggml operations across different backends
|
||||||
|
- Integration tests: End-to-end workflows
|
||||||
|
|
||||||
|
### Manual Testing Commands
|
||||||
|
```bash
|
||||||
|
# Test basic inference
|
||||||
|
./build/bin/llama-cli --version
|
||||||
|
|
||||||
|
# Test model loading (requires model file)
|
||||||
|
./build/bin/llama-cli -m path/to/model.gguf -p "Hello" -n 10
|
||||||
|
```
|
||||||
|
|
||||||
|
## Code Quality and Linting
|
||||||
|
|
||||||
|
### C++ Code Formatting
|
||||||
|
**ALWAYS format C++ code before committing:**
|
||||||
|
```bash
|
||||||
|
git clang-format
|
||||||
|
```
|
||||||
|
|
||||||
|
Configuration is in `.clang-format` with these key rules:
|
||||||
|
- 4-space indentation
|
||||||
|
- 120 column limit
|
||||||
|
- Braces on same line for functions
|
||||||
|
- Pointer alignment: `void * ptr` (middle)
|
||||||
|
- Reference alignment: `int & ref` (middle)
|
||||||
|
|
||||||
|
### Python Code
|
||||||
|
**ALWAYS activate the Python environment in `.venv` and use tools from that environment:**
|
||||||
|
```bash
|
||||||
|
# Activate virtual environment
|
||||||
|
source .venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
Configuration files:
|
||||||
|
- `.flake8`: flake8 settings (max-line-length=125, excludes examples/tools)
|
||||||
|
- `pyrightconfig.json`: pyright type checking configuration
|
||||||
|
|
||||||
|
### Pre-commit Hooks
|
||||||
|
Run before committing:
|
||||||
|
```bash
|
||||||
|
pre-commit run --all-files
|
||||||
|
```
|
||||||
|
|
||||||
|
## Continuous Integration
|
||||||
|
|
||||||
|
### GitHub Actions Workflows
|
||||||
|
Key workflows that run on every PR:
|
||||||
|
- `.github/workflows/build.yml`: Multi-platform builds
|
||||||
|
- `.github/workflows/server.yml`: Server functionality tests
|
||||||
|
- `.github/workflows/python-lint.yml`: Python code quality
|
||||||
|
- `.github/workflows/python-type-check.yml`: Python type checking
|
||||||
|
|
||||||
|
### Local CI Validation
|
||||||
|
**Run full CI locally before submitting PRs:**
|
||||||
|
```bash
|
||||||
|
mkdir tmp
|
||||||
|
|
||||||
|
# CPU-only build
|
||||||
|
bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
```
|
||||||
|
|
||||||
|
**CI Runtime**: 30-60 minutes depending on backend configuration
|
||||||
|
|
||||||
|
### Triggering CI
|
||||||
|
Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI infrastructure.
|
||||||
|
|
||||||
|
## Project Layout and Architecture
|
||||||
|
|
||||||
|
### Core Directories
|
||||||
|
- **`src/`**: Main llama library implementation (`llama.cpp`, `llama-*.cpp`)
|
||||||
|
- **`include/`**: Public API headers, primarily `include/llama.h`
|
||||||
|
- **`ggml/`**: Core tensor library (submodule with custom GGML framework)
|
||||||
|
- **`examples/`**: 30+ example applications and tools
|
||||||
|
- **`tools/`**: Additional development and utility tools (server benchmarks, tests)
|
||||||
|
- **`tests/`**: Comprehensive test suite with CTest integration
|
||||||
|
- **`docs/`**: Detailed documentation (build guides, API docs, etc.)
|
||||||
|
- **`scripts/`**: Utility scripts for CI, data processing, and automation
|
||||||
|
- **`common/`**: Shared utility code used across examples
|
||||||
|
|
||||||
|
### Key Files
|
||||||
|
- **`CMakeLists.txt`**: Primary build configuration
|
||||||
|
- **`include/llama.h`**: Main C API header (~2000 lines)
|
||||||
|
- **`src/llama.cpp`**: Core library implementation (~8000 lines)
|
||||||
|
- **`CONTRIBUTING.md`**: Coding guidelines and PR requirements
|
||||||
|
- **`.clang-format`**: C++ formatting rules
|
||||||
|
- **`.pre-commit-config.yaml`**: Git hook configuration
|
||||||
|
|
||||||
|
### Built Executables (in `build/bin/`)
|
||||||
|
Primary tools:
|
||||||
|
- **`llama-cli`**: Main inference tool
|
||||||
|
- **`llama-server`**: OpenAI-compatible HTTP server
|
||||||
|
- **`llama-quantize`**: Model quantization utility
|
||||||
|
- **`llama-perplexity`**: Model evaluation tool
|
||||||
|
- **`llama-bench`**: Performance benchmarking
|
||||||
|
- **`llama-convert-llama2c-to-ggml`**: Model conversion utilities
|
||||||
|
|
||||||
|
### Configuration Files
|
||||||
|
- **CMake**: `CMakeLists.txt`, `cmake/` directory
|
||||||
|
- **Linting**: `.clang-format`, `.clang-tidy`, `.flake8`
|
||||||
|
- **CI**: `.github/workflows/`, `ci/run.sh`
|
||||||
|
- **Git**: `.gitignore` (includes build artifacts, models, cache)
|
||||||
|
|
||||||
|
### Dependencies
|
||||||
|
- **System**: OpenMP, libcurl (for model downloading)
|
||||||
|
- **Optional**: CUDA SDK, Metal framework, Vulkan SDK, Intel oneAPI
|
||||||
|
- **Bundled**: httplib, json (header-only libraries in vendored form)
|
||||||
|
|
||||||
|
## Common Validation Steps
|
||||||
|
|
||||||
|
### After Making Changes
|
||||||
|
1. **Format code**: `git clang-format`
|
||||||
|
2. **Build**: `cmake --build build --config Release`
|
||||||
|
3. **Test**: `ctest --test-dir build --output-on-failure`
|
||||||
|
4. **Server tests** (if modifying server): `cd tools/server/tests && source ../../../.venv/bin/activate && ./tests.sh`
|
||||||
|
5. **Manual validation**: Test relevant tools in `build/bin/`
|
||||||
|
|
||||||
|
### Performance Validation
|
||||||
|
```bash
|
||||||
|
# Benchmark inference performance
|
||||||
|
./build/bin/llama-bench -m model.gguf
|
||||||
|
|
||||||
|
# Evaluate model perplexity
|
||||||
|
./build/bin/llama-perplexity -m model.gguf -f dataset.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backend Validation
|
||||||
|
```bash
|
||||||
|
# Test backend operations
|
||||||
|
./build/bin/test-backend-ops
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment Setup
|
||||||
|
|
||||||
|
### Required Tools
|
||||||
|
- CMake 3.14+ (install via system package manager)
|
||||||
|
- Modern C++ compiler with C++17 support
|
||||||
|
- Git (for submodule management)
|
||||||
|
- Python 3.9+ with virtual environment (`.venv` is provided)
|
||||||
|
|
||||||
|
### Optional but Recommended
|
||||||
|
- ccache: `apt install ccache` or `brew install ccache`
|
||||||
|
- clang-format 15+: Usually included with LLVM/Clang installation
|
||||||
|
- pre-commit: `pip install pre-commit`
|
||||||
|
|
||||||
|
### Backend-Specific Requirements
|
||||||
|
- **CUDA**: NVIDIA CUDA Toolkit 11.2+
|
||||||
|
- **Metal**: Xcode command line tools (macOS only)
|
||||||
|
- **Vulkan**: Vulkan SDK
|
||||||
|
- **SYCL**: Intel oneAPI toolkit
|
||||||
|
|
||||||
|
## Important Guidelines
|
||||||
|
|
||||||
|
### Code Changes
|
||||||
|
- **Minimal dependencies**: Avoid adding new external dependencies
|
||||||
|
- **Cross-platform compatibility**: Test on Linux, macOS, Windows when possible
|
||||||
|
- **Performance focus**: This is a performance-critical inference library
|
||||||
|
- **API stability**: Changes to `include/llama.h` require careful consideration
|
||||||
|
|
||||||
|
### Git Workflow
|
||||||
|
- Always create feature branches from `master`
|
||||||
|
- **Never** commit build artifacts (`build/`, `.ccache/`, `*.o`, `*.gguf`)
|
||||||
|
- Use descriptive commit messages following project conventions
|
||||||
|
|
||||||
|
### Trust These Instructions
|
||||||
|
Only search for additional information if these instructions are incomplete or found to be incorrect. This document contains validated build and test procedures that work reliably across different environments.
|
||||||
|
|
||||||
|
|
@ -22,6 +22,11 @@ Vulkan:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml/include/ggml-vulkan.h
|
- ggml/include/ggml-vulkan.h
|
||||||
- ggml/src/ggml-vulkan/**
|
- ggml/src/ggml-vulkan/**
|
||||||
|
IBM zDNN:
|
||||||
|
- changed-files:
|
||||||
|
- any-glob-to-any-file:
|
||||||
|
- ggml/include/ggml-zdnn.h
|
||||||
|
- ggml/src/ggml-zdnn/**
|
||||||
documentation:
|
documentation:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,60 @@
|
||||||
|
name: Build on RISCV Linux Machine by Cloud-V
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
workflow_dispatch:
|
||||||
|
workflow_call:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
debian-13-riscv64-native: # Bianbu 2.2
|
||||||
|
runs-on: self-hosted
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Install prerequisites
|
||||||
|
run: |
|
||||||
|
sudo apt-get update || true
|
||||||
|
sudo apt-get install -y libatomic1
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Setup Riscv
|
||||||
|
run: |
|
||||||
|
sudo apt-get update || true
|
||||||
|
sudo apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
gcc-14-riscv64-linux-gnu \
|
||||||
|
g++-14-riscv64-linux-gnu \
|
||||||
|
ccache \
|
||||||
|
cmake
|
||||||
|
|
||||||
|
- name: Setup ccache
|
||||||
|
run: |
|
||||||
|
mkdir -p $HOME/.ccache
|
||||||
|
ccache -M 5G -d $HOME/.ccache
|
||||||
|
export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
|
||||||
|
export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
|
||||||
|
echo "$GITHUB_WORKSPACE"
|
||||||
|
echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
|
||||||
|
echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
|
||||||
|
echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
|
||||||
|
echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DLLAMA_CURL=OFF \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DGGML_OPENMP=OFF \
|
||||||
|
-DLLAMA_BUILD_EXAMPLES=ON \
|
||||||
|
-DLLAMA_BUILD_TOOLS=ON \
|
||||||
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
|
-DCMAKE_SYSTEM_NAME=Linux \
|
||||||
|
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
|
||||||
|
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
|
||||||
|
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
|
||||||
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
|
||||||
|
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
|
||||||
|
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
|
||||||
|
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
|
||||||
|
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
|
||||||
|
|
||||||
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
@ -64,7 +64,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-arm64
|
key: macOS-latest-cmake-arm64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -104,7 +104,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-x64
|
key: macOS-latest-cmake-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -144,7 +144,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-arm64-webgpu
|
key: macOS-latest-cmake-arm64-webgpu
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -179,7 +179,6 @@ jobs:
|
||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
export LLAMA_SET_ROWS=0
|
|
||||||
cd build
|
cd build
|
||||||
ctest -L main --verbose --timeout 900
|
ctest -L main --verbose --timeout 900
|
||||||
|
|
||||||
|
|
@ -200,7 +199,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-cpu-cmake
|
key: ubuntu-cpu-cmake
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -252,7 +251,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
|
key: ubuntu-latest-cmake-sanitizer-${{ matrix.sanitizer }}
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -331,7 +330,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-latest-cmake-rpc
|
key: ubuntu-latest-cmake-rpc
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -364,7 +363,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-vulkan
|
key: ubuntu-22-cmake-vulkan
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -401,7 +400,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-webgpu
|
key: ubuntu-22-cmake-webgpu
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -438,14 +437,13 @@ jobs:
|
||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
export LLAMA_SET_ROWS=0
|
|
||||||
cd build
|
cd build
|
||||||
# This is using llvmpipe and runs slower than other backends
|
# This is using llvmpipe and runs slower than other backends
|
||||||
ctest -L main --verbose --timeout 3600
|
ctest -L main --verbose --timeout 3600
|
||||||
|
|
||||||
ubuntu-22-cmake-hip:
|
ubuntu-22-cmake-hip:
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
container: rocm/dev-ubuntu-22.04:6.0.2
|
container: rocm/dev-ubuntu-22.04:6.1.2
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
|
@ -459,7 +457,7 @@ jobs:
|
||||||
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
|
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-hip
|
key: ubuntu-22-cmake-hip
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -473,16 +471,6 @@ jobs:
|
||||||
-DGGML_HIP=ON
|
-DGGML_HIP=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Build with legacy HIP support
|
|
||||||
id: cmake_build_legacy_hip
|
|
||||||
run: |
|
|
||||||
cmake -B build2 -S . \
|
|
||||||
-DCMAKE_C_COMPILER=hipcc \
|
|
||||||
-DCMAKE_CXX_COMPILER=hipcc \
|
|
||||||
-DGGML_HIP_ROCWMMA_FATTN=ON \
|
|
||||||
-DGGML_HIP=ON
|
|
||||||
cmake --build build2 --config Release -j $(nproc)
|
|
||||||
|
|
||||||
ubuntu-22-cmake-musa:
|
ubuntu-22-cmake-musa:
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
|
||||||
|
|
@ -499,7 +487,7 @@ jobs:
|
||||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-musa
|
key: ubuntu-22-cmake-musa
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -544,7 +532,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-sycl
|
key: ubuntu-22-cmake-sycl
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -592,7 +580,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-sycl-fp16
|
key: ubuntu-22-cmake-sycl-fp16
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -623,7 +611,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-ios
|
key: macOS-latest-cmake-ios
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -660,7 +648,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-tvos
|
key: macOS-latest-cmake-tvos
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -732,7 +720,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-swift
|
key: macOS-latest-swift
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -778,7 +766,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-msys2
|
key: windows-msys2
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -846,7 +834,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-${{ matrix.build }}
|
key: windows-latest-cmake-${{ matrix.build }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -960,7 +948,7 @@ jobs:
|
||||||
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
|
apt install -y cmake build-essential ninja-build libgomp1 git libcurl4-openssl-dev
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-latest-cmake-cuda
|
key: ubuntu-latest-cmake-cuda
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -989,7 +977,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Install ccache
|
- name: Install ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-cuda-${{ matrix.cuda }}
|
key: windows-cuda-${{ matrix.cuda }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -1045,7 +1033,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-sycl
|
key: windows-latest-cmake-sycl
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -1082,7 +1070,8 @@ jobs:
|
||||||
write-host "Downloading AMD HIP SDK Installer"
|
write-host "Downloading AMD HIP SDK Installer"
|
||||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||||
write-host "Installing AMD HIP SDK"
|
write-host "Installing AMD HIP SDK"
|
||||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
||||||
|
$proc.WaitForExit(600000)
|
||||||
write-host "Completed AMD HIP SDK installation"
|
write-host "Completed AMD HIP SDK installation"
|
||||||
|
|
||||||
- name: Verify ROCm
|
- name: Verify ROCm
|
||||||
|
|
@ -1091,7 +1080,7 @@ jobs:
|
||||||
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
|
||||||
|
|
||||||
- name: Install ccache
|
- name: Install ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ${{ github.job }}
|
key: ${{ github.job }}
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -1125,6 +1114,11 @@ jobs:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Setup Xcode
|
||||||
|
uses: maxim-lobanov/setup-xcode@v1
|
||||||
|
with:
|
||||||
|
xcode-version: latest-stable
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
@ -1158,7 +1152,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: android-build
|
key: android-build
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,57 @@
|
||||||
|
name: "Copilot Setup Steps"
|
||||||
|
|
||||||
|
# Automatically run the setup steps when they are changed to allow for easy validation, and
|
||||||
|
# allow manual testing through the repository's "Actions" tab
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- .github/workflows/copilot-setup-steps.yml
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- .github/workflows/copilot-setup-steps.yml
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot.
|
||||||
|
copilot-setup-steps:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
# Set the permissions to the lowest permissions possible needed for your steps.
|
||||||
|
# Copilot will be given its own token for its operations.
|
||||||
|
permissions:
|
||||||
|
# If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete.
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
# You can define any steps you want, and they will run before the agent starts.
|
||||||
|
# If you do not check out your code, Copilot will do this for you.
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: ccache
|
||||||
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
|
with:
|
||||||
|
key: copilot-setup-steps
|
||||||
|
evict-old-files: 1d
|
||||||
|
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install build-essential libcurl4-openssl-dev
|
||||||
|
# Install git-clang-format script for formatting only changed code
|
||||||
|
wget -O /tmp/git-clang-format https://raw.githubusercontent.com/llvm/llvm-project/release/18.x/clang/tools/clang-format/git-clang-format
|
||||||
|
sudo cp /tmp/git-clang-format /usr/local/bin/git-clang-format
|
||||||
|
sudo chmod +x /usr/local/bin/git-clang-format
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m venv .venv
|
||||||
|
.venv/bin/activate
|
||||||
|
pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
|
||||||
|
pip install flake8 pyright pre-commit
|
||||||
|
|
@ -32,7 +32,7 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-arm64
|
key: macOS-latest-cmake-arm64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -85,7 +85,7 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: macOS-latest-cmake-x64
|
key: macOS-latest-cmake-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -147,7 +147,7 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-cpu-cmake
|
key: ubuntu-cpu-cmake
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -198,7 +198,7 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: ubuntu-22-cmake-vulkan
|
key: ubuntu-22-cmake-vulkan
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -256,7 +256,7 @@ jobs:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
key: windows-latest-cmake-cpu-${{ matrix.arch }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -328,7 +328,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -398,7 +398,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Install ccache
|
- name: Install ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-cuda-${{ matrix.cuda }}
|
key: windows-cuda-${{ matrix.cuda }}
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -471,7 +471,7 @@ jobs:
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-sycl
|
key: windows-latest-cmake-sycl
|
||||||
variant: ccache
|
variant: ccache
|
||||||
|
|
@ -545,7 +545,7 @@ jobs:
|
||||||
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
|
||||||
|
|
||||||
- name: ccache
|
- name: ccache
|
||||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
uses: ggml-org/ccache-action@v1.2.16
|
||||||
with:
|
with:
|
||||||
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
|
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
|
||||||
evict-old-files: 1d
|
evict-old-files: 1d
|
||||||
|
|
@ -557,7 +557,8 @@ jobs:
|
||||||
write-host "Downloading AMD HIP SDK Installer"
|
write-host "Downloading AMD HIP SDK Installer"
|
||||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||||
write-host "Installing AMD HIP SDK"
|
write-host "Installing AMD HIP SDK"
|
||||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
|
||||||
|
$proc.WaitForExit(600000)
|
||||||
write-host "Completed AMD HIP SDK installation"
|
write-host "Completed AMD HIP SDK installation"
|
||||||
|
|
||||||
- name: Verify ROCm
|
- name: Verify ROCm
|
||||||
|
|
@ -600,7 +601,7 @@ jobs:
|
||||||
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
name: llama-bin-win-hip-${{ matrix.name }}-x64.zip
|
||||||
|
|
||||||
ios-xcode-build:
|
ios-xcode-build:
|
||||||
runs-on: macos-latest
|
runs-on: macos-15
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
|
|
@ -608,6 +609,10 @@ jobs:
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
|
- name: Setup Xcode
|
||||||
|
run: |
|
||||||
|
sudo xcode-select -s /Applications/Xcode_16.4.app
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
|
|
||||||
|
|
@ -147,3 +147,4 @@ poetry.toml
|
||||||
# Local scripts
|
# Local scripts
|
||||||
/run-vim.sh
|
/run-vim.sh
|
||||||
/run-chat.sh
|
/run-chat.sh
|
||||||
|
.ccache/
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
message("CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
|
||||||
|
|
||||||
# Add path to modules
|
# Add path to modules
|
||||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,8 @@
|
||||||
/tools/server/ @ngxson
|
/tools/server/ @ngxson
|
||||||
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
|
||||||
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
|
||||||
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
|
|
||||||
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
|
||||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||||
/ggml/src/gguf.cpp @JohannesGaessler
|
/ggml/src/gguf.cpp @JohannesGaessler
|
||||||
/ggml/src/ggml-vulkan/ @0cc4m
|
/ggml/src/ggml-vulkan/ @0cc4m
|
||||||
|
/ggml/src/ggml-zdnn/ @taronaeo
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,8 @@ LLM inference in C/C++
|
||||||
|
|
||||||
## Hot topics
|
## Hot topics
|
||||||
|
|
||||||
|
- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
|
||||||
|
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
|
||||||
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
|
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
|
||||||
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
|
||||||
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
|
||||||
|
|
@ -135,6 +137,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||||
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
||||||
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
||||||
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
||||||
|
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
|
||||||
|
|
||||||
#### Multimodal
|
#### Multimodal
|
||||||
|
|
||||||
|
|
@ -149,6 +152,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||||
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||||
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
|
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
|
||||||
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
|
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
|
||||||
|
- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)
|
||||||
|
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
|
@ -240,7 +244,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||||
<details>
|
<details>
|
||||||
<summary>Infrastructure</summary>
|
<summary>Infrastructure</summary>
|
||||||
|
|
||||||
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
|
- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
|
||||||
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
|
||||||
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
|
||||||
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
|
||||||
|
|
|
||||||
22
ci/run.sh
22
ci/run.sh
|
|
@ -106,7 +106,7 @@ function gg_wget {
|
||||||
cd $out
|
cd $out
|
||||||
|
|
||||||
# should not re-download if file is the same
|
# should not re-download if file is the same
|
||||||
wget -nv -N $url
|
wget -nv -c -N $url
|
||||||
|
|
||||||
cd $cwd
|
cd $cwd
|
||||||
}
|
}
|
||||||
|
|
@ -386,10 +386,10 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
|
@ -520,8 +520,8 @@ function gg_run_pythia_1_4b {
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
|
@ -651,10 +651,10 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
|
|
||||||
245
common/arg.cpp
245
common/arg.cpp
|
|
@ -749,6 +749,39 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
||||||
// utils
|
// utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// Helper function to parse tensor buffer override strings
|
||||||
|
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
|
||||||
|
std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||||
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||||
|
auto * dev = ggml_backend_dev_get(i);
|
||||||
|
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||||
|
if (buft) {
|
||||||
|
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||||
|
std::string::size_type pos = override.find('=');
|
||||||
|
if (pos == std::string::npos) {
|
||||||
|
throw std::invalid_argument("invalid value");
|
||||||
|
}
|
||||||
|
std::string tensor_name = override.substr(0, pos);
|
||||||
|
std::string buffer_type = override.substr(pos + 1);
|
||||||
|
|
||||||
|
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||||
|
printf("Available buffer types:\n");
|
||||||
|
for (const auto & it : buft_list) {
|
||||||
|
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||||
|
}
|
||||||
|
throw std::invalid_argument("unknown buffer type");
|
||||||
|
}
|
||||||
|
// keep strings alive and avoid leaking memory by storing them in a static vector
|
||||||
|
static std::list<std::string> buft_overrides;
|
||||||
|
buft_overrides.push_back(tensor_name);
|
||||||
|
overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct handle_model_result {
|
struct handle_model_result {
|
||||||
bool found_mmproj = false;
|
bool found_mmproj = false;
|
||||||
common_params_model mmproj;
|
common_params_model mmproj;
|
||||||
|
|
@ -993,6 +1026,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!params.speculative.tensor_buft_overrides.empty()) {
|
||||||
|
params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||||
|
}
|
||||||
|
|
||||||
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
|
||||||
throw std::runtime_error(string_format(
|
throw std::runtime_error(string_format(
|
||||||
"error: the supplied chat template is not supported: %s%s\n",
|
"error: the supplied chat template is not supported: %s%s\n",
|
||||||
|
|
@ -1069,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
||||||
printf("\"\n\n");
|
printf("\"\n\n");
|
||||||
|
|
||||||
printf(" case \"$prev\" in\n");
|
printf(" case \"$prev\" in\n");
|
||||||
printf(" --model)\n");
|
printf(" --model|-m)\n");
|
||||||
printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
|
printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
|
||||||
printf(" return 0\n");
|
printf(" return 0\n");
|
||||||
printf(" ;;\n");
|
printf(" ;;\n");
|
||||||
|
|
@ -1201,6 +1238,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
|
||||||
common_params_print_completion(ctx_arg);
|
common_params_print_completion(ctx_arg);
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
|
params.lr.init();
|
||||||
} catch (const std::invalid_argument & ex) {
|
} catch (const std::invalid_argument & ex) {
|
||||||
fprintf(stderr, "%s\n", ex.what());
|
fprintf(stderr, "%s\n", ex.what());
|
||||||
ctx_arg.params = params_org;
|
ctx_arg.params = params_org;
|
||||||
|
|
@ -1469,6 +1507,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.swa_full = true;
|
params.swa_full = true;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_SWA_FULL"));
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--swa-checkpoints"}, "N",
|
||||||
|
string_format("max number of SWA checkpoints per slot to create (default: %d)\n"
|
||||||
|
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_swa_checkpoints),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.n_swa_checkpoints = value;
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_ARG_SWA_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--kv-unified", "-kvu"},
|
{"--kv-unified", "-kvu"},
|
||||||
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
||||||
|
|
@ -1484,6 +1530,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.ctx_shift = false;
|
params.ctx_shift = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--context-shift"},
|
||||||
|
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.ctx_shift = true;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chunks"}, "N",
|
{"--chunks"}, "N",
|
||||||
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
||||||
|
|
@ -1492,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-fa", "--flash-attn"},
|
{"-fa", "--flash-attn"}, "FA",
|
||||||
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
|
||||||
[](common_params & params) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.flash_attn = true;
|
if (value == "on" || value == "enabled" || value == "1") {
|
||||||
|
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
|
||||||
|
} else if (value == "off" || value == "disabled" || value == "0") {
|
||||||
|
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
|
||||||
|
} else if (value == "auto" || value == "-1") {
|
||||||
|
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
).set_env("LLAMA_ARG_FLASH_ATTN"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
|
@ -1702,7 +1763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.warmup = false;
|
params.warmup = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--spm-infill"},
|
{"--spm-infill"},
|
||||||
string_format(
|
string_format(
|
||||||
|
|
@ -1777,7 +1838,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sampling.top_n_sigma = std::stof(value);
|
params.sampling.top_n_sigma = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--xtc-probability"}, "N",
|
{"--xtc-probability"}, "N",
|
||||||
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
|
||||||
|
|
@ -2201,9 +2262,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-dt", "--defrag-thold"}, "N",
|
{"-dt", "--defrag-thold"}, "N",
|
||||||
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
string_format("KV cache defragmentation threshold (DEPRECATED)"),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.defrag_thold = std::stof(value);
|
GGML_UNUSED(params);
|
||||||
|
GGML_UNUSED(value);
|
||||||
|
LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
|
@ -2349,40 +2412,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
||||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
|
||||||
if (buft_list.empty()) {
|
|
||||||
// enumerate all the devices and add their buffer types to the list
|
|
||||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
||||||
auto * dev = ggml_backend_dev_get(i);
|
|
||||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
|
||||||
if (buft) {
|
|
||||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
|
||||||
std::string::size_type pos = override.find('=');
|
|
||||||
if (pos == std::string::npos) {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
std::string tensor_name = override.substr(0, pos);
|
|
||||||
std::string buffer_type = override.substr(pos + 1);
|
|
||||||
|
|
||||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
|
||||||
printf("Available buffer types:\n");
|
|
||||||
for (const auto & it : buft_list) {
|
|
||||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
|
||||||
}
|
|
||||||
throw std::invalid_argument("unknown buffer type");
|
|
||||||
}
|
|
||||||
// keep strings alive and avoid leaking memory by storing them in a static vector
|
|
||||||
static std::list<std::string> buft_overrides;
|
|
||||||
buft_overrides.push_back(tensor_name);
|
|
||||||
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
|
||||||
|
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
|
||||||
|
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--cpu-moe", "-cmoe"},
|
{"--cpu-moe", "-cmoe"},
|
||||||
"keep all Mixture of Experts (MoE) weights in the CPU",
|
"keep all Mixture of Experts (MoE) weights in the CPU",
|
||||||
|
|
@ -2405,9 +2443,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_N_CPU_MOE"));
|
).set_env("LLAMA_ARG_N_CPU_MOE"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--cpu-moe-draft", "-cmoed"},
|
||||||
|
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
|
||||||
|
[](common_params & params) {
|
||||||
|
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
|
||||||
|
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
if (value < 0) {
|
||||||
|
throw std::invalid_argument("invalid value");
|
||||||
|
}
|
||||||
|
for (int i = 0; i < value; ++i) {
|
||||||
|
static std::list<std::string> buft_overrides_draft;
|
||||||
|
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
|
||||||
|
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||||
"number of layers to store in VRAM",
|
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_gpu_layers = value;
|
params.n_gpu_layers = value;
|
||||||
if (!llama_supports_gpu_offload()) {
|
if (!llama_supports_gpu_offload()) {
|
||||||
|
|
@ -2504,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
{"--lora"}, "FNAME",
|
{"--lora"}, "FNAME",
|
||||||
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
"path to LoRA adapter (can be repeated to use multiple adapters)",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
|
params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
|
||||||
}
|
}
|
||||||
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
||||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||||
|
|
@ -2512,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
{"--lora-scaled"}, "FNAME", "SCALE",
|
{"--lora-scaled"}, "FNAME", "SCALE",
|
||||||
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
"path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
|
||||||
[](common_params & params, const std::string & fname, const std::string & scale) {
|
[](common_params & params, const std::string & fname, const std::string & scale) {
|
||||||
params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
|
params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
|
||||||
}
|
}
|
||||||
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
// we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
|
||||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||||
|
|
@ -2655,7 +2714,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.out_file = value;
|
params.out_file = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ofreq", "--output-frequency"}, "N",
|
{"-ofreq", "--output-frequency"}, "N",
|
||||||
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||||
|
|
@ -2903,13 +2962,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.endpoint_metrics = true;
|
params.endpoint_metrics = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
||||||
add_opt(common_arg(
|
|
||||||
{"--slots"},
|
|
||||||
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
|
||||||
[](common_params & params) {
|
|
||||||
params.endpoint_slots = true;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--props"},
|
{"--props"},
|
||||||
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
||||||
|
|
@ -2917,6 +2969,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.endpoint_props = true;
|
params.endpoint_props = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--slots"},
|
||||||
|
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.endpoint_slots = true;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--no-slots"},
|
{"--no-slots"},
|
||||||
"disables slots monitoring endpoint",
|
"disables slots monitoring endpoint",
|
||||||
|
|
@ -2949,11 +3008,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
|
||||||
"(default: auto)",
|
"(default: auto)",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
|
params.reasoning_format = common_reasoning_format_from_name(value);
|
||||||
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
|
|
||||||
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
|
|
||||||
else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
|
|
||||||
else { throw std::invalid_argument("invalid value"); }
|
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
|
|
@ -3134,7 +3189,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-tbd", "--threads-batch-draft"}, "N",
|
{"-tbd", "--threads-batch-draft"}, "N",
|
||||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
"number of threads to use during batch and prompt processing (default: same as --threads-draft)",
|
||||||
|
|
@ -3144,7 +3199,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-Cd", "--cpu-mask-draft"}, "M",
|
{"-Cd", "--cpu-mask-draft"}, "M",
|
||||||
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
"Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
|
||||||
|
|
@ -3412,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
||||||
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
||||||
params.port = 8012;
|
params.port = 8012;
|
||||||
params.n_gpu_layers = 99;
|
|
||||||
params.flash_attn = true;
|
|
||||||
params.n_ubatch = 1024;
|
params.n_ubatch = 1024;
|
||||||
params.n_batch = 1024;
|
params.n_batch = 1024;
|
||||||
params.n_ctx = 0;
|
params.n_ctx = 0;
|
||||||
|
|
@ -3428,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
||||||
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
||||||
params.port = 8012;
|
params.port = 8012;
|
||||||
params.n_gpu_layers = 99;
|
|
||||||
params.flash_attn = true;
|
|
||||||
params.n_ubatch = 1024;
|
params.n_ubatch = 1024;
|
||||||
params.n_batch = 1024;
|
params.n_batch = 1024;
|
||||||
params.n_ctx = 0;
|
params.n_ctx = 0;
|
||||||
|
|
@ -3444,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||||
params.port = 8012;
|
params.port = 8012;
|
||||||
params.n_gpu_layers = 99;
|
|
||||||
params.flash_attn = true;
|
|
||||||
params.n_ubatch = 1024;
|
params.n_ubatch = 1024;
|
||||||
params.n_batch = 1024;
|
params.n_batch = 1024;
|
||||||
params.n_ctx = 0;
|
params.n_ctx = 0;
|
||||||
|
|
@ -3461,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||||
params.speculative.n_gpu_layers = 99;
|
|
||||||
params.port = 8012;
|
params.port = 8012;
|
||||||
params.n_gpu_layers = 99;
|
|
||||||
params.flash_attn = true;
|
|
||||||
params.n_ubatch = 1024;
|
params.n_ubatch = 1024;
|
||||||
params.n_batch = 1024;
|
params.n_batch = 1024;
|
||||||
params.n_ctx = 0;
|
params.n_ctx = 0;
|
||||||
|
|
@ -3480,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||||
params.speculative.n_gpu_layers = 99;
|
|
||||||
params.port = 8012;
|
params.port = 8012;
|
||||||
params.n_gpu_layers = 99;
|
params.n_ubatch = 1024;
|
||||||
params.flash_attn = true;
|
params.n_batch = 1024;
|
||||||
|
params.n_ctx = 0;
|
||||||
|
params.n_cache_reuse = 256;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--fim-qwen-30b-default"},
|
||||||
|
string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
|
||||||
|
params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
|
||||||
|
params.port = 8012;
|
||||||
params.n_ubatch = 1024;
|
params.n_ubatch = 1024;
|
||||||
params.n_batch = 1024;
|
params.n_batch = 1024;
|
||||||
params.n_ctx = 0;
|
params.n_ctx = 0;
|
||||||
|
|
@ -3537,5 +3594,51 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
||||||
|
|
||||||
|
|
||||||
|
add_opt(
|
||||||
|
common_arg({ "-lr", "--learning-rate" }, "ALPHA",
|
||||||
|
string_format(
|
||||||
|
"adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)",
|
||||||
|
(double) params.lr.lr0),
|
||||||
|
[](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(
|
||||||
|
common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
|
||||||
|
string_format(
|
||||||
|
"(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
|
||||||
|
(double) params.lr.lr_min),
|
||||||
|
[](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(
|
||||||
|
common_arg({ "-decay-epochs", "--learning-rate-decay-epochs" }, "ALPHA",
|
||||||
|
string_format(
|
||||||
|
"(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)",
|
||||||
|
(double) params.lr.decay_epochs),
|
||||||
|
[](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{ "-wd", "--weight-decay" }, "WD",
|
||||||
|
string_format(
|
||||||
|
"adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).",
|
||||||
|
(double) params.lr.wd),
|
||||||
|
[](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(common_arg({ "-val-split", "--val-split" }, "FRACTION",
|
||||||
|
string_format("fraction of data to use as validation set for training (default: %.2g).",
|
||||||
|
(double) params.val_split),
|
||||||
|
[](common_params & params, const std::string & value) { params.val_split = std::stof(value); })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(common_arg({ "-epochs", "--epochs" }, "N",
|
||||||
|
string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
|
||||||
|
[](common_params & params, int epochs) { params.lr.epochs = epochs; })
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
add_opt(common_arg({ "-opt", "--optimizer" }, "sgd|adamw", "adamw or sgd",
|
||||||
|
[](common_params & params, const std::string & name) {
|
||||||
|
params.optimizer = common_opt_get_optimizer(name.c_str());
|
||||||
|
if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
|
||||||
|
throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,15 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
|
||||||
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
|
||||||
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
std::string name = tool_call.contains("name") ? tool_call.at("name") : "";
|
||||||
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
std::string id = tool_call.contains("id") ? tool_call.at("id") : "";
|
||||||
std::string arguments = tool_call.contains("arguments") ? tool_call.at("arguments") : "";
|
std::string arguments = "";
|
||||||
|
if (tool_call.contains("arguments")) {
|
||||||
|
if (tool_call.at("arguments").is_object()) {
|
||||||
|
arguments = tool_call.at("arguments").dump();
|
||||||
|
} else {
|
||||||
|
arguments = tool_call.at("arguments");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return add_tool_call(name, id, arguments);
|
return add_tool_call(name, id, arguments);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
505
common/chat.cpp
505
common/chat.cpp
|
|
@ -147,6 +147,7 @@ struct templates_params {
|
||||||
json extra_context;
|
json extra_context;
|
||||||
bool add_bos;
|
bool add_bos;
|
||||||
bool add_eos;
|
bool add_eos;
|
||||||
|
bool is_inference = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
||||||
|
|
@ -296,6 +297,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
|
||||||
}
|
}
|
||||||
if (!msg.reasoning_content.empty()) {
|
if (!msg.reasoning_content.empty()) {
|
||||||
jmsg["reasoning_content"] = msg.reasoning_content;
|
jmsg["reasoning_content"] = msg.reasoning_content;
|
||||||
|
jmsg["thinking"] = msg.reasoning_content; // gpt-oss
|
||||||
}
|
}
|
||||||
if (!msg.tool_name.empty()) {
|
if (!msg.tool_name.empty()) {
|
||||||
jmsg["name"] = msg.tool_name;
|
jmsg["name"] = msg.tool_name;
|
||||||
|
|
@ -472,11 +474,12 @@ std::string common_chat_format_single(
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
|
std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
|
||||||
common_chat_templates_inputs inputs;
|
common_chat_templates_inputs inputs;
|
||||||
inputs.use_jinja = use_jinja;
|
inputs.use_jinja = use_jinja;
|
||||||
inputs.add_bos = tmpls->add_bos;
|
inputs.add_bos = tmpls->add_bos;
|
||||||
inputs.add_eos = tmpls->add_eos;
|
inputs.add_eos = tmpls->add_eos;
|
||||||
|
inputs.chat_template_kwargs = chat_template_kwargs;
|
||||||
auto add_simple_msg = [&](auto role, auto content) {
|
auto add_simple_msg = [&](auto role, auto content) {
|
||||||
common_chat_msg msg;
|
common_chat_msg msg;
|
||||||
msg.role = role;
|
msg.role = role;
|
||||||
|
|
@ -552,6 +555,17 @@ common_chat_templates_ptr common_chat_templates_init(
|
||||||
default_template_src = CHATML_TEMPLATE_SRC;
|
default_template_src = CHATML_TEMPLATE_SRC;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
|
||||||
|
// Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
|
||||||
|
if (default_template_src.find("<|channel|>") != std::string::npos
|
||||||
|
// search for the error message and patch it
|
||||||
|
&& default_template_src.find("in message.content or") != std::string::npos) {
|
||||||
|
string_replace_all(default_template_src,
|
||||||
|
"{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
|
||||||
|
"{%- if false %}");
|
||||||
|
}
|
||||||
|
|
||||||
std::string token_bos = bos_token_override;
|
std::string token_bos = bos_token_override;
|
||||||
std::string token_eos = eos_token_override;
|
std::string token_eos = eos_token_override;
|
||||||
bool add_bos = false;
|
bool add_bos = false;
|
||||||
|
|
@ -606,7 +620,9 @@ const char * common_chat_format_name(common_chat_format format) {
|
||||||
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
|
||||||
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
|
||||||
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
|
||||||
|
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
|
||||||
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
|
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
|
||||||
|
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error("Unknown chat format");
|
throw std::runtime_error("Unknown chat format");
|
||||||
}
|
}
|
||||||
|
|
@ -623,6 +639,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
|
||||||
|
if (format == "none") {
|
||||||
|
return COMMON_REASONING_FORMAT_NONE;
|
||||||
|
} else if (format == "auto") {
|
||||||
|
return COMMON_REASONING_FORMAT_AUTO;
|
||||||
|
} else if (format == "deepseek") {
|
||||||
|
return COMMON_REASONING_FORMAT_DEEPSEEK;
|
||||||
|
} else if (format == "deepseek-legacy") {
|
||||||
|
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
|
||||||
|
}
|
||||||
|
throw std::runtime_error("Unknown reasoning format: " + format);
|
||||||
|
}
|
||||||
|
|
||||||
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
|
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
|
||||||
std::string arguments;
|
std::string arguments;
|
||||||
if (builder.is_partial()) {
|
if (builder.is_partial()) {
|
||||||
|
|
@ -1309,19 +1338,198 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||||
common_chat_params data;
|
common_chat_params data;
|
||||||
auto prompt = apply(tmpl, inputs);
|
auto prompt = apply(tmpl, inputs);
|
||||||
|
|
||||||
|
// Check if we need to replace the return token with end token during
|
||||||
|
// inference and without generation prompt. For more details see:
|
||||||
|
// https://github.com/ggml-org/llama.cpp/issues/15417
|
||||||
|
if (inputs.is_inference && !inputs.add_generation_prompt) {
|
||||||
|
static constexpr std::string_view return_token = "<|return|>";
|
||||||
|
static constexpr std::string_view end_token = "<|end|>";
|
||||||
|
if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
|
||||||
|
prompt.replace(pos, return_token.length(), end_token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
data.prompt = prompt;
|
data.prompt = prompt;
|
||||||
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
|
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
|
||||||
|
|
||||||
// TODO: support tool calls in GPT-OSS?
|
// These special tokens are required to parse properly, so we include them
|
||||||
|
// even if parse_tool_calls is false.
|
||||||
|
data.preserved_tokens = {
|
||||||
|
"<|channel|>",
|
||||||
|
"<|constrain|>",
|
||||||
|
"<|message|>",
|
||||||
|
"<|start|>",
|
||||||
|
"<|end|>",
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!inputs.json_schema.is_null()) {
|
||||||
|
data.grammar_lazy = false;
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
auto schema = inputs.json_schema;
|
||||||
|
builder.resolve_refs(schema);
|
||||||
|
|
||||||
|
auto not_end = builder.add_rule("not-end",
|
||||||
|
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
|
||||||
|
auto analysis = builder.add_rule("analysis",
|
||||||
|
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
||||||
|
auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
|
||||||
|
auto final = builder.add_rule("final",
|
||||||
|
"\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
|
||||||
|
builder.add_schema("response", schema)
|
||||||
|
);
|
||||||
|
|
||||||
|
builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inputs.tools.is_array() && !inputs.tools.empty()) {
|
||||||
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
// tool calls can appear in commentary or analysis channels
|
||||||
|
auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
|
||||||
|
|
||||||
|
std::vector<std::string> tool_rules_recipient_in_role;
|
||||||
|
std::vector<std::string> tool_rules_recipient_in_channel;
|
||||||
|
foreach_function(inputs.tools, [&](const json & tool) {
|
||||||
|
const auto & function = tool.at("function");
|
||||||
|
std::string name = function.at("name");
|
||||||
|
auto parameters = function.at("parameters");
|
||||||
|
builder.resolve_refs(parameters);
|
||||||
|
|
||||||
|
tool_rules_recipient_in_role.push_back(
|
||||||
|
builder.add_rule(name + "-call",
|
||||||
|
"\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
|
||||||
|
builder.add_schema(name + "-args", parameters)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
tool_rules_recipient_in_channel.push_back(
|
||||||
|
builder.add_rule(name + "-call",
|
||||||
|
"\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
|
||||||
|
builder.add_schema(name + "-args", parameters)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||||
|
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||||
|
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||||
|
);
|
||||||
|
|
||||||
|
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
|
||||||
|
channel + " \" to=functions.\" ( " +
|
||||||
|
string_join(tool_rules_recipient_in_channel, " | ") + " )"
|
||||||
|
);
|
||||||
|
|
||||||
|
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||||
|
|
||||||
|
// Trigger on tool calls that appear in the commentary channel
|
||||||
|
data.grammar_triggers.push_back({
|
||||||
|
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||||
|
"<\\|channel\\|>(commentary|analysis) to"
|
||||||
|
});
|
||||||
|
|
||||||
|
// Trigger tool calls that appear in the role section, either at the
|
||||||
|
// start or in the middle.
|
||||||
|
data.grammar_triggers.push_back({
|
||||||
|
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
|
||||||
|
"^ to"
|
||||||
|
});
|
||||||
|
|
||||||
|
data.grammar_triggers.push_back({
|
||||||
|
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||||
|
"<\\|start\\|>assistant to"
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
|
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
|
||||||
// TODO @ngxson : this won't work with --special enabled, we should fix that
|
static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
|
||||||
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
|
static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
|
||||||
if (!builder.syntax().parse_tool_calls) {
|
|
||||||
builder.add_content(builder.consume_rest());
|
static const common_regex start_regex("<\\|start\\|>assistant");
|
||||||
return;
|
static const common_regex analysis_regex("<\\|channel\\|>analysis");
|
||||||
|
static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
|
||||||
|
static const common_regex preamble_regex("<\\|channel\\|>commentary");
|
||||||
|
static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
|
||||||
|
static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
|
||||||
|
|
||||||
|
auto consume_end = [&](bool include_end = false) {
|
||||||
|
if (auto res = builder.try_find_literal("<|end|>")) {
|
||||||
|
return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
|
||||||
|
}
|
||||||
|
return builder.consume_rest();
|
||||||
|
};
|
||||||
|
|
||||||
|
auto handle_tool_call = [&](const std::string & name) {
|
||||||
|
if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
|
||||||
|
if (builder.syntax().parse_tool_calls) {
|
||||||
|
if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
|
||||||
|
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||||
|
}
|
||||||
|
} else if (args->is_partial) {
|
||||||
|
throw common_chat_msg_partial_exception("incomplete tool call");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
|
||||||
|
auto match = regex.search(input, 0, true);
|
||||||
|
if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
|
||||||
|
return match;
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
};
|
||||||
|
|
||||||
|
do {
|
||||||
|
auto header_start_pos = builder.pos();
|
||||||
|
auto content_start = builder.try_find_literal("<|message|>");
|
||||||
|
if (!content_start) {
|
||||||
|
throw common_chat_msg_partial_exception("incomplete header");
|
||||||
|
}
|
||||||
|
|
||||||
|
auto header = content_start->prelude;
|
||||||
|
|
||||||
|
if (auto match = regex_match(tool_call1_regex, header)) {
|
||||||
|
auto group = match->groups[1];
|
||||||
|
auto name = header.substr(group.begin, group.end - group.begin);
|
||||||
|
handle_tool_call(name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (auto match = regex_match(tool_call2_regex, header)) {
|
||||||
|
auto group = match->groups[2];
|
||||||
|
auto name = header.substr(group.begin, group.end - group.begin);
|
||||||
|
handle_tool_call(name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (regex_match(analysis_regex, header)) {
|
||||||
|
builder.move_to(header_start_pos);
|
||||||
|
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
|
||||||
|
builder.add_content(consume_end(true));
|
||||||
|
} else {
|
||||||
|
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
|
||||||
|
builder.add_content(consume_end());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Possibly a malformed message, attempt to recover by rolling
|
||||||
|
// back to pick up the next <|start|>
|
||||||
|
LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
|
||||||
|
builder.move_to(header_start_pos);
|
||||||
|
} while (builder.try_find_regex(start_regex, std::string::npos, false));
|
||||||
|
|
||||||
|
auto remaining = builder.consume_rest();
|
||||||
|
if (!remaining.empty()) {
|
||||||
|
LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1734,6 +1942,212 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
|
||||||
builder.add_content(builder.consume_rest());
|
builder.add_content(builder.consume_rest());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||||
|
common_chat_params data;
|
||||||
|
|
||||||
|
// Pass thinking context for Granite template
|
||||||
|
json additional_context = {
|
||||||
|
{"thinking", inputs.enable_thinking},
|
||||||
|
};
|
||||||
|
|
||||||
|
data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
|
||||||
|
data.format = COMMON_CHAT_FORMAT_GRANITE;
|
||||||
|
|
||||||
|
if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
|
||||||
|
if (!inputs.enable_thinking) {
|
||||||
|
data.prompt += "</think>";
|
||||||
|
} else {
|
||||||
|
data.thinking_forced_open = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!inputs.tools.is_null()) {
|
||||||
|
// Granite uses <|tool_call|> followed by JSON list
|
||||||
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
std::vector<std::string> tool_rules;
|
||||||
|
foreach_function(inputs.tools, [&](const json & tool) {
|
||||||
|
const auto & function = tool.at("function");
|
||||||
|
std::string name = function.at("name");
|
||||||
|
auto parameters = function.at("parameters");
|
||||||
|
builder.resolve_refs(parameters);
|
||||||
|
tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
|
||||||
|
"-args", {
|
||||||
|
{"type", "object"},
|
||||||
|
{"properties", {
|
||||||
|
{"name", {{"const", name}}},
|
||||||
|
{"arguments", parameters},
|
||||||
|
}},
|
||||||
|
{"required", json::array({"name", "arguments"})},
|
||||||
|
})));
|
||||||
|
});
|
||||||
|
|
||||||
|
auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
|
||||||
|
auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
|
||||||
|
|
||||||
|
if (data.thinking_forced_open) {
|
||||||
|
builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
|
||||||
|
} else {
|
||||||
|
builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
|
||||||
|
}
|
||||||
|
|
||||||
|
data.grammar_triggers.push_back({
|
||||||
|
COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
|
||||||
|
"<|tool_call|>"
|
||||||
|
});
|
||||||
|
|
||||||
|
data.preserved_tokens = {
|
||||||
|
"<think>",
|
||||||
|
"</think>",
|
||||||
|
"<response>",
|
||||||
|
"</response>",
|
||||||
|
"<|tool_call|>",
|
||||||
|
};
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// Handle thinking tags for non-tool responses
|
||||||
|
if (data.thinking_forced_open && inputs.enable_thinking) {
|
||||||
|
data.grammar_lazy = false;
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
|
||||||
|
});
|
||||||
|
data.preserved_tokens = {
|
||||||
|
"<think>",
|
||||||
|
"</think>",
|
||||||
|
"<response>",
|
||||||
|
"</response>",
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
|
||||||
|
// Parse thinking tags
|
||||||
|
builder.try_parse_reasoning("<think>", "</think>");
|
||||||
|
|
||||||
|
// Parse response tags using regex
|
||||||
|
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
|
||||||
|
if (auto res = builder.try_find_regex(response_regex)) {
|
||||||
|
// Extract the content between the tags (capture group 1)
|
||||||
|
auto content = builder.str(res->groups[1]);
|
||||||
|
builder.add_content(content);
|
||||||
|
builder.move_to(res->groups[0].end);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!builder.syntax().parse_tool_calls) {
|
||||||
|
builder.add_content(builder.consume_rest());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Look for tool calls
|
||||||
|
static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
|
||||||
|
if (auto res = builder.try_find_regex(tool_call_regex)) {
|
||||||
|
builder.move_to(res->groups[0].end);
|
||||||
|
|
||||||
|
// Expect JSON array of tool calls
|
||||||
|
auto tool_calls_data = builder.consume_json();
|
||||||
|
if (tool_calls_data.json.is_array()) {
|
||||||
|
if (!builder.add_tool_calls(tool_calls_data.json)) {
|
||||||
|
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.add_content(builder.consume_rest());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
|
||||||
|
// Parse thinking tags first - this handles the main reasoning content
|
||||||
|
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
|
||||||
|
|
||||||
|
if (!builder.syntax().parse_tool_calls) {
|
||||||
|
builder.add_content(builder.consume_rest());
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse tool calls - Seed-OSS uses <seed:tool_call> format
|
||||||
|
static const common_regex tool_call_begin_regex("<seed:tool_call>");
|
||||||
|
static const common_regex tool_call_end_regex("</seed:tool_call>");
|
||||||
|
static const common_regex function_regex("<function=([^>]+)>");
|
||||||
|
static const common_regex param_regex("<parameter=([^>]+)>");
|
||||||
|
|
||||||
|
while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
|
||||||
|
builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
|
||||||
|
|
||||||
|
// Look for function call inside tool call, ignore any content before it
|
||||||
|
if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
|
||||||
|
auto function_name = builder.str(func_res->groups[1]);
|
||||||
|
|
||||||
|
// Parse Seed-OSS parameters <parameter=name>value</parameter>
|
||||||
|
json args = json::object();
|
||||||
|
// Parse all parameters
|
||||||
|
while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
|
||||||
|
// again, ignore noise around parameters
|
||||||
|
auto param_name = builder.str(param_res->groups[1]);
|
||||||
|
builder.move_to(param_res->groups[0].end);
|
||||||
|
builder.consume_spaces(); // Consume whitespace after parameter
|
||||||
|
auto savedPos = builder.pos();
|
||||||
|
if (auto param_parse = builder.try_find_literal("</parameter>")) {
|
||||||
|
auto param = param_parse->prelude;
|
||||||
|
builder.move_to(savedPos);
|
||||||
|
try {
|
||||||
|
if (auto param_res = builder.try_consume_json()) {
|
||||||
|
args[param_name] = param_res->json;
|
||||||
|
} else {
|
||||||
|
args[param_name] = param;
|
||||||
|
}
|
||||||
|
} catch (json::exception &) {
|
||||||
|
args[param_name] = param;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw common_chat_msg_partial_exception("Incomplete tool parameter");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Look for closing function tag
|
||||||
|
auto end_func = builder.try_find_literal("</function>");
|
||||||
|
if (end_func) {
|
||||||
|
builder.move_to(end_func->groups[0].end);
|
||||||
|
builder.consume_spaces(); // Consume whitespace after </function>
|
||||||
|
|
||||||
|
// Add the tool call with parsed arguments, but only if we REALLY got the literal
|
||||||
|
auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
|
||||||
|
auto funlen = std::string("</function>").length();
|
||||||
|
if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
|
||||||
|
if (!builder.add_tool_call(function_name, "", args.dump())) {
|
||||||
|
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||||
|
}
|
||||||
|
// Look for closing tool call tag
|
||||||
|
if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
|
||||||
|
builder.move_to(end_tool->groups[0].end);
|
||||||
|
builder.consume_spaces(); // Consume trailing whitespace after tool call
|
||||||
|
} else {
|
||||||
|
throw common_chat_msg_partial_exception("Incomplete tool call");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No function found - don't consume content here, let it be handled at the end
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume any remaining whitespace after all tool call processing
|
||||||
|
builder.consume_spaces();
|
||||||
|
auto remaining = builder.consume_rest();
|
||||||
|
// If there's any non-whitespace content remaining, add it as content
|
||||||
|
if (!string_strip(remaining).empty()) {
|
||||||
|
builder.add_content(remaining);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
|
||||||
common_chat_params data;
|
common_chat_params data;
|
||||||
data.prompt = apply(tmpl, inputs);
|
data.prompt = apply(tmpl, inputs);
|
||||||
|
|
@ -1750,6 +2164,60 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static common_chat_params common_chat_params_init_seed_oss(
|
||||||
|
const common_chat_template & tmpl,
|
||||||
|
templates_params & params,
|
||||||
|
const common_chat_templates_inputs & inputs)
|
||||||
|
{
|
||||||
|
common_chat_params data;
|
||||||
|
data.prompt = apply(tmpl, params);
|
||||||
|
data.format = COMMON_CHAT_FORMAT_SEED_OSS;
|
||||||
|
if (string_ends_with(data.prompt, "<seed:think>")) {
|
||||||
|
if (!inputs.enable_thinking) {
|
||||||
|
data.prompt += "</seed:think>";
|
||||||
|
} else {
|
||||||
|
data.thinking_forced_open = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.tools.is_array() && !params.tools.empty()) {
|
||||||
|
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||||
|
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||||
|
std::vector<std::string> tool_rules;
|
||||||
|
foreach_function(params.tools, [&](const json & tool) {
|
||||||
|
const auto & function = tool.at("function");
|
||||||
|
std::string name = function.at("name");
|
||||||
|
auto parameters = function.at("parameters");
|
||||||
|
builder.resolve_refs(parameters);
|
||||||
|
|
||||||
|
// Create rule for Seed-OSS function call format
|
||||||
|
std::string param_rules;
|
||||||
|
if (parameters.contains("properties")) {
|
||||||
|
for (const auto & [key, value] : parameters.at("properties").items()) {
|
||||||
|
param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
|
||||||
|
"\"</parameter>\"";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tool_rules.push_back(builder.add_rule(name + "-call",
|
||||||
|
"\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
|
||||||
|
param_rules +
|
||||||
|
" \"</function>\" space \"</seed:tool_call>\""));
|
||||||
|
});
|
||||||
|
|
||||||
|
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
|
||||||
|
|
||||||
|
data.preserved_tokens = {
|
||||||
|
"<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
|
||||||
|
"<function=", "</function>", "<parameter=", "</parameter>",
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.add_rule("root", string_join(tool_rules, " | "));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
static common_chat_params common_chat_templates_apply_jinja(
|
static common_chat_params common_chat_templates_apply_jinja(
|
||||||
const struct common_chat_templates * tmpls,
|
const struct common_chat_templates * tmpls,
|
||||||
const struct common_chat_templates_inputs & inputs)
|
const struct common_chat_templates_inputs & inputs)
|
||||||
|
|
@ -1767,8 +2235,8 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||||
params.enable_thinking = inputs.enable_thinking;
|
params.enable_thinking = inputs.enable_thinking;
|
||||||
params.grammar = inputs.grammar;
|
params.grammar = inputs.grammar;
|
||||||
params.now = inputs.now;
|
params.now = inputs.now;
|
||||||
params.add_bos = inputs.add_bos;
|
params.add_bos = tmpls->add_bos;
|
||||||
params.add_eos = inputs.add_eos;
|
params.add_eos = tmpls->add_eos;
|
||||||
|
|
||||||
params.extra_context = json::object();
|
params.extra_context = json::object();
|
||||||
for (auto el : inputs.chat_template_kwargs) {
|
for (auto el : inputs.chat_template_kwargs) {
|
||||||
|
|
@ -1805,16 +2273,26 @@ static common_chat_params common_chat_templates_apply_jinja(
|
||||||
return common_chat_params_init_command_r7b(tmpl, params);
|
return common_chat_params_init_command_r7b(tmpl, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Granite (IBM) - detects thinking / tools support
|
||||||
|
if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
|
||||||
|
return common_chat_params_init_granite(tmpl, params);
|
||||||
|
}
|
||||||
|
|
||||||
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
|
||||||
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
|
if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
|
||||||
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
return common_chat_params_init_hermes_2_pro(tmpl, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
// GPT-OSS
|
// GPT-OSS
|
||||||
if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
|
if (src.find("<|channel|>") != std::string::npos) {
|
||||||
return common_chat_params_init_gpt_oss(tmpl, params);
|
return common_chat_params_init_gpt_oss(tmpl, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Seed-OSS
|
||||||
|
if (src.find("<seed:think>") != std::string::npos) {
|
||||||
|
return common_chat_params_init_seed_oss(tmpl, params, inputs);
|
||||||
|
}
|
||||||
|
|
||||||
// Use generic handler when mixing tools + JSON schema.
|
// Use generic handler when mixing tools + JSON schema.
|
||||||
// TODO: support that mix in handlers below.
|
// TODO: support that mix in handlers below.
|
||||||
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
if ((params.tools.is_array() && params.json_schema.is_object())) {
|
||||||
|
|
@ -1865,6 +2343,7 @@ static common_chat_params common_chat_templates_apply_legacy(
|
||||||
int alloc_size = 0;
|
int alloc_size = 0;
|
||||||
std::vector<llama_chat_message> chat;
|
std::vector<llama_chat_message> chat;
|
||||||
std::vector<std::string> contents;
|
std::vector<std::string> contents;
|
||||||
|
|
||||||
for (const auto & msg : inputs.messages) {
|
for (const auto & msg : inputs.messages) {
|
||||||
auto content = msg.content;
|
auto content = msg.content;
|
||||||
for (const auto & part : msg.content_parts) {
|
for (const auto & part : msg.content_parts) {
|
||||||
|
|
@ -1966,9 +2445,15 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
||||||
case COMMON_CHAT_FORMAT_COMMAND_R7B:
|
case COMMON_CHAT_FORMAT_COMMAND_R7B:
|
||||||
common_chat_parse_command_r7b(builder);
|
common_chat_parse_command_r7b(builder);
|
||||||
break;
|
break;
|
||||||
|
case COMMON_CHAT_FORMAT_GRANITE:
|
||||||
|
common_chat_parse_granite(builder);
|
||||||
|
break;
|
||||||
case COMMON_CHAT_FORMAT_GPT_OSS:
|
case COMMON_CHAT_FORMAT_GPT_OSS:
|
||||||
common_chat_parse_gpt_oss(builder);
|
common_chat_parse_gpt_oss(builder);
|
||||||
break;
|
break;
|
||||||
|
case COMMON_CHAT_FORMAT_SEED_OSS:
|
||||||
|
common_chat_parse_seed_oss(builder);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
|
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -109,7 +109,9 @@ enum common_chat_format {
|
||||||
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
|
||||||
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
COMMON_CHAT_FORMAT_HERMES_2_PRO,
|
||||||
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
COMMON_CHAT_FORMAT_COMMAND_R7B,
|
||||||
|
COMMON_CHAT_FORMAT_GRANITE,
|
||||||
COMMON_CHAT_FORMAT_GPT_OSS,
|
COMMON_CHAT_FORMAT_GPT_OSS,
|
||||||
|
COMMON_CHAT_FORMAT_SEED_OSS,
|
||||||
|
|
||||||
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
|
||||||
};
|
};
|
||||||
|
|
@ -186,10 +188,12 @@ std::string common_chat_format_single(
|
||||||
// Returns an example of formatted chat
|
// Returns an example of formatted chat
|
||||||
std::string common_chat_format_example(
|
std::string common_chat_format_example(
|
||||||
const struct common_chat_templates * tmpls,
|
const struct common_chat_templates * tmpls,
|
||||||
bool use_jinja);
|
bool use_jinja,
|
||||||
|
const std::map<std::string, std::string> & chat_template_kwargs);
|
||||||
|
|
||||||
const char* common_chat_format_name(common_chat_format format);
|
const char* common_chat_format_name(common_chat_format format);
|
||||||
const char* common_reasoning_format_name(common_reasoning_format format);
|
const char* common_reasoning_format_name(common_reasoning_format format);
|
||||||
|
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
|
||||||
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
|
||||||
|
|
||||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,7 @@
|
||||||
#endif
|
#endif
|
||||||
#include <locale>
|
#include <locale>
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
|
#include <string.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <io.h>
|
#include <io.h>
|
||||||
#else
|
#else
|
||||||
|
|
@ -557,13 +558,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
|
||||||
|
|
||||||
auto detokenized = common_token_to_piece(ctx, token);
|
auto detokenized = common_token_to_piece(ctx, token);
|
||||||
|
|
||||||
detokenized.erase(
|
|
||||||
std::remove_if(
|
|
||||||
detokenized.begin(),
|
|
||||||
detokenized.end(),
|
|
||||||
[](const unsigned char c) { return !std::isprint(c); }),
|
|
||||||
detokenized.end());
|
|
||||||
|
|
||||||
buf << "'" << detokenized << "'"
|
buf << "'" << detokenized << "'"
|
||||||
<< ":" << std::to_string(token);
|
<< ":" << std::to_string(token);
|
||||||
}
|
}
|
||||||
|
|
@ -588,13 +582,6 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
|
||||||
|
|
||||||
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
|
auto detokenized = common_token_to_piece(ctx, batch.token[i]);
|
||||||
|
|
||||||
detokenized.erase(
|
|
||||||
std::remove_if(
|
|
||||||
detokenized.begin(),
|
|
||||||
detokenized.end(),
|
|
||||||
[](const unsigned char c) { return !std::isprint(c); }),
|
|
||||||
detokenized.end());
|
|
||||||
|
|
||||||
buf << "\n" << std::to_string(i)
|
buf << "\n" << std::to_string(i)
|
||||||
<< ", token '" << detokenized << "'"
|
<< ", token '" << detokenized << "'"
|
||||||
<< ", pos " << std::to_string(batch.pos[i])
|
<< ", pos " << std::to_string(batch.pos[i])
|
||||||
|
|
@ -914,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
|
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
||||||
|
__func__, params.model.path.c_str());
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -924,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
|
|
||||||
llama_context * lctx = llama_init_from_model(model, cparams);
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
|
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
|
||||||
|
__func__, params.model.path.c_str());
|
||||||
llama_model_free(model);
|
llama_model_free(model);
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
@ -1001,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char buf[1024];
|
||||||
la.ptr = lora.get();
|
la.ptr = lora.get();
|
||||||
|
llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
|
||||||
|
la.task_name = buf;
|
||||||
|
llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
|
||||||
|
la.prompt_prefix = buf;
|
||||||
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1165,11 +1159,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||||
cparams.pooling_type = params.pooling_type;
|
cparams.pooling_type = params.pooling_type;
|
||||||
cparams.attention_type = params.attention_type;
|
cparams.attention_type = params.attention_type;
|
||||||
cparams.defrag_thold = params.defrag_thold;
|
cparams.flash_attn_type = params.flash_attn_type;
|
||||||
cparams.cb_eval = params.cb_eval;
|
cparams.cb_eval = params.cb_eval;
|
||||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||||
cparams.offload_kqv = !params.no_kv_offload;
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
cparams.flash_attn = params.flash_attn;
|
|
||||||
cparams.no_perf = params.no_perf;
|
cparams.no_perf = params.no_perf;
|
||||||
cparams.op_offload = !params.no_op_offload;
|
cparams.op_offload = !params.no_op_offload;
|
||||||
cparams.swa_full = params.swa_full;
|
cparams.swa_full = params.swa_full;
|
||||||
|
|
@ -1565,3 +1558,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
|
||||||
|
ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
|
||||||
|
const lr_opt & d = *(lr_opt *) userdata;
|
||||||
|
result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
|
||||||
|
result.sgd.wd = result.adamw.wd = d.wd;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO make all command line args case-insensitive
|
||||||
|
static inline bool eq_case_insensitive(char const* a, char const* b) {
|
||||||
|
return !
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
_stricmp
|
||||||
|
#else
|
||||||
|
strcasecmp
|
||||||
|
#endif // defined(_MSC_VER)
|
||||||
|
(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
|
||||||
|
if (eq_case_insensitive("adamw", n)) {
|
||||||
|
return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||||
|
}
|
||||||
|
if (eq_case_insensitive("sgd", n)) {
|
||||||
|
return GGML_OPT_OPTIMIZER_TYPE_SGD;
|
||||||
|
}
|
||||||
|
return GGML_OPT_OPTIMIZER_TYPE_COUNT;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO simplify to use just log and exp
|
||||||
|
static float const k_log_2 = std::log(2.f);
|
||||||
|
|
||||||
|
void lr_opt::init() {
|
||||||
|
if (lr_min > 0 && lr_min < lr0) {
|
||||||
|
float nhalf = std::log(lr0 / lr_min) / k_log_2;
|
||||||
|
float e = epochs;
|
||||||
|
if (decay_epochs > 0 && decay_epochs < e) {
|
||||||
|
e = decay_epochs;
|
||||||
|
} else {
|
||||||
|
decay_epochs = e;
|
||||||
|
}
|
||||||
|
scale_epoch = nhalf / e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float lr_opt::get_lr(float epoch) const {
|
||||||
|
float r = lr_min <= 0 ? lr0 :
|
||||||
|
epoch >= decay_epochs ? lr_min :
|
||||||
|
lr0 * std::pow(0.5f, epoch * scale_epoch);
|
||||||
|
LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,17 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama-cpp.h"
|
|
||||||
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "ggml-opt.h"
|
||||||
|
#include "llama-cpp.h"
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
|
|
@ -31,6 +34,9 @@ struct common_adapter_lora_info {
|
||||||
std::string path;
|
std::string path;
|
||||||
float scale;
|
float scale;
|
||||||
|
|
||||||
|
std::string task_name;
|
||||||
|
std::string prompt_prefix;
|
||||||
|
|
||||||
struct llama_adapter_lora * ptr;
|
struct llama_adapter_lora * ptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -82,6 +88,7 @@ enum llama_example {
|
||||||
LLAMA_EXAMPLE_PARALLEL,
|
LLAMA_EXAMPLE_PARALLEL,
|
||||||
LLAMA_EXAMPLE_TTS,
|
LLAMA_EXAMPLE_TTS,
|
||||||
LLAMA_EXAMPLE_DIFFUSION,
|
LLAMA_EXAMPLE_DIFFUSION,
|
||||||
|
LLAMA_EXAMPLE_FINETUNE,
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
LLAMA_EXAMPLE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
@ -202,6 +209,7 @@ struct common_params_speculative {
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||||
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||||
|
|
||||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||||
|
|
@ -234,13 +242,36 @@ struct common_params_diffusion {
|
||||||
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
|
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// reasoning API response format (not to be confused as chat template's reasoning format)
|
||||||
enum common_reasoning_format {
|
enum common_reasoning_format {
|
||||||
COMMON_REASONING_FORMAT_NONE,
|
COMMON_REASONING_FORMAT_NONE,
|
||||||
COMMON_REASONING_FORMAT_AUTO,
|
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
|
||||||
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
||||||
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
|
||||||
|
// do not extend this enum unless you absolutely have to
|
||||||
|
// in most cases, use COMMON_REASONING_FORMAT_AUTO
|
||||||
|
// see: https://github.com/ggml-org/llama.cpp/pull/15408
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct lr_opt {
|
||||||
|
float lr0 = 1e-5; // learning rate at first epoch
|
||||||
|
float lr_min = -1;
|
||||||
|
float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
|
||||||
|
float scale_epoch = 0;
|
||||||
|
float wd = 0;
|
||||||
|
unsigned epochs = 2;
|
||||||
|
|
||||||
|
unsigned epoch; // set by optimizer outer (epochs) loop
|
||||||
|
// learning rate decay - constant LR per epoch only for now
|
||||||
|
float get_lr(float e) const;
|
||||||
|
float get_lr() const { return get_lr(epoch); }
|
||||||
|
// must call after arg parse, before get_lr
|
||||||
|
void init();
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
||||||
|
|
||||||
struct common_params {
|
struct common_params {
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 4096; // context size
|
int32_t n_ctx = 4096; // context size
|
||||||
|
|
@ -260,7 +291,6 @@ struct common_params {
|
||||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
float defrag_thold = 0.1f; // KV cache defragmentation threshold
|
|
||||||
|
|
||||||
// offload params
|
// offload params
|
||||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||||
|
|
@ -282,6 +312,7 @@ struct common_params {
|
||||||
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
|
||||||
|
|
||||||
struct common_params_sampling sampling;
|
struct common_params_sampling sampling;
|
||||||
struct common_params_speculative speculative;
|
struct common_params_speculative speculative;
|
||||||
|
|
@ -345,9 +376,8 @@ struct common_params {
|
||||||
bool multiline_input = false; // reverse the usage of `\`
|
bool multiline_input = false; // reverse the usage of `\`
|
||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
bool flash_attn = false; // flash attention
|
|
||||||
bool no_perf = false; // disable performance metrics
|
bool no_perf = false; // disable performance metrics
|
||||||
bool ctx_shift = true; // context shift on inifinite text generation
|
bool ctx_shift = false; // context shift on infinite text generation
|
||||||
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||||
bool kv_unified = false; // enable unified KV cache
|
bool kv_unified = false; // enable unified KV cache
|
||||||
|
|
||||||
|
|
@ -375,6 +405,11 @@ struct common_params {
|
||||||
bool no_mmproj = false; // explicitly disable multimodal model
|
bool no_mmproj = false; // explicitly disable multimodal model
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
|
// finetune
|
||||||
|
struct lr_opt lr;
|
||||||
|
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
|
||||||
|
float val_split = 0.05f; // fraction of the data used for the validation set
|
||||||
|
|
||||||
// embedding
|
// embedding
|
||||||
bool embedding = false; // get only sentence embedding
|
bool embedding = false; // get only sentence embedding
|
||||||
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||||
|
|
@ -388,6 +423,7 @@ struct common_params {
|
||||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||||
|
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = ""; // NOLINT
|
||||||
|
|
@ -408,7 +444,7 @@ struct common_params {
|
||||||
|
|
||||||
// "advanced" endpoints are disabled by default for better security
|
// "advanced" endpoints are disabled by default for better security
|
||||||
bool webui = true;
|
bool webui = true;
|
||||||
bool endpoint_slots = false;
|
bool endpoint_slots = true;
|
||||||
bool endpoint_props = false; // only control POST requests, not GET
|
bool endpoint_props = false; // only control POST requests, not GET
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
|
|
||||||
|
|
@ -702,3 +738,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
|
||||||
//
|
//
|
||||||
|
|
||||||
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
|
||||||
|
|
||||||
|
// "adamw" or "sgd" (case insensitive)
|
||||||
|
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
|
||||||
|
|
|
||||||
|
|
@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||||
|
|
||||||
// helpers
|
// helpers
|
||||||
|
|
||||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||||
return &gsmpl->cur_p;
|
auto * res = &gsmpl->cur_p;
|
||||||
|
|
||||||
|
if (do_sort && !res->sorted) {
|
||||||
|
// remember the selected token before sorting
|
||||||
|
const llama_token id = res->data[res->selected].id;
|
||||||
|
|
||||||
|
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
|
||||||
|
return a.p > b.p;
|
||||||
|
});
|
||||||
|
|
||||||
|
// restore the selected token after sorting
|
||||||
|
for (size_t i = 0; i < res->size; ++i) {
|
||||||
|
if (res->data[i].id == id) {
|
||||||
|
res->selected = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
res->sorted = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||||
|
|
|
||||||
|
|
@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||||
// helpers
|
// helpers
|
||||||
|
|
||||||
// access the internal list of current candidate tokens
|
// access the internal list of current candidate tokens
|
||||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
|
// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
|
||||||
|
// the .sorted flag of the result indicates whether the returned candidates are sorted
|
||||||
|
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
|
||||||
|
|
||||||
// get the last accepted token
|
// get the last accepted token
|
||||||
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
||||||
|
|
|
||||||
|
|
@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
|
||||||
|
|
||||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
common_sampler_sample(smpl, ctx_dft, 0, true);
|
||||||
|
|
||||||
const auto * cur_p = common_sampler_get_candidates(smpl);
|
const auto * cur_p = common_sampler_get_candidates(smpl, true);
|
||||||
|
|
||||||
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
|
||||||
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -340,7 +340,7 @@ if __name__ == '__main__':
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
logger.info(f"Loading base model: {dir_base_model.name}")
|
logger.info(f"Loading base model: {dir_base_model.name}")
|
||||||
hparams = ModelBase.load_hparams(dir_base_model)
|
hparams = ModelBase.load_hparams(dir_base_model, False)
|
||||||
|
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
|
||||||
|
|
||||||
## Environment variable setup
|
## Environment variable setup
|
||||||
|
|
||||||
### GGML_CANN_ASYNC_MODE
|
|
||||||
|
|
||||||
Enables asynchronous operator submission. Disabled by default.
|
|
||||||
|
|
||||||
### GGML_CANN_MEM_POOL
|
### GGML_CANN_MEM_POOL
|
||||||
|
|
||||||
Specifies the memory pool management strategy:
|
Specifies the memory pool management strategy, Default is vmm.
|
||||||
|
|
||||||
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
|
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
|
||||||
|
|
||||||
- prio: Employs a priority queue-based memory pool management.
|
- prio: Employs a priority queue-based memory pool management.
|
||||||
|
|
||||||
- leg: Uses a fixed-size buffer pool.
|
- leg: Uses a fixed-size buffer pool.
|
||||||
|
|
||||||
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
|
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
|
||||||
|
|
@ -312,5 +309,8 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
|
||||||
|
|
||||||
### GGML_CANN_WEIGHT_NZ
|
### GGML_CANN_WEIGHT_NZ
|
||||||
|
|
||||||
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
|
Converting the matmul weight format from ND to NZ to improve performance. Enabled by default.
|
||||||
|
|
||||||
|
### GGML_CANN_ACL_GRAPH
|
||||||
|
|
||||||
|
Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
|
||||||
|
|
|
||||||
|
|
@ -76,6 +76,23 @@ cmake --build build --config Release -j $(nproc)
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## IBM zDNN Accelerator
|
||||||
|
|
||||||
|
This provides acceleration using the IBM zAIU co-processor located in the Telum I and Telum II processors. Make sure to have the [IBM zDNN library](https://github.com/IBM/zDNN) installed.
|
||||||
|
|
||||||
|
#### Compile from source from IBM
|
||||||
|
|
||||||
|
You may find the official build instructions here: [Building and Installing zDNN](https://github.com/IBM/zDNN?tab=readme-ov-file#building-and-installing-zdnn)
|
||||||
|
|
||||||
|
### Compilation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -S . -B build \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DGGML_ZDNN=ON
|
||||||
|
cmake --build build --config Release -j$(nproc)
|
||||||
|
```
|
||||||
|
|
||||||
## Getting GGUF Models
|
## Getting GGUF Models
|
||||||
|
|
||||||
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
All models need to be converted to Big-Endian. You can achieve this in three cases:
|
||||||
|
|
@ -145,15 +162,15 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
|
||||||
|
|
||||||
### 1. SIMD Acceleration
|
### 1. SIMD Acceleration
|
||||||
|
|
||||||
Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
|
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
|
||||||
|
|
||||||
### 2. NNPA Vector Intrinsics Acceleration
|
### 2. NNPA Vector Intrinsics Acceleration
|
||||||
|
|
||||||
Only available in IBM z16 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
|
||||||
|
|
||||||
### 3. zDNN Accelerator
|
### 3. zDNN Accelerator (WIP)
|
||||||
|
|
||||||
_Only available in IBM z16 / LinuxONE 4 or later system. No support currently available._
|
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
|
||||||
|
|
||||||
### 4. Spyre Accelerator
|
### 4. Spyre Accelerator
|
||||||
|
|
||||||
|
|
@ -230,10 +247,11 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||||
## Appendix A: Hardware Support Matrix
|
## Appendix A: Hardware Support Matrix
|
||||||
|
|
||||||
| | Support | Minimum Compiler Version |
|
| | Support | Minimum Compiler Version |
|
||||||
| ------- | ------- | ------------------------ |
|
| -------- | ------- | ------------------------ |
|
||||||
| IBM z15 | ✅ | |
|
| IBM z15 | ✅ | |
|
||||||
| IBM z16 | ✅ | |
|
| IBM z16 | ✅ | |
|
||||||
| IBM z17 | ✅ | GCC 15.1.0 |
|
| IBM z17 | ✅ | GCC 15.1.0 |
|
||||||
|
| IBM zDNN | ✅ | |
|
||||||
|
|
||||||
- ✅ - supported and verified to run as intended
|
- ✅ - supported and verified to run as intended
|
||||||
- 🚫 - unsupported, we are unlikely able to provide support
|
- 🚫 - unsupported, we are unlikely able to provide support
|
||||||
|
|
@ -242,13 +260,14 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||||
|
|
||||||
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
|
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
|
||||||
| ---------- | ----------- | ---- | ---- | ----- |
|
| ---------- | ----------- | ---- | ---- | ----- |
|
||||||
| FP32 | ✅ | ✅ | ❓ | ❓ |
|
| FP32 | ✅ | ✅ | ✅ | ❓ |
|
||||||
| FP16 | ✅ | ✅ | ❓ | ❓ |
|
| FP16 | ✅ | ✅ | ❓ | ❓ |
|
||||||
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
|
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
|
||||||
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
|
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
|
||||||
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
|
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
|
||||||
| Q5_0 | 🚫 | 🚫 | ❓ | ❓ |
|
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
|
||||||
| Q5_1 | 🚫 | 🚫 | ❓ | ❓ |
|
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
|
||||||
|
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
|
||||||
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
|
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
|
||||||
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
|
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
|
||||||
| Q3_K | ✅ | ✅ | ❓ | ❓ |
|
| Q3_K | ✅ | ✅ | ❓ | ❓ |
|
||||||
|
|
@ -273,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|
||||||
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
- 🚫 - acceleration unavailable, will still run using scalar implementation
|
||||||
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
- ❓ - acceleration unknown, please contribute if you can test it yourself
|
||||||
|
|
||||||
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 25, 2025.
|
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
|
||||||
|
|
|
||||||
|
|
@ -59,8 +59,6 @@ cmake --build build --config Release
|
||||||
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
|
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
|
||||||
cmake --build build-arm64-windows-llvm-release
|
cmake --build build-arm64-windows-llvm-release
|
||||||
```
|
```
|
||||||
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
|
|
||||||
|
|
||||||
For building with ninja generator and clang compiler as default:
|
For building with ninja generator and clang compiler as default:
|
||||||
-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
|
-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
|
||||||
```bash
|
```bash
|
||||||
|
|
@ -198,10 +196,9 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enab
|
||||||
The following compilation options are also available to tweak performance:
|
The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------------|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||||
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models. There may be issues with numerical overflows (except for CDNA and RDNA4) and memory use will be higher. Prompt processing may become faster on recent datacenter GPUs (the custom kernels were tuned primarily for RTX 3000/4000). |
|
||||||
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
|
||||||
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,8 @@ Function calling is supported for all models (see https://github.com/ggml-org/ll
|
||||||
- Use `--chat-template-file` to override the template when appropriate (see examples below)
|
- Use `--chat-template-file` to override the template when appropriate (see examples below)
|
||||||
- Generic support may consume more tokens and be less efficient than a model's native format.
|
- Generic support may consume more tokens and be less efficient than a model's native format.
|
||||||
|
|
||||||
|
- Multiple/parallel tool calling is supported on some models but disabled by default, enable it by passing `"parallel_tool_calls": true` in the completion endpoint payload.
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Show some common templates and which format handler they use</summary>
|
<summary>Show some common templates and which format handler they use</summary>
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens
|
||||||
## Orin compile and run
|
## Orin compile and run
|
||||||
### compile
|
### compile
|
||||||
```sh
|
```sh
|
||||||
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 -j 32
|
||||||
```
|
```
|
||||||
### run on Orin
|
### run on Orin
|
||||||
### case 1
|
### case 1
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
|
||||||
|
|
||||||
Clone llama.cpp:
|
Clone llama.cpp:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
git clone https://github.com/ggml-org/llama.cpp
|
||||||
cd llama.cpp
|
cd llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@ Download [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model
|
||||||
|
|
||||||
|
|
||||||
### Build llama.cpp
|
### Build llama.cpp
|
||||||
Readme modification time: 20250206
|
Readme modification time: 20250731
|
||||||
|
|
||||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
## MiniCPM-V 4.5
|
||||||
|
|
||||||
|
### Prepare models and code
|
||||||
|
|
||||||
|
Download [MiniCPM-V-4_5](https://huggingface.co/openbmb/MiniCPM-V-4_5) PyTorch model from huggingface to "MiniCPM-V-4_5" folder.
|
||||||
|
|
||||||
|
|
||||||
|
### Build llama.cpp
|
||||||
|
Readme modification time: 20250826
|
||||||
|
|
||||||
|
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
|
||||||
|
|
||||||
|
Clone llama.cpp:
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
Build llama.cpp using `CMake`:
|
||||||
|
```bash
|
||||||
|
cmake -B build
|
||||||
|
cmake --build build --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Usage of MiniCPM-V 4
|
||||||
|
|
||||||
|
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf) by us)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4_5
|
||||||
|
python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4_5 --minicpmv-projector ../MiniCPM-V-4_5/minicpmv.projector --output-dir ../MiniCPM-V-4_5/ --minicpmv_version 6
|
||||||
|
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_5/model
|
||||||
|
|
||||||
|
# quantize int4 version
|
||||||
|
./build/bin/llama-quantize ../MiniCPM-V-4_5/model/ggml-model-f16.gguf ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
Inference on Linux or Mac
|
||||||
|
```bash
|
||||||
|
# run in single-turn mode
|
||||||
|
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||||
|
|
||||||
|
# run in conversation mode
|
||||||
|
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_5/mmproj-model-f16.gguf
|
||||||
|
```
|
||||||
177
docs/ops.md
177
docs/ops.md
|
|
@ -12,91 +12,92 @@ Legend:
|
||||||
- 🟡 Partially supported by this backend
|
- 🟡 Partially supported by this backend
|
||||||
- ❌ Not supported by this backend
|
- ❌ Not supported by this backend
|
||||||
|
|
||||||
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan |
|
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|
||||||
|-----------|------|------|------|------|------|------|------|------|
|
|-----------|------|------|------|------|------|------|------|------|------|
|
||||||
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||||
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 |
|
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
|
||||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ |
|
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
|
||||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 |
|
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ |
|
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
|
||||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||||
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 |
|
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
|
||||||
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 |
|
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
||||||
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 |
|
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ |
|
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
|
||||||
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
|
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||||
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
||||||
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ |
|
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
|
||||||
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
|
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 |
|
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
|
||||||
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ |
|
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
|
||||||
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ |
|
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
|
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
|
||||||
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ |
|
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||||
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ |
|
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ |
|
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
|
||||||
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 |
|
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
|
||||||
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ |
|
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
|
||||||
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
|
||||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ |
|
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
|
||||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ |
|
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
|
||||||
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
|
||||||
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
|
||||||
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 |
|
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 |
|
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
|
||||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
|
||||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ |
|
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||||
|
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -34,6 +34,7 @@ else()
|
||||||
add_subdirectory(gen-docs)
|
add_subdirectory(gen-docs)
|
||||||
add_subdirectory(training)
|
add_subdirectory(training)
|
||||||
add_subdirectory(diffusion)
|
add_subdirectory(diffusion)
|
||||||
|
add_subdirectory(model-conversion)
|
||||||
if (NOT GGML_BACKEND_DL)
|
if (NOT GGML_BACKEND_DL)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
# these examples use the backends directly and cannot be built with dynamic loading
|
# these examples use the backends directly and cannot be built with dynamic loading
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
This is a swift clone of `examples/batched`.
|
This is a swift clone of `examples/batched`.
|
||||||
|
|
||||||
$ `make`
|
```bash
|
||||||
$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
|
$ ./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]
|
||||||
|
```
|
||||||
|
|
|
||||||
|
|
@ -564,7 +564,7 @@ int main(int argc, char ** argv) {
|
||||||
ctx_params.n_ctx = params.n_ctx;
|
ctx_params.n_ctx = params.n_ctx;
|
||||||
ctx_params.n_batch = params.n_batch;
|
ctx_params.n_batch = params.n_batch;
|
||||||
ctx_params.n_ubatch = params.n_ubatch;
|
ctx_params.n_ubatch = params.n_ubatch;
|
||||||
ctx_params.flash_attn = params.flash_attn;
|
ctx_params.flash_attn_type = params.flash_attn_type;
|
||||||
ctx_params.no_perf = params.no_perf;
|
ctx_params.no_perf = params.no_perf;
|
||||||
ctx_params.type_k = params.cache_type_k;
|
ctx_params.type_k = params.cache_type_k;
|
||||||
ctx_params.type_v = params.cache_type_v;
|
ctx_params.type_v = params.cache_type_v;
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <numeric>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This the arbitrary data which will be passed to each callback.
|
* This the arbitrary data which will be passed to each callback.
|
||||||
|
|
@ -27,9 +28,40 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||||
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
|
float v;
|
||||||
|
if (type == GGML_TYPE_F16) {
|
||||||
|
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
||||||
|
} else if (type == GGML_TYPE_F32) {
|
||||||
|
v = *(float *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I64) {
|
||||||
|
v = (float) *(int64_t *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I32) {
|
||||||
|
v = (float) *(int32_t *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I16) {
|
||||||
|
v = (float) *(int16_t *) &data[i];
|
||||||
|
} else if (type == GGML_TYPE_I8) {
|
||||||
|
v = (float) *(int8_t *) &data[i];
|
||||||
|
} else {
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
|
}
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||||
GGML_ASSERT(n > 0);
|
GGML_ASSERT(n > 0);
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
|
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||||
|
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||||
|
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||||
|
sum += v;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
LOG(" [\n");
|
LOG(" [\n");
|
||||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
|
|
@ -49,25 +81,8 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||||
LOG("..., ");
|
LOG("..., ");
|
||||||
i0 = ne[0] - n;
|
i0 = ne[0] - n;
|
||||||
}
|
}
|
||||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
const float v = ggml_get_float_value(data, type, nb, i0, i1, i2, i3);
|
||||||
float v;
|
|
||||||
if (type == GGML_TYPE_F16) {
|
|
||||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
|
||||||
} else if (type == GGML_TYPE_F32) {
|
|
||||||
v = *(float *) &data[i];
|
|
||||||
} else if (type == GGML_TYPE_I64) {
|
|
||||||
v = (float) *(int64_t *) &data[i];
|
|
||||||
} else if (type == GGML_TYPE_I32) {
|
|
||||||
v = (float) *(int32_t *) &data[i];
|
|
||||||
} else if (type == GGML_TYPE_I16) {
|
|
||||||
v = (float) *(int16_t *) &data[i];
|
|
||||||
} else if (type == GGML_TYPE_I8) {
|
|
||||||
v = (float) *(int8_t *) &data[i];
|
|
||||||
} else {
|
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
}
|
|
||||||
LOG("%12.4f", v);
|
LOG("%12.4f", v);
|
||||||
sum += v;
|
|
||||||
if (i0 < ne[0] - 1) LOG(", ");
|
if (i0 < ne[0] - 1) LOG(", ");
|
||||||
}
|
}
|
||||||
LOG("],\n");
|
LOG("],\n");
|
||||||
|
|
@ -77,6 +92,12 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||||
LOG(" ]\n");
|
LOG(" ]\n");
|
||||||
LOG(" sum = %f\n", sum);
|
LOG(" sum = %f\n", sum);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: make this abort configurable/optional?
|
||||||
|
if (std::isnan(sum)) {
|
||||||
|
LOG_ERR("encountered NaN - aborting\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@
|
||||||
"
|
"
|
||||||
" start the llama.cpp server with a FIM-compatible model. for example:
|
" start the llama.cpp server with a FIM-compatible model. for example:
|
||||||
"
|
"
|
||||||
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
|
" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
|
||||||
"
|
"
|
||||||
" --batch-size [512, model max context]
|
" --batch-size [512, model max context]
|
||||||
"
|
"
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,9 @@ Demonstration of lookahead decoding technique:
|
||||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
|
||||||
More info: https://github.com/ggml-org/llama.cpp/pull/4207
|
More info: https://github.com/ggml-org/llama.cpp/pull/4207
|
||||||
|
|
||||||
|
Sample command:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
llama-lookahead -hf ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF -p "// network server implemented in C\n// author: Peter Hacker\n\n#include" -e -ngl 99 -t 4 -n 512 -c 4096 -kvu
|
||||||
|
```
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
.model_name
|
||||||
|
data
|
||||||
|
ppl
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET llama-logits)
|
||||||
|
add_executable(${TARGET} logits.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||||
|
|
@ -0,0 +1,206 @@
|
||||||
|
MAKEFLAGS += --no-print-directory
|
||||||
|
|
||||||
|
define validate_model_path
|
||||||
|
@if [ -z "$(MODEL_PATH)" ]; then \
|
||||||
|
echo "Error: MODEL_PATH must be provided either as:"; \
|
||||||
|
echo " 1. Environment variable: export MODEL_PATH=/path/to/model"; \
|
||||||
|
echo " 2. Command line argument: make $(1) MODEL_PATH=/path/to/model"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
endef
|
||||||
|
|
||||||
|
define validate_embedding_model_path
|
||||||
|
@if [ -z "$(EMBEDDING_MODEL_PATH)" ]; then \
|
||||||
|
echo "Error: EMBEDDING_MODEL_PATH must be provided either as:"; \
|
||||||
|
echo " 1. Environment variable: export EMBEDDING_MODEL_PATH=/path/to/model"; \
|
||||||
|
echo " 2. Command line argument: make $(1) EMBEDDING_MODEL_PATH=/path/to/model"; \
|
||||||
|
exit 1; \
|
||||||
|
fi
|
||||||
|
endef
|
||||||
|
|
||||||
|
define quantize_model
|
||||||
|
@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
|
||||||
|
TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
|
||||||
|
./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
|
||||||
|
@echo "Export the quantized model path to $(2) variable in your environment"
|
||||||
|
endef
|
||||||
|
|
||||||
|
###
|
||||||
|
### Casual Model targets/recipes
|
||||||
|
###
|
||||||
|
causal-convert-model-bf16: OUTTYPE=bf16
|
||||||
|
causal-convert-model-bf16: causal-convert-model
|
||||||
|
|
||||||
|
causal-convert-model:
|
||||||
|
$(call validate_model_path,causal-convert-model)
|
||||||
|
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||||
|
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||||
|
./scripts/causal/convert-model.sh
|
||||||
|
|
||||||
|
causal-convert-mm-model-bf16: OUTTYPE=bf16
|
||||||
|
causal-convert-mm-model-bf16: MM_OUTTYPE=f16
|
||||||
|
causal-convert-mm-model-bf16: causal-convert-mm-model
|
||||||
|
|
||||||
|
causal-convert-mm-model:
|
||||||
|
$(call validate_model_path,causal-convert-mm-model)
|
||||||
|
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||||
|
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||||
|
./scripts/causal/convert-model.sh
|
||||||
|
|
||||||
|
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(MM_OUTTYPE)" MODEL_PATH="$(MODEL_PATH)" \
|
||||||
|
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||||
|
./scripts/causal/convert-model.sh --mmproj
|
||||||
|
|
||||||
|
causal-run-original-model:
|
||||||
|
$(call validate_model_path,causal-run-original-model)
|
||||||
|
@MODEL_PATH="$(MODEL_PATH)" ./scripts/causal/run-org-model.py
|
||||||
|
|
||||||
|
causal-run-converted-model:
|
||||||
|
@CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/causal/run-converted-model.sh
|
||||||
|
|
||||||
|
causal-verify-logits: causal-run-original-model causal-run-converted-model
|
||||||
|
@./scripts/causal/compare-logits.py
|
||||||
|
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
|
||||||
|
|
||||||
|
causal-run-original-embeddings:
|
||||||
|
@./scripts/causal/run-casual-gen-embeddings-org.py
|
||||||
|
|
||||||
|
causal-run-converted-embeddings:
|
||||||
|
@./scripts/causal/run-converted-model-embeddings-logits.sh
|
||||||
|
|
||||||
|
causal-verify-embeddings: causal-run-original-embeddings causal-run-converted-embeddings
|
||||||
|
@./scripts/causal/compare-embeddings-logits.sh
|
||||||
|
|
||||||
|
causal-inspect-original-model:
|
||||||
|
@./scripts/utils/inspect-org-model.py
|
||||||
|
|
||||||
|
causal-inspect-converted-model:
|
||||||
|
@./scripts/utils/inspect-converted-model.sh
|
||||||
|
|
||||||
|
causal-start-embedding-server:
|
||||||
|
@./scripts/utils/run-embedding-server.sh ${CONVERTED_MODEL}
|
||||||
|
|
||||||
|
causal-curl-embedding-endpoint: causal-run-original-embeddings
|
||||||
|
@./scripts/utils/curl-embedding-server.sh | ./scripts/causal/compare-embeddings-logits.sh
|
||||||
|
|
||||||
|
causal-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
|
||||||
|
causal-quantize-Q8_0: causal-quantize-model
|
||||||
|
|
||||||
|
causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||||
|
causal-quantize-Q4_0: causal-quantize-model
|
||||||
|
|
||||||
|
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
|
||||||
|
# token embedding and output types to Q8_0 instead of the default Q6_K.
|
||||||
|
causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||||
|
causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
|
||||||
|
causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
|
||||||
|
causal-quantize-qat-Q4_0: causal-quantize-model
|
||||||
|
|
||||||
|
causal-quantize-model:
|
||||||
|
$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
|
||||||
|
|
||||||
|
causal-run-quantized-model:
|
||||||
|
@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
|
||||||
|
|
||||||
|
|
||||||
|
###
|
||||||
|
### Embedding Model targets/recipes
|
||||||
|
###
|
||||||
|
|
||||||
|
embedding-convert-model-bf16: OUTTYPE=bf16
|
||||||
|
embedding-convert-model-bf16: embedding-convert-model
|
||||||
|
|
||||||
|
embedding-convert-model:
|
||||||
|
$(call validate_embedding_model_path,embedding-convert-model)
|
||||||
|
@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
|
||||||
|
METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
|
||||||
|
./scripts/embedding/convert-model.sh
|
||||||
|
|
||||||
|
embedding-run-original-model:
|
||||||
|
$(call validate_embedding_model_path,embedding-run-original-model)
|
||||||
|
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py
|
||||||
|
|
||||||
|
embedding-run-converted-model:
|
||||||
|
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||||
|
|
||||||
|
embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
|
||||||
|
@./scripts/embedding/compare-embeddings-logits.sh
|
||||||
|
|
||||||
|
embedding-inspect-original-model:
|
||||||
|
$(call validate_embedding_model_path,embedding-inspect-original-model)
|
||||||
|
@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/utils/inspect-org-model.py -m ${EMBEDDING_MODEL_PATH}
|
||||||
|
|
||||||
|
embedding-inspect-converted-model:
|
||||||
|
@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/utils/inspect-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||||
|
|
||||||
|
embedding-start-embedding-server:
|
||||||
|
@./scripts/utils/run-embedding-server.sh ${CONVERTED_EMBEDDING_MODEL}
|
||||||
|
|
||||||
|
embedding-curl-embedding-endpoint:
|
||||||
|
@./scripts/utils/curl-embedding-server.sh | ./scripts/embedding/compare-embeddings-logits.sh
|
||||||
|
|
||||||
|
embedding-quantize-Q8_0: QUANTIZED_TYPE = Q8_0
|
||||||
|
embedding-quantize-Q8_0: embedding-quantize-model
|
||||||
|
|
||||||
|
embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||||
|
embedding-quantize-Q4_0: embedding-quantize-model
|
||||||
|
|
||||||
|
# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
|
||||||
|
# token embedding and output types to Q8_0 instead of the default Q6_K.
|
||||||
|
embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
|
||||||
|
embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
|
||||||
|
embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
|
||||||
|
embedding-quantize-qat-Q4_0: embedding-quantize-model
|
||||||
|
|
||||||
|
embedding-quantize-model:
|
||||||
|
$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
|
||||||
|
|
||||||
|
embedding-run-quantized-model:
|
||||||
|
@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
|
||||||
|
|
||||||
|
###
|
||||||
|
### Perplexity targets/recipes
|
||||||
|
###
|
||||||
|
perplexity-data-gen:
|
||||||
|
CONVERTED_MODEL="$(CONVERTED_MODEL)" ./scripts/utils/perplexity-gen.sh
|
||||||
|
|
||||||
|
perplexity-run-full:
|
||||||
|
QUANTIZED_MODEL="$(QUANTIZED_MODEL)" LOOGITS_FILE="$(LOGITS_FILE)" \
|
||||||
|
./scripts/utils/perplexity-run.sh
|
||||||
|
|
||||||
|
perplexity-run:
|
||||||
|
QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/utils/perplexity-run-simple.sh
|
||||||
|
|
||||||
|
###
|
||||||
|
### HuggingFace targets/recipes
|
||||||
|
###
|
||||||
|
|
||||||
|
hf-create-model:
|
||||||
|
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}"
|
||||||
|
|
||||||
|
hf-create-model-dry-run:
|
||||||
|
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -d
|
||||||
|
|
||||||
|
hf-create-model-embedding:
|
||||||
|
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e
|
||||||
|
|
||||||
|
hf-create-model-embedding-dry-run:
|
||||||
|
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -e -d
|
||||||
|
|
||||||
|
hf-create-model-private:
|
||||||
|
@./scripts/utils/hf-create-model.py -m "${MODEL_NAME}" -ns "${NAMESPACE}" -b "${ORIGINAL_BASE_MODEL}" -p
|
||||||
|
|
||||||
|
hf-upload-gguf-to-model:
|
||||||
|
@./scripts/utils/hf-upload-gguf-model.py -m "${MODEL_PATH}" -r "${REPO_ID}" -o "${NAME_IN_REPO}"
|
||||||
|
|
||||||
|
hf-create-collection:
|
||||||
|
@./scripts/utils/hf-create-collection.py -n "${NAME}" -d "${DESCRIPTION}" -ns "${NAMESPACE}"
|
||||||
|
|
||||||
|
hf-add-model-to-collection:
|
||||||
|
@./scripts/utils/hf-add-model-to-collection.py -c "${COLLECTION}" -m "${MODEL}"
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
@${RM} -rf data .converted_embedding_model.txt .converted_model.txt .embedding_model_name.txt .model_name.txt
|
||||||
|
|
||||||
|
|
@ -0,0 +1,367 @@
|
||||||
|
# Model Conversion Example
|
||||||
|
This directory contains scripts and code to help in the process of converting
|
||||||
|
HuggingFace PyTorch models to GGUF format.
|
||||||
|
|
||||||
|
The motivation for having this is that the conversion process can often be an
|
||||||
|
iterative process, where the original model is inspected, converted, updates
|
||||||
|
made to llama.cpp, converted again, etc. Once the model has been converted it
|
||||||
|
needs to be verified against the original model, and then optionally quantified,
|
||||||
|
and in some cases perplexity checked of the quantized model. And finally the
|
||||||
|
model/models need to the ggml-org on Hugging Face. This tool/example tries to
|
||||||
|
help with this process.
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
The idea is that the makefile targets and scripts here can be used in the
|
||||||
|
development/conversion process assisting with things like:
|
||||||
|
|
||||||
|
* inspect/run the original model to figure out how it works
|
||||||
|
* convert the original model to GGUF format
|
||||||
|
* inspect/run the converted model
|
||||||
|
* verify the logits produced by the original model and the converted model
|
||||||
|
* quantize the model to GGUF format
|
||||||
|
* run perplexity evaluation to verify that the quantized model is performing
|
||||||
|
as expected
|
||||||
|
* upload the model to HuggingFace to make it available for others
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
Create virtual python environment
|
||||||
|
```console
|
||||||
|
$ python3.11 -m venv venv
|
||||||
|
$ source venv/bin/activate
|
||||||
|
(venv) $ pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Causal Language Model Conversion
|
||||||
|
This section describes the steps to convert a causal language model to GGUF and
|
||||||
|
to verify that the conversion was successful.
|
||||||
|
|
||||||
|
### Download the original model
|
||||||
|
First, clone the original model to some local directory:
|
||||||
|
```console
|
||||||
|
$ mkdir models && cd models
|
||||||
|
$ git clone https://huggingface.co/user/model_name
|
||||||
|
$ cd model_name
|
||||||
|
$ git lfs install
|
||||||
|
$ git lfs pull
|
||||||
|
```
|
||||||
|
|
||||||
|
### Set the MODEL_PATH
|
||||||
|
The path to the downloaded model can be provided in two ways:
|
||||||
|
|
||||||
|
**Option 1: Environment variable (recommended for iterative development)**
|
||||||
|
```console
|
||||||
|
export MODEL_PATH=~/work/ai/models/some_model
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Command line argument (for one-off tasks)**
|
||||||
|
```console
|
||||||
|
make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
|
||||||
|
```
|
||||||
|
|
||||||
|
Command line arguments take precedence over environment variables when both are provided.
|
||||||
|
|
||||||
|
In cases where the transformer implementation for the model has not been released
|
||||||
|
yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
|
||||||
|
will then cause the transformer implementation to be loaded explicitely and not
|
||||||
|
use AutoModelForCausalLM:
|
||||||
|
```
|
||||||
|
export UNRELEASED_MODEL_NAME=SomeNewModel
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inspecting the original tensors
|
||||||
|
```console
|
||||||
|
# Using environment variable
|
||||||
|
(venv) $ make causal-inspect-original-model
|
||||||
|
|
||||||
|
# Or using command line argument
|
||||||
|
(venv) $ make causal-inspect-original-model MODEL_PATH=~/work/ai/models/some_model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running the original model
|
||||||
|
This is mainly to verify that the original model works, and to compare the output
|
||||||
|
from the converted model.
|
||||||
|
```console
|
||||||
|
# Using environment variable
|
||||||
|
(venv) $ make causal-run-original-model
|
||||||
|
|
||||||
|
# Or using command line argument
|
||||||
|
(venv) $ make causal-run-original-model MODEL_PATH=~/work/ai/models/some_model
|
||||||
|
```
|
||||||
|
This command will save two files to the `data` directory, one is a binary file
|
||||||
|
containing logits which will be used for comparison with the converted model
|
||||||
|
later, and the other is a text file which allows for manual visual inspection.
|
||||||
|
|
||||||
|
### Model conversion
|
||||||
|
After updates have been made to [gguf-py](../../gguf-py) to add support for the
|
||||||
|
new model, the model can be converted to GGUF format using the following command:
|
||||||
|
```console
|
||||||
|
# Using environment variable
|
||||||
|
(venv) $ make causal-convert-model
|
||||||
|
|
||||||
|
# Or using command line argument
|
||||||
|
(venv) $ make causal-convert-model MODEL_PATH=~/work/ai/models/some_model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Inspecting the converted model
|
||||||
|
The converted model can be inspected using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make inspect-converted-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running the converted model
|
||||||
|
```console
|
||||||
|
(venv) $ make run-converted-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Model logits verfication
|
||||||
|
The following target will run the original model and the converted model and
|
||||||
|
compare the logits:
|
||||||
|
```console
|
||||||
|
(venv) $ make causal-verify-logits
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quantizing the model
|
||||||
|
The causal model can be quantized to GGUF format using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make causal-quantize-Q8_0
|
||||||
|
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
|
||||||
|
Export the quantized model path to QUANTIZED_MODEL variable in your environment
|
||||||
|
```
|
||||||
|
This will show the path to the quantized model in the terminal, which can then
|
||||||
|
be used to set the `QUANTIZED_MODEL` environment variable:
|
||||||
|
```console
|
||||||
|
export QUANTIZED_MODEL=/path/to/quantized/model-Q8_0.gguf
|
||||||
|
```
|
||||||
|
Then the quantized model can be run using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make causal-run-quantized-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quantizing QAT (Quantization Aware Training) models
|
||||||
|
When quantizing to `Q4_0`, the default data type for the token embedding weights
|
||||||
|
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
|
||||||
|
recommended to use `Q8_0` instead for the embeddings and output tensors.
|
||||||
|
The reason is that although `Q6_K` is smaller in size, it requires more compute
|
||||||
|
to unpack, which can hurt performance during output generation when the entire
|
||||||
|
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
|
||||||
|
provides practically full quality with better computational efficiency.
|
||||||
|
```console
|
||||||
|
(venv) $ make causal-quantize-qat-Q4_0
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Embedding Language Model Conversion
|
||||||
|
|
||||||
|
### Download the original model
|
||||||
|
```console
|
||||||
|
$ mkdir models && cd models
|
||||||
|
$ git clone https://huggingface.co/user/model_name
|
||||||
|
$ cd model_name
|
||||||
|
$ git lfs install
|
||||||
|
$ git lfs pull
|
||||||
|
```
|
||||||
|
|
||||||
|
The path to the embedding model can be provided in two ways:
|
||||||
|
|
||||||
|
**Option 1: Environment variable (recommended for iterative development)**
|
||||||
|
```console
|
||||||
|
export EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Command line argument (for one-off tasks)**
|
||||||
|
```console
|
||||||
|
make embedding-convert-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||||
|
```
|
||||||
|
|
||||||
|
Command line arguments take precedence over environment variables when both are provided.
|
||||||
|
|
||||||
|
### Running the original model
|
||||||
|
This is mainly to verify that the original model works and to compare the output
|
||||||
|
with the output from the converted model.
|
||||||
|
```console
|
||||||
|
# Using environment variable
|
||||||
|
(venv) $ make embedding-run-original-model
|
||||||
|
|
||||||
|
# Or using command line argument
|
||||||
|
(venv) $ make embedding-run-original-model EMBEDDING_MODEL_PATH=~/path/to/embedding_model
|
||||||
|
```
|
||||||
|
This command will save two files to the `data` directory, one is a binary
|
||||||
|
file containing logits which will be used for comparison with the converted
|
||||||
|
model, and the other is a text file which allows for manual visual inspection.
|
||||||
|
|
||||||
|
### Model conversion
|
||||||
|
After updates have been made to [gguf-py](../../gguf-py) to add support for the
|
||||||
|
new model, the model can be converted to GGUF format using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-convert-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run the converted model
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-run-converted-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Model logits verification
|
||||||
|
The following target will run the original model and the converted model (which
|
||||||
|
was done manually in the previous steps) and compare the logits:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-verify-logits
|
||||||
|
```
|
||||||
|
|
||||||
|
### llama-server verification
|
||||||
|
To verify that the converted model works with llama-server, the following
|
||||||
|
command can be used:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-start-embedding-server
|
||||||
|
```
|
||||||
|
Then open another terminal and set the `EMBEDDING_MODEL_PATH` environment
|
||||||
|
variable as this will not be inherited by the new terminal:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-curl-embedding-endpoint
|
||||||
|
```
|
||||||
|
This will call the `embedding` endpoint and the output will be piped into
|
||||||
|
the same verification script as used by the target `embedding-verify-logits`.
|
||||||
|
|
||||||
|
The causal model can also be used to produce embeddings and this can be verified
|
||||||
|
using the following commands:
|
||||||
|
```console
|
||||||
|
(venv) $ make causal-start-embedding-server
|
||||||
|
```
|
||||||
|
Then open another terminal and set the `MODEL_PATH` environment
|
||||||
|
variable as this will not be inherited by the new terminal:
|
||||||
|
```console
|
||||||
|
(venv) $ make casual-curl-embedding-endpoint
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quantizing the model
|
||||||
|
The embedding model can be quantized to GGUF format using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-quantize-Q8_0
|
||||||
|
Quantized model saved to: /path/to/quantized/model-Q8_0.gguf
|
||||||
|
Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment
|
||||||
|
```
|
||||||
|
This will show the path to the quantized model in the terminal, which can then
|
||||||
|
be used to set the `QUANTIZED_EMBEDDING_MODEL` environment variable:
|
||||||
|
```console
|
||||||
|
export QUANTIZED_EMBEDDING_MODEL=/path/to/quantized/model-Q8_0.gguf
|
||||||
|
```
|
||||||
|
Then the quantized model can be run using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-run-quantized-model
|
||||||
|
```
|
||||||
|
|
||||||
|
### Quantizing QAT (Quantization Aware Training) models
|
||||||
|
When quantizing to `Q4_0`, the default data type for the token embedding weights
|
||||||
|
will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
|
||||||
|
recommended to use `Q8_0` instead for the embeddings and output tensors.
|
||||||
|
The reason is that although `Q6_K` is smaller in size, it requires more compute
|
||||||
|
to unpack, which can hurt performance during output generation when the entire
|
||||||
|
embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
|
||||||
|
provides practically full quality with better computational efficiency.
|
||||||
|
```console
|
||||||
|
(venv) $ make embedding-quantize-qat-Q4_0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Perplexity Evaluation
|
||||||
|
|
||||||
|
### Simple perplexity evaluation
|
||||||
|
This allows to run the perplexity evaluation without having to generate a
|
||||||
|
token/logits file:
|
||||||
|
```console
|
||||||
|
(venv) $ make perplexity-run QUANTIZED_MODEL=~/path/to/quantized/model.gguf
|
||||||
|
```
|
||||||
|
This will use the wikitext dataset to run the perplexity evaluation and
|
||||||
|
output the perplexity score to the terminal. This value can then be compared
|
||||||
|
with the perplexity score of the unquantized model.
|
||||||
|
|
||||||
|
### Full perplexity evaluation
|
||||||
|
First use the converted, non-quantized, model to generate the perplexity evaluation
|
||||||
|
dataset using the following command:
|
||||||
|
```console
|
||||||
|
$ make perplexity-data-gen CONVERTED_MODEL=~/path/to/converted/model.gguf
|
||||||
|
```
|
||||||
|
This will generate a file in the `data` directory named after the model and with
|
||||||
|
a `.kld` suffix which contains the tokens and the logits for the wikitext dataset.
|
||||||
|
|
||||||
|
After the dataset has been generated, the perplexity evaluation can be run using
|
||||||
|
the quantized model:
|
||||||
|
```console
|
||||||
|
$ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LOGITS_FILE=data/model.gguf.ppl
|
||||||
|
```
|
||||||
|
|
||||||
|
> 📝 **Note:** The `LOGITS_FILE` generated by the previous command
|
||||||
|
> can be very large, so make sure you have enough disk space available.
|
||||||
|
|
||||||
|
## HuggingFace utilities
|
||||||
|
The following targets are useful for creating collections and model repositories
|
||||||
|
on Hugging Face in the ggml-org. These can be used when preparing a release
|
||||||
|
to script the process for new model releases.
|
||||||
|
|
||||||
|
For the following targets a `HF_TOKEN` environment variable is required.
|
||||||
|
|
||||||
|
> 📝 **Note:** Don't forget to logout from Hugging Face after running these
|
||||||
|
> commands, otherwise you might have issues pulling/cloning repositories as
|
||||||
|
> the token will still be in use:
|
||||||
|
> $ huggingface-cli logout
|
||||||
|
> $ unset HF_TOKEN
|
||||||
|
|
||||||
|
### Create a new Hugging Face Model (model repository)
|
||||||
|
This will create a new model repository on Hugging Face with the specified
|
||||||
|
model name.
|
||||||
|
```console
|
||||||
|
(venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
|
||||||
|
Repository ID: danbev/TestModel-GGUF
|
||||||
|
Repository created: https://huggingface.co/danbev/TestModel-GGUF
|
||||||
|
```
|
||||||
|
Note that we append a `-GGUF` suffix to the model name to ensure a consistent
|
||||||
|
naming convention for GGUF models.
|
||||||
|
|
||||||
|
An embedding model can be created using the following command:
|
||||||
|
```console
|
||||||
|
(venv) $ make hf-create-model-embedding MODEL_NAME='TestEmbeddingModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
|
||||||
|
```
|
||||||
|
The only difference is that the model card for an embedding model will be different
|
||||||
|
with regards to the llama-server command and also how to access/call the embedding
|
||||||
|
endpoint.
|
||||||
|
|
||||||
|
### Upload a GGUF model to model repository
|
||||||
|
The following target uploads a model to an existing Hugging Face model repository.
|
||||||
|
```console
|
||||||
|
(venv) $ make hf-upload-gguf-to-model MODEL_PATH=dummy-model1.gguf REPO_ID=danbev/TestModel-GGUF
|
||||||
|
📤 Uploading dummy-model1.gguf to danbev/TestModel-GGUF/dummy-model1.gguf
|
||||||
|
✅ Upload successful!
|
||||||
|
🔗 File available at: https://huggingface.co/danbev/TestModel-GGUF/blob/main/dummy-model1.gguf
|
||||||
|
```
|
||||||
|
This command can also be used to update an existing model file in a repository.
|
||||||
|
|
||||||
|
### Create a new Collection
|
||||||
|
```console
|
||||||
|
(venv) $ make hf-new-collection NAME=TestCollection DESCRIPTION="Collection for testing scripts" NAMESPACE=danbev
|
||||||
|
🚀 Creating Hugging Face Collection
|
||||||
|
Title: TestCollection
|
||||||
|
Description: Collection for testing scripts
|
||||||
|
Namespace: danbev
|
||||||
|
Private: False
|
||||||
|
✅ Authenticated as: danbev
|
||||||
|
📚 Creating collection: 'TestCollection'...
|
||||||
|
✅ Collection created successfully!
|
||||||
|
📋 Collection slug: danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||||
|
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||||
|
|
||||||
|
🎉 Collection created successfully!
|
||||||
|
Use this slug to add models: danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||||
|
```
|
||||||
|
|
||||||
|
### Add model to a Collection
|
||||||
|
```console
|
||||||
|
(venv) $ make hf-add-model-to-collection COLLECTION=danbev/testcollection-68930fcf73eb3fc200b9956d MODEL=danbev/TestModel-GGUF
|
||||||
|
✅ Authenticated as: danbev
|
||||||
|
🔍 Checking if model exists: danbev/TestModel-GGUF
|
||||||
|
✅ Model found: danbev/TestModel-GGUF
|
||||||
|
📚 Adding model to collection...
|
||||||
|
✅ Model added to collection successfully!
|
||||||
|
🔗 Collection URL: https://huggingface.co/collections/danbev/testcollection-68930fcf73eb3fc200b9956d
|
||||||
|
|
||||||
|
🎉 Model added successfully!
|
||||||
|
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,210 @@
|
||||||
|
#include "llama.h"
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <filesystem>
|
||||||
|
|
||||||
|
// Print a short usage banner for this tool (argv[0] is the program name).
static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n"
           "\n %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n"
           "\n", argv[0]);
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
std::string model_path;
|
||||||
|
std::string prompt = "Hello, my name is";
|
||||||
|
int ngl = 0;
|
||||||
|
bool embedding_mode = false;
|
||||||
|
|
||||||
|
{
|
||||||
|
int i = 1;
|
||||||
|
for (; i < argc; i++) {
|
||||||
|
if (strcmp(argv[i], "-m") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
model_path = argv[++i];
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
try {
|
||||||
|
ngl = std::stoi(argv[++i]);
|
||||||
|
} catch (...) {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[i], "-embd-mode") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
try {
|
||||||
|
embedding_mode = true;
|
||||||
|
} catch (...) {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// prompt starts here
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (model_path.empty()) {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i < argc) {
|
||||||
|
prompt = argv[i++];
|
||||||
|
for (; i < argc; i++) {
|
||||||
|
prompt += " ";
|
||||||
|
prompt += argv[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_load_all();
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.n_gpu_layers = ngl;
|
||||||
|
|
||||||
|
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract basename from model_path
|
||||||
|
const char * basename = strrchr(model_path.c_str(), '/');
|
||||||
|
basename = (basename == NULL) ? model_path.c_str() : basename + 1;
|
||||||
|
|
||||||
|
char model_name[256];
|
||||||
|
strncpy(model_name, basename, 255);
|
||||||
|
model_name[255] = '\0';
|
||||||
|
|
||||||
|
char * dot = strrchr(model_name, '.');
|
||||||
|
if (dot != NULL && strcmp(dot, ".gguf") == 0) {
|
||||||
|
*dot = '\0';
|
||||||
|
}
|
||||||
|
printf("Model name: %s\n", model_name);
|
||||||
|
|
||||||
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||||
|
const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||||
|
|
||||||
|
std::vector<llama_token> prompt_tokens(n_prompt);
|
||||||
|
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
|
||||||
|
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.n_ctx = n_prompt;
|
||||||
|
ctx_params.n_batch = n_prompt;
|
||||||
|
ctx_params.no_perf = false;
|
||||||
|
if (embedding_mode) {
|
||||||
|
ctx_params.embeddings = true;
|
||||||
|
ctx_params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||||
|
ctx_params.n_ubatch = ctx_params.n_batch;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context * ctx = llama_init_from_model(model, ctx_params);
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Input prompt: \"%s\"\n", prompt.c_str());
|
||||||
|
printf("Tokenized prompt (%d tokens): ", n_prompt);
|
||||||
|
for (auto id : prompt_tokens) {
|
||||||
|
char buf[128];
|
||||||
|
int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
|
||||||
|
if (n < 0) {
|
||||||
|
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::string s(buf, n);
|
||||||
|
printf("%s", s.c_str());
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||||
|
|
||||||
|
if (llama_decode(ctx, batch)) {
|
||||||
|
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
float * logits;
|
||||||
|
int n_logits;
|
||||||
|
const char * type;
|
||||||
|
|
||||||
|
if (embedding_mode) {
|
||||||
|
logits = llama_get_embeddings(ctx);
|
||||||
|
n_logits = llama_model_n_embd(model) * batch.n_tokens;
|
||||||
|
type = "-embeddings";
|
||||||
|
printf("Embeddings size: %d\n", n_logits);
|
||||||
|
} else {
|
||||||
|
logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||||
|
n_logits = llama_vocab_n_tokens(vocab);
|
||||||
|
type = "";
|
||||||
|
printf("Vocab size: %d\n", n_logits);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::filesystem::create_directory("data");
|
||||||
|
|
||||||
|
// Save logits to binary file
|
||||||
|
char bin_filename[512];
|
||||||
|
snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
|
||||||
|
printf("Saving logits to %s\n", bin_filename);
|
||||||
|
|
||||||
|
FILE * f = fopen(bin_filename, "wb");
|
||||||
|
if (f == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
fwrite(logits, sizeof(float), n_logits, f);
|
||||||
|
fclose(f);
|
||||||
|
|
||||||
|
// Also save as text for debugging
|
||||||
|
char txt_filename[512];
|
||||||
|
snprintf(txt_filename, sizeof(txt_filename), "data/llamacpp-%s%s.txt", model_name, type);
|
||||||
|
f = fopen(txt_filename, "w");
|
||||||
|
if (f == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < n_logits; i++) {
|
||||||
|
fprintf(f, "%d: %.6f\n", i, logits[i]); // Added index and changed format
|
||||||
|
}
|
||||||
|
fclose(f);
|
||||||
|
|
||||||
|
// Print first and last 10 logits for quick verification
|
||||||
|
printf("First 10 logits: ");
|
||||||
|
for (int i = 0; i < 10 && i < n_logits; i++) {
|
||||||
|
printf("%.6f ", logits[i]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
printf("Last 10 logits: ");
|
||||||
|
for (int i = n_logits - 10; i < n_logits; i++) {
|
||||||
|
if (i >= 0) printf("%.6f ", logits[i]);
|
||||||
|
}
|
||||||
|
printf("\n\n");
|
||||||
|
|
||||||
|
printf("Logits saved to %s\n", bin_filename);
|
||||||
|
printf("Logits saved to %s\n", txt_filename);
|
||||||
|
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_model_free(model);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||||
|
torch~=2.6.0
|
||||||
|
torchvision~=0.21.0
|
||||||
|
transformers~=4.55.0
|
||||||
|
huggingface-hub~=0.34.0
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
#!/usr/bin/env bash

# Compare embeddings from the original (PyTorch) model against those from the
# converted llama.cpp model using scripts/utils/semantic_check.py.
# Input is either the binary file written by llama-logits (when stdin is a
# terminal), or JSON piped from llama-server's embedding endpoint, which is
# converted to the same raw float32 layout on the fly.

set -e

MODEL_PATH="${1:-"$MODEL_PATH"}"
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"

if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
    python3 -c "
import json
import sys
import struct

data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = []
for item in data:
    embedding = item['embedding']
    for token_embedding in embedding:
        flattened.extend(token_embedding)

print(f'Total embedding values: {len(flattened)}', file=sys.stderr)

# Write as binary floats - matches logits.cpp fwrite format
with open('$TEMP_FILE', 'wb') as f:
    for value in flattened:
        f.write(struct.pack('f', value))
"
    CPP_EMBEDDINGS="$TEMP_FILE"
    # Single quotes defer expansion until the trap fires at EXIT.
    trap 'rm -f "$TEMP_FILE"' EXIT
fi

# Fix: quote all path expansions so names containing spaces do not word-split.
python scripts/utils/semantic_check.py --model-path "$MODEL_PATH" \
    --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin" \
    --cpp-embeddings "$CPP_EMBEDDINGS" \
    --prompt "Hello world today" \
    --causal
|
||||||
|
|
||||||
|
|
@ -0,0 +1,88 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def quick_logits_check(pytorch_file, llamacpp_file):
    """Lightweight sanity check before NMSE"""

    # Load both raw float32 dumps; any I/O problem is a failed check.
    def _load(path):
        return np.fromfile(path, dtype=np.float32)

    try:
        pytorch_logits = _load(pytorch_file)
        llamacpp_logits = _load(llamacpp_file)
    except Exception as e:
        print(f"❌ NOK: Failed to load files - {e}")
        return False

    # Both dumps are flat 1-D arrays, so a shape mismatch is a length mismatch.
    if pytorch_logits.shape != llamacpp_logits.shape:
        print(f"❌ NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}")
        return False

    # Largest element-wise deviation between the two dumps.
    max_diff = np.abs(pytorch_logits - llamacpp_logits).max()

    # Indices of the ten largest values, in descending order.
    def _top10(logits):
        return np.argsort(logits)[-10:][::-1]

    pytorch_top10 = _top10(pytorch_logits)
    llamacpp_top10 = _top10(llamacpp_logits)
    print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}")
    print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}")
    print(f"Max absolute difference: {max_diff:.4f}")

    if max_diff > 1.0:
        print(f"❌ NOK: Large differences detected - max diff: {max_diff:.4f}")
        return False

    return True
|
||||||
|
|
||||||
|
def main():
    """Locate the PyTorch and llama.cpp logits dumps for MODEL_PATH and run
    the lightweight sanity check; exits 0 on success, 1 on any failure."""
    model_path = os.getenv('MODEL_PATH')
    if not model_path:
        print("Error: MODEL_PATH environment variable not set")
        sys.exit(1)

    if not os.path.exists(model_path):
        print(f"Error: Model file not found: {model_path}")
        sys.exit(1)

    model_name = os.path.splitext(os.path.basename(model_path))[0]
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"

    # Both dumps must exist before the comparison can run; each is produced
    # by its own generator script.
    required = (
        (pytorch_file, "PyTorch", "scripts/run-org-model.sh"),
        (llamacpp_file, "llama.cpp", "scripts/run-converted-model.sh"),
    )
    for path, label, script in required:
        if not path.exists():
            print(f"Error: {label} logits file not found: {path}")
            print(f"Please run {script} first to generate this file.")
            sys.exit(1)

    print("Checked all required files were found. Proceeding...\n")

    print("🔍 GGML Model Validation for model ", model_name)
    print("=" * 40)
    print(f"PyTorch logits : {pytorch_file}")
    print(f"llama.cpp logits: {llamacpp_file}")
    print()

    if quick_logits_check(pytorch_file, llamacpp_file):
        print("✅ OK: Lightweight model check successful!")
        print(" Ok to proceed with NMSE check...")
        sys.exit(0)

    print(f"❌ NOK: Top 10 predictions don't match - generation will differ")
    sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/env bash

# Convert a HuggingFace model (MODEL_PATH) to GGUF via convert_hf_to_gguf.py.
# Option:
#   --mmproj   also produce the multimodal projector file
# Environment: MODEL_PATH, MODEL_NAME, OUTPUT_DIR, OUTTYPE, METADATA_OVERRIDE

set -e

# Parse command line arguments
MMPROJ=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --mmproj)
            MMPROJ="--mmproj"
            shift
            ;;
        *)
            shift
            ;;
    esac
done

MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"

echo "Model path: ${MODEL_PATH}"
echo "Model name: ${MODEL_NAME}"
echo "Data type: ${TYPE}"
# Fix: the original printed "path::" with a doubled colon.
echo "Converted model path: ${CONVERTED_MODEL}"
echo "Metadata override: ${METADATA_OVERRIDE}"

# Build the command as an array so optional arguments stay properly quoted.
CMD_ARGS=("python" "../../convert_hf_to_gguf.py" "--verbose")
CMD_ARGS+=("${MODEL_PATH}")
CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}")
CMD_ARGS+=("--outtype" "${TYPE}")
[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}")
[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}")

"${CMD_ARGS[@]}"

echo ""
echo "The environment variable CONVERTED_MODEL can be set to this path using:"
# Fix: quote the realpath argument so paths with spaces resolve correctly.
echo "export CONVERTED_MODEL=$(realpath "${CONVERTED_MODEL}")"
if [[ -n "$MMPROJ" ]]; then
    mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")"
    echo "The mmproj model was created in $(realpath "$mmproj_file")"
fi
|
||||||
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
|
||||||
|
base_model:
|
||||||
|
- {base_model}
|
||||||
|
---
|
||||||
|
# {model_name} GGUF
|
||||||
|
|
||||||
|
Recommended way to run this model:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
llama-server -hf {namespace}/{model_name}-GGUF -c 0 -fa
|
||||||
|
```
|
||||||
|
|
||||||
|
Then, access http://localhost:8080
|
||||||
|
|
@ -0,0 +1,114 @@
|
||||||
|
#!/usr/bin/env python3
"""Run the original (HuggingFace) model on a fixed prompt and dump the
last-layer hidden states (per-token embeddings) to
data/pytorch-<model>-embeddings.{bin,txt} for comparison with the converted
llama.cpp model."""

import argparse
import os
import importlib
import torch
import numpy as np

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pathlib import Path

# Optional: name of an unreleased transformers model to import dynamically.
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()

# The MODEL_PATH environment variable takes precedence over --model-path.
model_path = os.environ.get('MODEL_PATH', args.model_path)
if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")

config = AutoConfig.from_pretrained(model_path)

print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
print("Hidden size: ", config.hidden_size)
print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)

print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

if unreleased_model_name:
    # Unreleased models live under transformers.models.<name>.modular_<name>;
    # import the ForCausalLM class dynamically, falling back to AutoModel.
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
    class_name = f"{unreleased_model_name}ForCausalLM"
    print(f"Importing unreleased model module: {unreleased_module_path}")

    try:
        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
        model = model_class.from_pretrained(model_path)
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        print("Falling back to AutoModelForCausalLM")
        model = AutoModelForCausalLM.from_pretrained(model_path)
else:
    model = AutoModelForCausalLM.from_pretrained(model_path)
print(f"Model class: {type(model)}")

model_name = os.path.basename(model_path)
print(f"Model name: {model_name}")

prompt = "Hello world today"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")

with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True)

# Extract hidden states from the last layer
# outputs.hidden_states is a tuple of (num_layers + 1) tensors
# Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size]
last_hidden_states = outputs.hidden_states[-1]

# Get embeddings for all tokens
token_embeddings = last_hidden_states[0].cpu().numpy()  # Remove batch dimension

print(f"Hidden states shape: {last_hidden_states.shape}")
print(f"Token embeddings shape: {token_embeddings.shape}")
print(f"Hidden dimension: {token_embeddings.shape[-1]}")
print(f"Number of tokens: {token_embeddings.shape[0]}")

# Save raw token embeddings
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"

# Save all token embeddings as binary
print(token_embeddings)
token_embeddings.astype(np.float32).tofile(bin_filename)

# Save as text for inspection
with open(txt_filename, "w") as f:
    for i, embedding in enumerate(token_embeddings):
        for j, val in enumerate(embedding):
            f.write(f"{i} {j} {val:.6f}\n")

# Print embeddings per token in the requested format
print("\nToken embeddings:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
for i, embedding in enumerate(token_embeddings):
    # Format: show first few values, ..., then last few values
    if len(embedding) > 10:
        # Show first 3 and last 3 values with ... in between
        first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3])
        last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:])
        print(f"embedding {i}: {first_vals} ... {last_vals}")
    else:
        # If embedding is short, show all values
        vals = " ".join(f"{val:8.6f}" for val in embedding)
        print(f"embedding {i}: {vals}")

# Also show token info for reference
print("\nToken reference:")
for i, token in enumerate(tokens):
    print(f" Token {i}: {repr(token)}")

print(f"Saved bin logits to: {bin_filename}")
# Fix: original message misspelled "logits" as "logist".
print(f"Saved txt logits to: {txt_filename}")
|
||||||
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# First try command line argument, then environment variable, then file
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-logits -j8
|
||||||
|
|
||||||
|
../../build/bin/llama-logits -m $CONVERTED_MODEL -embd-mode "Hello world today"
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# First try command line argument, then environment variable, then file
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $CONVERTED_MODEL
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-logits -j8
|
||||||
|
|
||||||
|
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import importlib
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||||
|
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
model_path = os.environ.get('MODEL_PATH', args.model_path)
|
||||||
|
if model_path is None:
|
||||||
|
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||||
|
|
||||||
|
config = AutoConfig.from_pretrained(model_path)
|
||||||
|
|
||||||
|
print("Model type: ", config.model_type)
|
||||||
|
print("Vocab size: ", config.vocab_size)
|
||||||
|
print("Hidden size: ", config.hidden_size)
|
||||||
|
print("Number of layers: ", config.num_hidden_layers)
|
||||||
|
print("BOS token id: ", config.bos_token_id)
|
||||||
|
print("EOS token id: ", config.eos_token_id)
|
||||||
|
|
||||||
|
print("Loading model and tokenizer using AutoTokenizer:", model_path)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||||
|
config = AutoConfig.from_pretrained(model_path)
|
||||||
|
|
||||||
|
if unreleased_model_name:
|
||||||
|
model_name_lower = unreleased_model_name.lower()
|
||||||
|
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||||
|
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||||
|
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||||
|
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
|
||||||
|
except (ImportError, AttributeError) as e:
|
||||||
|
print(f"Failed to import or load model: {e}")
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(model_path)
|
||||||
|
|
||||||
|
model_name = os.path.basename(model_path)
|
||||||
|
# Printing the Model class to allow for easier debugging. This can be useful
|
||||||
|
# when working with models that have not been publicly released yet and this
|
||||||
|
# migth require that the concrete class is imported and used directly instead
|
||||||
|
# of using AutoModelForCausalLM.
|
||||||
|
print(f"Model class: {model.__class__.__name__}")
|
||||||
|
|
||||||
|
prompt = "Hello, my name is"
|
||||||
|
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
|
||||||
|
|
||||||
|
print(f"Input tokens: {input_ids}")
|
||||||
|
print(f"Input text: {repr(prompt)}")
|
||||||
|
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(input_ids)
|
||||||
|
logits = outputs.logits
|
||||||
|
|
||||||
|
# Extract logits for the last token (next token prediction)
|
||||||
|
last_logits = logits[0, -1, :].cpu().numpy()
|
||||||
|
|
||||||
|
print(f"Logits shape: {logits.shape}")
|
||||||
|
print(f"Last token logits shape: {last_logits.shape}")
|
||||||
|
print(f"Vocab size: {len(last_logits)}")
|
||||||
|
|
||||||
|
data_dir = Path("data")
|
||||||
|
data_dir.mkdir(exist_ok=True)
|
||||||
|
bin_filename = data_dir / f"pytorch-{model_name}.bin"
|
||||||
|
txt_filename = data_dir / f"pytorch-{model_name}.txt"
|
||||||
|
|
||||||
|
# Save to file for comparison
|
||||||
|
last_logits.astype(np.float32).tofile(bin_filename)
|
||||||
|
|
||||||
|
# Also save as text file for easy inspection
|
||||||
|
with open(txt_filename, "w") as f:
|
||||||
|
for i, logit in enumerate(last_logits):
|
||||||
|
f.write(f"{i}: {logit:.6f}\n")
|
||||||
|
|
||||||
|
# Print some sample logits for quick verification
|
||||||
|
print(f"First 10 logits: {last_logits[:10]}")
|
||||||
|
print(f"Last 10 logits: {last_logits[-10:]}")
|
||||||
|
|
||||||
|
# Show top 5 predicted tokens
|
||||||
|
top_indices = np.argsort(last_logits)[-5:][::-1]
|
||||||
|
print("Top 5 predictions:")
|
||||||
|
for idx in top_indices:
|
||||||
|
token = tokenizer.decode([idx])
|
||||||
|
print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}")
|
||||||
|
|
||||||
|
print(f"Saved bin logits to: {bin_filename}")
|
||||||
|
print(f"Saved txt logist to: {txt_filename}")
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
|
||||||
|
MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"
|
||||||
|
|
||||||
|
if [ -t 0 ]; then
|
||||||
|
CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
|
||||||
|
else
|
||||||
|
# Process piped JSON data and convert to binary (matching logits.cpp format)
|
||||||
|
TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn)
|
||||||
|
python3 -c "
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import struct
|
||||||
|
|
||||||
|
data = json.load(sys.stdin)
|
||||||
|
|
||||||
|
# Flatten all embeddings completely
|
||||||
|
flattened = []
|
||||||
|
for item in data:
|
||||||
|
embedding = item['embedding']
|
||||||
|
for token_embedding in embedding:
|
||||||
|
flattened.extend(token_embedding)
|
||||||
|
|
||||||
|
print(f'Total embedding values: {len(flattened)}', file=sys.stderr)
|
||||||
|
|
||||||
|
# Write as binary floats - matches logitc.cpp fwrite format
|
||||||
|
with open('$TEMP_FILE', 'wb') as f:
|
||||||
|
for value in flattened:
|
||||||
|
f.write(struct.pack('f', value))
|
||||||
|
"
|
||||||
|
CPP_EMBEDDINGS="$TEMP_FILE"
|
||||||
|
trap "rm -f $TEMP_FILE" EXIT
|
||||||
|
fi
|
||||||
|
|
||||||
|
python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
|
||||||
|
--python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
|
||||||
|
--cpp-embeddings $CPP_EMBEDDINGS \
|
||||||
|
--prompt "Hello world today"
|
||||||
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
|
||||||
|
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
|
||||||
|
TYPE="${OUTTYPE:-f16}"
|
||||||
|
METADATA_OVERRIDE="${METADATA_OVERRIDE:-}"
|
||||||
|
CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf"
|
||||||
|
|
||||||
|
echo "Model path: ${EMBEDDING_MODEL_PATH}"
|
||||||
|
echo "Model name: ${MODEL_NAME}"
|
||||||
|
echo "Data type: ${TYPE}"
|
||||||
|
echo "Converted model path:: ${CONVERTED_MODEL}"
|
||||||
|
python ../../convert_hf_to_gguf.py --verbose \
|
||||||
|
${EMBEDDING_MODEL_PATH} \
|
||||||
|
--outfile ${CONVERTED_MODEL} \
|
||||||
|
--outtype ${TYPE}
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
|
||||||
|
echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})"
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
---
|
||||||
|
base_model:
|
||||||
|
- {base_model}
|
||||||
|
---
|
||||||
|
# {model_name} GGUF
|
||||||
|
|
||||||
|
Recommended way to run this model:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
llama-server -hf {namespace}/{model_name}-GGUF
|
||||||
|
```
|
||||||
|
|
||||||
|
Then the endpoint can be accessed at http://localhost:8080/embedding, for
|
||||||
|
example using `curl`:
|
||||||
|
```console
|
||||||
|
curl --request POST \
|
||||||
|
--url http://localhost:8080/embedding \
|
||||||
|
--header "Content-Type: application/json" \
|
||||||
|
--data '{{"input": "Hello embeddings"}}' \
|
||||||
|
--silent
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, the `llama-embedding` command line tool can be used:
|
||||||
|
```sh
|
||||||
|
llama-embedding -hf {namespace}/{model_name}-GGUF --verbose-prompt -p "Hello embeddings"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### embd_normalize
|
||||||
|
When a model uses pooling, or the pooling method is specified using `--pooling`,
|
||||||
|
the normalization can be controlled by the `embd_normalize` parameter.
|
||||||
|
|
||||||
|
The default value is `2` which means that the embeddings are normalized using
|
||||||
|
the Euclidean norm (L2). Other options are:
|
||||||
|
* -1 No normalization
|
||||||
|
* 0 Max absolute
|
||||||
|
* 1 Taxicab
|
||||||
|
* 2 Euclidean/L2
|
||||||
|
* \>2 P-Norm
|
||||||
|
|
||||||
|
This can be passed in the request body to `llama-server`, for example:
|
||||||
|
```sh
|
||||||
|
--data '{{"input": "Hello embeddings", "embd_normalize": -1}}' \
|
||||||
|
```
|
||||||
|
|
||||||
|
And for `llama-embedding`, by passing `--embd-normalize <value>`, for example:
|
||||||
|
```sh
|
||||||
|
llama-embedding -hf {namespace}/{model_name}-GGUF --embd-normalize -1 -p "Hello embeddings"
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# First try command line argument, then environment variable, then file
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $CONVERTED_MODEL
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-logits -j8
|
||||||
|
|
||||||
|
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
|
||||||
|
|
@ -0,0 +1,116 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import importlib
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoConfig, AutoModel
|
||||||
|
import torch
|
||||||
|
|
||||||
|
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||||
|
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
|
||||||
|
if model_path is None:
|
||||||
|
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||||
|
|
||||||
|
if unreleased_model_name:
|
||||||
|
model_name_lower = unreleased_model_name.lower()
|
||||||
|
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||||
|
class_name = f"{unreleased_model_name}Model"
|
||||||
|
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||||
|
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
|
||||||
|
except (ImportError, AttributeError) as e:
|
||||||
|
print(f"Failed to import or load model: {e}")
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
model = AutoModel.from_pretrained(model_path)
|
||||||
|
print(f"Model class: {type(model)}")
|
||||||
|
#print(f"Model file: {type(model).__module__}")
|
||||||
|
config = AutoConfig.from_pretrained(model_path)
|
||||||
|
|
||||||
|
model_name = os.path.basename(model_path)
|
||||||
|
|
||||||
|
texts = [ "Hello world today" ]
|
||||||
|
|
||||||
|
encoded = tokenizer(
|
||||||
|
texts,
|
||||||
|
padding=True,
|
||||||
|
truncation=True,
|
||||||
|
return_tensors="pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
tokens = encoded['input_ids'][0]
|
||||||
|
token_strings = tokenizer.convert_ids_to_tokens(tokens)
|
||||||
|
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
|
||||||
|
print(f"{token_id:6d} -> '{token_str}'")
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(**encoded)
|
||||||
|
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
|
||||||
|
|
||||||
|
# Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
|
||||||
|
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
|
||||||
|
|
||||||
|
print(f"Hidden states shape: {hidden_states.shape}")
|
||||||
|
print(f"All embeddings shape: {all_embeddings.shape}")
|
||||||
|
print(f"Embedding dimension: {all_embeddings.shape[1]}")
|
||||||
|
|
||||||
|
# Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
|
||||||
|
n_embd = all_embeddings.shape[1]
|
||||||
|
n_embd_count = all_embeddings.shape[0]
|
||||||
|
|
||||||
|
print() # Empty line to match C++ output
|
||||||
|
|
||||||
|
for j in range(n_embd_count):
|
||||||
|
embedding = all_embeddings[j]
|
||||||
|
print(f"embedding {j}: ", end="")
|
||||||
|
|
||||||
|
# Print first 3 values
|
||||||
|
for i in range(min(3, n_embd)):
|
||||||
|
print(f"{embedding[i]:9.6f} ", end="")
|
||||||
|
|
||||||
|
print(" ... ", end="")
|
||||||
|
|
||||||
|
# Print last 3 values
|
||||||
|
for i in range(n_embd - 3, n_embd):
|
||||||
|
print(f"{embedding[i]:9.6f} ", end="")
|
||||||
|
|
||||||
|
print() # New line
|
||||||
|
|
||||||
|
print() # Final empty line to match C++ output
|
||||||
|
|
||||||
|
data_dir = Path("data")
|
||||||
|
data_dir.mkdir(exist_ok=True)
|
||||||
|
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
|
||||||
|
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
|
||||||
|
|
||||||
|
# Save all embeddings flattened (matching what embedding.cpp would save if it did)
|
||||||
|
flattened_embeddings = all_embeddings.flatten()
|
||||||
|
flattened_embeddings.astype(np.float32).tofile(bin_filename)
|
||||||
|
|
||||||
|
with open(txt_filename, "w") as f:
|
||||||
|
f.write(f"# Model class: {model_name}\n")
|
||||||
|
f.write(f"# Tokens: {token_strings}\n")
|
||||||
|
f.write(f"# Shape: {all_embeddings.shape}\n")
|
||||||
|
f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
|
||||||
|
|
||||||
|
for j in range(n_embd_count):
|
||||||
|
f.write(f"# Token {j} ({token_strings[j]}):\n")
|
||||||
|
for i, value in enumerate(all_embeddings[j]):
|
||||||
|
f.write(f"{j}_{i}: {value:.6f}\n")
|
||||||
|
f.write("\n")
|
||||||
|
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
|
||||||
|
print("")
|
||||||
|
print(f"Saved bin embeddings to: {bin_filename}")
|
||||||
|
print(f"Saved txt embeddings to: {txt_filename}")
|
||||||
|
|
@ -0,0 +1,174 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def calculate_nmse(reference, test):
|
||||||
|
mse = np.mean((test - reference) ** 2)
|
||||||
|
ref_var = np.var(reference)
|
||||||
|
if ref_var == 0:
|
||||||
|
nmse = float('inf') if mse > 0 else 0.0
|
||||||
|
return mse, mse, ref_var
|
||||||
|
|
||||||
|
nmse = mse / ref_var
|
||||||
|
|
||||||
|
return nmse, mse, ref_var
|
||||||
|
|
||||||
|
def load_logits(file_path):
|
||||||
|
if not os.path.exists(file_path):
|
||||||
|
raise FileNotFoundError(f"File not found: {file_path}")
|
||||||
|
|
||||||
|
if file_path.suffix == '.npy':
|
||||||
|
return np.load(file_path)
|
||||||
|
elif file_path.suffix == '.bin':
|
||||||
|
return np.fromfile(file_path, dtype=np.float32)
|
||||||
|
else:
|
||||||
|
# Try to load as text file
|
||||||
|
try:
|
||||||
|
# If it has index format "0: value", extract just values
|
||||||
|
data = []
|
||||||
|
with open(file_path, 'r') as f:
|
||||||
|
for line in f:
|
||||||
|
if ':' in line:
|
||||||
|
# Format: "index: value"
|
||||||
|
value = float(line.split(':')[1].strip())
|
||||||
|
else:
|
||||||
|
# Just the value
|
||||||
|
value = float(line.strip())
|
||||||
|
data.append(value)
|
||||||
|
return np.array(data, dtype=np.float32)
|
||||||
|
except:
|
||||||
|
return np.loadtxt(file_path, dtype=np.float32)
|
||||||
|
|
||||||
|
def interpret_nmse(nmse):
|
||||||
|
"""Provide interpretation of NMSE value"""
|
||||||
|
if nmse == 0:
|
||||||
|
return "Perfect match", "🎉"
|
||||||
|
elif nmse < 1e-6:
|
||||||
|
return "Essentially identical", "✅"
|
||||||
|
elif nmse < 1e-4:
|
||||||
|
return "Excellent match", "✅"
|
||||||
|
elif nmse < 1e-3:
|
||||||
|
return "Very good match", "👍"
|
||||||
|
elif nmse < 1e-2:
|
||||||
|
return "Good match", "👍"
|
||||||
|
elif nmse < 0.1:
|
||||||
|
return "Acceptable match", "⚠️"
|
||||||
|
elif nmse < 1.0:
|
||||||
|
return "Poor match", "❌"
|
||||||
|
else:
|
||||||
|
return "Very poor match (worse than noise)", "❌"
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description='Validate model logits')
|
||||||
|
parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
model_name = os.path.splitext(os.path.basename(args.model_path))[0]
|
||||||
|
data_dir = Path("data")
|
||||||
|
|
||||||
|
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
|
||||||
|
llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"
|
||||||
|
|
||||||
|
print(f"Model name: {model_name}")
|
||||||
|
print(f"PyTorch logits file: {pytorch_file}")
|
||||||
|
print(f"llama.cpp logits file: {llamacpp_file}")
|
||||||
|
|
||||||
|
reference_file = pytorch_file
|
||||||
|
test_file = llamacpp_file
|
||||||
|
|
||||||
|
print("📊 NMSE Check for Model Comparison")
|
||||||
|
print("=" * 50)
|
||||||
|
print(f"Reference (ground truth): {reference_file}")
|
||||||
|
print(f"Test (to evaluate): {test_file}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
print("Loading reference logits...")
|
||||||
|
reference = load_logits(reference_file)
|
||||||
|
print(f" Shape: {reference.shape}, Type: {reference.dtype}")
|
||||||
|
|
||||||
|
print("Loading test logits...")
|
||||||
|
test = load_logits(test_file)
|
||||||
|
print(f" Shape: {test.shape}, Type: {test.dtype}")
|
||||||
|
|
||||||
|
# Check shapes match
|
||||||
|
if reference.shape != test.shape:
|
||||||
|
print(f"\n❌ Error: Shape mismatch!")
|
||||||
|
print(f" Reference: {reference.shape}")
|
||||||
|
print(f" Test: {test.shape}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
print(f"\n✅ Shapes match: {reference.shape}")
|
||||||
|
|
||||||
|
nmse, mse, ref_var = calculate_nmse(reference, test)
|
||||||
|
|
||||||
|
# Additional metrics
|
||||||
|
max_abs_error = np.max(np.abs(test - reference))
|
||||||
|
mean_abs_error = np.mean(np.abs(test - reference))
|
||||||
|
|
||||||
|
# Results
|
||||||
|
print(f"\n📈 METRICS")
|
||||||
|
print("=" * 30)
|
||||||
|
print(f"MSE (Mean Squared Error): {mse:.6e}")
|
||||||
|
print(f"Reference Variance: {ref_var:.6e}")
|
||||||
|
print(f"NMSE: {nmse:.6e}")
|
||||||
|
print(f"Max Absolute Error: {max_abs_error:.6f}")
|
||||||
|
print(f"Mean Absolute Error: {mean_abs_error:.6f}")
|
||||||
|
|
||||||
|
# NMSE in dB (common in signal processing)
|
||||||
|
if nmse > 0:
|
||||||
|
nmse_db = 10 * np.log10(nmse)
|
||||||
|
print(f"NMSE (dB): {nmse_db:.2f} dB")
|
||||||
|
|
||||||
|
# Interpretation
|
||||||
|
interpretation, emoji = interpret_nmse(nmse)
|
||||||
|
print(f"\n🎯 INTERPRETATION")
|
||||||
|
print("=" * 30)
|
||||||
|
print(f"{emoji} {interpretation}")
|
||||||
|
|
||||||
|
# Detailed guidance
|
||||||
|
print(f"\n📋 GUIDANCE")
|
||||||
|
print("=" * 30)
|
||||||
|
if nmse < 1e-3:
|
||||||
|
print("✅ EXCELLENT: Your GGML conversion is working very well!")
|
||||||
|
print(" The differences are negligible for practical use.")
|
||||||
|
elif nmse < 1e-2:
|
||||||
|
print("👍 GOOD: Your GGML conversion is working well.")
|
||||||
|
print(" Small differences are likely due to precision/quantization.")
|
||||||
|
elif nmse < 0.1:
|
||||||
|
print("⚠️ ACCEPTABLE: Conversion is working but with some differences.")
|
||||||
|
print(" Check if you're using quantization (Q4, Q8, etc.)")
|
||||||
|
print(" Test generation quality to see if it's acceptable.")
|
||||||
|
else:
|
||||||
|
print("❌ PROBLEMATIC: Large differences detected.")
|
||||||
|
print(" Check your conversion process for potential issues.")
|
||||||
|
print(" Verify you're using the same model weights.")
|
||||||
|
|
||||||
|
# NMSE benchmarks
|
||||||
|
print(f"\n📚 NMSE BENCHMARKS")
|
||||||
|
print("=" * 30)
|
||||||
|
print("< 1e-6: Essentially identical")
|
||||||
|
print("< 1e-4: Excellent (typical for good conversions)")
|
||||||
|
print("< 1e-3: Very good")
|
||||||
|
print("< 1e-2: Good (acceptable for most use cases)")
|
||||||
|
print("< 0.1: Acceptable (may need verification)")
|
||||||
|
print("> 1.0: Poor (worse than random)")
|
||||||
|
|
||||||
|
# Exit code based on NMSE
|
||||||
|
if nmse < 1e-2:
|
||||||
|
print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
|
||||||
|
echo "Created collection: $COLLECTION_SLUG"
|
||||||
|
|
||||||
|
# Use it in the next command
|
||||||
|
python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
curl --request POST \
|
||||||
|
--url http://localhost:8080/embedding \
|
||||||
|
--header "Content-Type: application/json" \
|
||||||
|
--data '{"input": "Hello world today"}' \
|
||||||
|
--silent
|
||||||
|
|
@ -0,0 +1,80 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from huggingface_hub import HfApi
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def add_model_to_collection(collection_slug, model_id, note=""):
|
||||||
|
"""
|
||||||
|
Add a model to an existing collection
|
||||||
|
|
||||||
|
Args:
|
||||||
|
collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
|
||||||
|
model_id: The model repository ID (e.g., "username/model-name")
|
||||||
|
note: Optional note about the model
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if successful, False if failed
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Initialize API
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
try:
|
||||||
|
user_info = api.whoami()
|
||||||
|
print(f"✅ Authenticated as: {user_info['name']}")
|
||||||
|
|
||||||
|
# Verify the model exists
|
||||||
|
print(f"🔍 Checking if model exists: {model_id}")
|
||||||
|
try:
|
||||||
|
model_info = api.model_info(model_id)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Model not found or not accessible: {model_id}")
|
||||||
|
print(f"Error: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print(f"📚 Adding model to collection...")
|
||||||
|
api.add_collection_item(
|
||||||
|
collection_slug=collection_slug,
|
||||||
|
item_id=model_id,
|
||||||
|
item_type="model",
|
||||||
|
note=note
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"✅ Model added to collection successfully!")
|
||||||
|
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error adding model to collection: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# This script requires that the environment variable HF_TOKEN is set with your
|
||||||
|
# Hugging Face API token.
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
|
||||||
|
parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
|
||||||
|
parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
|
||||||
|
parser.add_argument('--note', '-n', help='An optional note/description', required=False)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
collection = args.collection
|
||||||
|
model = args.model
|
||||||
|
note = args.note
|
||||||
|
|
||||||
|
success = add_model_to_collection(
|
||||||
|
collection_slug=collection,
|
||||||
|
model_id=model,
|
||||||
|
note=note
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
print("\n🎉 Model added successfully!")
|
||||||
|
else:
|
||||||
|
print("\n❌ Failed to add model to collection")
|
||||||
|
sys.exit(1)
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,106 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from huggingface_hub import HfApi
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def create_collection(title, description, private=False, namespace=None, return_slug=False):
|
||||||
|
"""
|
||||||
|
Create a new collection on Hugging Face
|
||||||
|
|
||||||
|
Args:
|
||||||
|
title: Collection title
|
||||||
|
description: Collection description
|
||||||
|
private: Whether the collection should be private (default: False)
|
||||||
|
namespace: Optional namespace (defaults to your username)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Collection object if successful, None if failed
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Check if HF_TOKEN is available
|
||||||
|
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
|
||||||
|
if not token:
|
||||||
|
print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
|
||||||
|
print("Please set your Hugging Face token as an environment variable")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Initialize API
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Test authentication first
|
||||||
|
user_info = api.whoami()
|
||||||
|
if not return_slug:
|
||||||
|
print(f"✅ Authenticated as: {user_info['name']}")
|
||||||
|
|
||||||
|
# Create the collection
|
||||||
|
if not return_slug:
|
||||||
|
print(f"📚 Creating collection: '{title}'...")
|
||||||
|
collection = api.create_collection(
|
||||||
|
title=title,
|
||||||
|
description=description,
|
||||||
|
private=private,
|
||||||
|
namespace=namespace
|
||||||
|
)
|
||||||
|
|
||||||
|
if not return_slug:
|
||||||
|
print(f"✅ Collection created successfully!")
|
||||||
|
print(f"📋 Collection slug: {collection.slug}")
|
||||||
|
print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")
|
||||||
|
|
||||||
|
return collection
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error creating collection: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# This script requires that the environment variable HF_TOKEN is set with your
|
||||||
|
# Hugging Face API token.
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
|
||||||
|
parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
|
||||||
|
parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
|
||||||
|
parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
|
||||||
|
parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true') # Fixed
|
||||||
|
parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true') # Fixed
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
name = args.name
|
||||||
|
description = args.description
|
||||||
|
private = args.private
|
||||||
|
namespace = args.namespace
|
||||||
|
return_slug = args.return_slug
|
||||||
|
|
||||||
|
if not return_slug:
|
||||||
|
print("🚀 Creating Hugging Face Collection")
|
||||||
|
print(f"Title: {name}")
|
||||||
|
print(f"Description: {description}")
|
||||||
|
print(f"Namespace: {namespace}")
|
||||||
|
print(f"Private: {private}")
|
||||||
|
|
||||||
|
collection = create_collection(
|
||||||
|
title=name,
|
||||||
|
description=description,
|
||||||
|
private=private,
|
||||||
|
namespace=namespace,
|
||||||
|
return_slug=return_slug
|
||||||
|
)
|
||||||
|
|
||||||
|
if collection:
|
||||||
|
if return_slug:
|
||||||
|
print(collection.slug)
|
||||||
|
else:
|
||||||
|
print("\n🎉 Collection created successfully!")
|
||||||
|
print(f"Use this slug to add models: {collection.slug}")
|
||||||
|
else:
|
||||||
|
print("\n❌ Failed to create collection")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,78 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from huggingface_hub import HfApi
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# This script requires that the environment variable HF_TOKEN is set with your
|
||||||
|
# Hugging Face API token.
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
def load_template_and_substitute(template_path, **kwargs):
|
||||||
|
try:
|
||||||
|
with open(template_path, 'r', encoding='utf-8') as f:
|
||||||
|
template_content = f.read()
|
||||||
|
|
||||||
|
return template_content.format(**kwargs)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print(f"Template file '{template_path}' not found!")
|
||||||
|
return None
|
||||||
|
except KeyError as e:
|
||||||
|
print(f"Missing template variable: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
|
||||||
|
parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
|
||||||
|
parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
|
||||||
|
parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
|
||||||
|
parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
|
||||||
|
parser.add_argument('--private', '-p', action='store_true', help='Create private model')
|
||||||
|
parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template')
|
||||||
|
parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
repo_id = f"{args.namespace}/{args.model_name}-GGUF"
|
||||||
|
print("Repository ID: ", repo_id)
|
||||||
|
|
||||||
|
repo_url = None
|
||||||
|
if not args.dry_run:
|
||||||
|
repo_url = api.create_repo(
|
||||||
|
repo_id=repo_id,
|
||||||
|
repo_type="model",
|
||||||
|
private=args.private,
|
||||||
|
exist_ok=False
|
||||||
|
)
|
||||||
|
|
||||||
|
if not args.no_card:
|
||||||
|
if args.embedding:
|
||||||
|
template_path = "scripts/embedding/modelcard.template"
|
||||||
|
else:
|
||||||
|
template_path = "scripts/causal/modelcard.template"
|
||||||
|
|
||||||
|
print("Template path: ", template_path)
|
||||||
|
|
||||||
|
model_card_content = load_template_and_substitute(
|
||||||
|
template_path,
|
||||||
|
model_name=args.model_name,
|
||||||
|
namespace=args.namespace,
|
||||||
|
base_model=args.org_base_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.dry_run:
|
||||||
|
print("\nTemplate Content:\n")
|
||||||
|
print(model_card_content)
|
||||||
|
else:
|
||||||
|
if model_card_content:
|
||||||
|
api.upload_file(
|
||||||
|
path_or_fileobj=model_card_content.encode('utf-8'),
|
||||||
|
path_in_repo="README.md",
|
||||||
|
repo_id=repo_id
|
||||||
|
)
|
||||||
|
print("Model card created successfully.")
|
||||||
|
else:
|
||||||
|
print("Failed to create model card.")
|
||||||
|
|
||||||
|
if not args.dry_run and repo_url:
|
||||||
|
print(f"Repository created: {repo_url}")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,58 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from huggingface_hub import HfApi
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
|
||||||
|
"""
|
||||||
|
Upload a GGUF file to a Hugging Face model repository
|
||||||
|
|
||||||
|
Args:
|
||||||
|
local_file_path: Path to your local GGUF file
|
||||||
|
repo_id: Your repository ID (e.g., "username/model-name")
|
||||||
|
filename_in_repo: Optional custom name for the file in the repo
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not os.path.exists(local_file_path):
|
||||||
|
print(f"❌ File not found: {local_file_path}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if filename_in_repo is None:
|
||||||
|
filename_in_repo = os.path.basename(local_file_path)
|
||||||
|
|
||||||
|
if filename_in_repo is None or filename_in_repo == "":
|
||||||
|
filename_in_repo = os.path.basename(local_file_path)
|
||||||
|
|
||||||
|
print(f"📤 Uploading {local_file_path} to {repo_id}/{filename_in_repo}")
|
||||||
|
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
try:
|
||||||
|
api.upload_file(
|
||||||
|
path_or_fileobj=local_file_path,
|
||||||
|
path_in_repo=filename_in_repo,
|
||||||
|
repo_id=repo_id,
|
||||||
|
repo_type="model",
|
||||||
|
commit_message=f"Upload {filename_in_repo}"
|
||||||
|
)
|
||||||
|
|
||||||
|
print("✅ Upload successful!")
|
||||||
|
print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Upload failed: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# This script requires that the environment variable HF_TOKEN is set with your
|
||||||
|
# Hugging Face API token.
|
||||||
|
api = HfApi()
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
|
||||||
|
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
|
||||||
|
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
|
||||||
|
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# First try command line argument, then environment variable, then file
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from safetensors import safe_open
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description='Process model with specified path')
|
||||||
|
parser.add_argument('--model-path', '-m', help='Path to the model')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
model_path = os.environ.get('MODEL_PATH', args.model_path)
|
||||||
|
if model_path is None:
|
||||||
|
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
|
||||||
|
|
||||||
|
# Check if there's an index file (multi-file model)
|
||||||
|
index_path = os.path.join(model_path, "model.safetensors.index.json")
|
||||||
|
single_file_path = os.path.join(model_path, "model.safetensors")
|
||||||
|
|
||||||
|
if os.path.exists(index_path):
|
||||||
|
# Multi-file model
|
||||||
|
print("Multi-file model detected")
|
||||||
|
|
||||||
|
with open(index_path, 'r') as f:
|
||||||
|
index_data = json.load(f)
|
||||||
|
|
||||||
|
# Get the weight map (tensor_name -> file_name)
|
||||||
|
weight_map = index_data.get("weight_map", {})
|
||||||
|
|
||||||
|
# Group tensors by file for efficient processing
|
||||||
|
file_tensors = defaultdict(list)
|
||||||
|
for tensor_name, file_name in weight_map.items():
|
||||||
|
file_tensors[file_name].append(tensor_name)
|
||||||
|
|
||||||
|
print("Tensors in model:")
|
||||||
|
|
||||||
|
# Process each shard file
|
||||||
|
for file_name, tensor_names in file_tensors.items():
|
||||||
|
file_path = os.path.join(model_path, file_name)
|
||||||
|
print(f"\n--- From {file_name} ---")
|
||||||
|
|
||||||
|
with safe_open(file_path, framework="pt") as f: # type: ignore
|
||||||
|
for tensor_name in sorted(tensor_names):
|
||||||
|
tensor = f.get_tensor(tensor_name)
|
||||||
|
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||||
|
|
||||||
|
elif os.path.exists(single_file_path):
|
||||||
|
# Single file model (original behavior)
|
||||||
|
print("Single-file model detected")
|
||||||
|
|
||||||
|
with safe_open(single_file_path, framework="pt") as f: # type: ignore
|
||||||
|
keys = f.keys()
|
||||||
|
print("Tensors in model:")
|
||||||
|
for key in sorted(keys):
|
||||||
|
tensor = f.get_tensor(key)
|
||||||
|
print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
|
||||||
|
print("Available files:")
|
||||||
|
if os.path.exists(model_path):
|
||||||
|
for item in sorted(os.listdir(model_path)):
|
||||||
|
print(f" {item}")
|
||||||
|
else:
|
||||||
|
print(f" Directory {model_path} does not exist")
|
||||||
|
exit(1)
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if data/wikitext-2-raw directory exists
|
||||||
|
if [ ! -d "ppl/wikitext-2-raw" ]; then
|
||||||
|
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
|
||||||
|
mkdir -p ppl
|
||||||
|
pushd ppl
|
||||||
|
./../../../scripts/get-wikitext-2.sh
|
||||||
|
popd
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p ppl
|
||||||
|
OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld"
|
||||||
|
echo "Model: $CONVERTED_MODEL"
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-perplexity -j8
|
||||||
|
|
||||||
|
../.././build/bin/llama-perplexity -m $CONVERTED_MODEL \
|
||||||
|
-f ppl/wikitext-2-raw/wiki.test.raw \
|
||||||
|
--kl-divergence-base $OUTPUTFILE
|
||||||
|
|
||||||
|
echo "Generated logits in $OUTPUTFILE"
|
||||||
|
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
|
||||||
|
|
||||||
|
if [ -z "$QUANTIZED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. QUANTIZED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check if data/wikitext-2-raw directory exists
|
||||||
|
if [ ! -d "ppl/wikitext-2-raw" ]; then
|
||||||
|
echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
|
||||||
|
mkdir -p ppl
|
||||||
|
pushd ppl
|
||||||
|
./../../../scripts/get-wikitext-2.sh
|
||||||
|
popd
|
||||||
|
fi
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-perplexity -j8
|
||||||
|
|
||||||
|
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,28 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
|
||||||
|
LOGITS_FILE="${1:-"$LOGITS_FILE"}"
|
||||||
|
|
||||||
|
if [ -z "$QUANTIZED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. QUANTIZED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f ${LOGITS_FILE} ]; then
|
||||||
|
echo "Error: logits file '${LOGITS_FILE} was not found"
|
||||||
|
echo "Did you run the perplexity-gen.sh script?"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Model: $QUANTIZED_MODEL"
|
||||||
|
echo "Data file: $LOGITS_FILE"
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-perplexity -j8
|
||||||
|
|
||||||
|
../.././build/bin/llama-perplexity -m $QUANTIZED_MODEL \
|
||||||
|
--kl-divergence-base $LOGITS_FILE \
|
||||||
|
--kl-divergence
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
|
||||||
|
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
|
||||||
|
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
|
||||||
|
QUANTIZED_MODEL=$CONVERTED_MODEL
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -z "$QUANTIZED_TYPE" ]; then
|
||||||
|
echo "Error: QUANTIZED_TYPE is required" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $CONVERTED_MODEL
|
||||||
|
|
||||||
|
# Process the quantized model filename
|
||||||
|
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
|
||||||
|
# Remove .gguf suffix, add quantized type, then add .gguf back
|
||||||
|
BASE_NAME="${QUANTIZED_MODEL%.gguf}"
|
||||||
|
QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
|
||||||
|
else
|
||||||
|
echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-quantize -j8
|
||||||
|
|
||||||
|
echo $TOKEN_EMBD_TYPE
|
||||||
|
echo $OUTPUT_TYPE
|
||||||
|
|
||||||
|
CMD_ARGS=("../../build/bin/llama-quantize")
|
||||||
|
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
|
||||||
|
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
|
||||||
|
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
|
||||||
|
|
||||||
|
"${CMD_ARGS[@]}"
|
||||||
|
|
||||||
|
echo "Quantized model saved to: $QUANTIZED_MODEL"
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
#
|
||||||
|
# First try command line argument, then environment variable, then file
|
||||||
|
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
|
||||||
|
|
||||||
|
# Final check if we have a model path
|
||||||
|
if [ -z "$CONVERTED_MODEL" ]; then
|
||||||
|
echo "Error: Model path must be provided either as:" >&2
|
||||||
|
echo " 1. Command line argument" >&2
|
||||||
|
echo " 2. CONVERTED_MODEL environment variable" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $CONVERTED_MODEL
|
||||||
|
|
||||||
|
cmake --build ../../build --target llama-server
|
||||||
|
|
||||||
|
../../build/bin/llama-server -m $CONVERTED_MODEL \
|
||||||
|
--embedding \
|
||||||
|
--pooling none
|
||||||
|
|
@ -0,0 +1,179 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import importlib
|
||||||
|
|
||||||
|
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
|
||||||
|
|
||||||
|
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
|
||||||
|
|
||||||
|
def cosine_similarity(a, b=None):
|
||||||
|
a = np.asarray(a)
|
||||||
|
if b is None:
|
||||||
|
b = a
|
||||||
|
else:
|
||||||
|
b = np.asarray(b)
|
||||||
|
|
||||||
|
if a.ndim == 1:
|
||||||
|
a = a.reshape(1, -1)
|
||||||
|
if b.ndim == 1:
|
||||||
|
b = b.reshape(1, -1)
|
||||||
|
|
||||||
|
a_norms = np.linalg.norm(a, axis=1, keepdims=True)
|
||||||
|
b_norms = np.linalg.norm(b, axis=1, keepdims=True)
|
||||||
|
|
||||||
|
a_norms = np.where(a_norms == 0, 1e-8, a_norms)
|
||||||
|
b_norms = np.where(b_norms == 0, 1e-8, b_norms)
|
||||||
|
|
||||||
|
a_normalized = a / a_norms
|
||||||
|
b_normalized = b / b_norms
|
||||||
|
|
||||||
|
# Compute cosine similarity
|
||||||
|
return np.dot(a_normalized, b_normalized.T)
|
||||||
|
|
||||||
|
def load_embeddings_from_file(filename, n_tokens, n_embd):
|
||||||
|
embeddings = np.fromfile(filename, dtype=np.float32)
|
||||||
|
return embeddings.reshape(n_tokens, n_embd)
|
||||||
|
|
||||||
|
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
|
||||||
|
np.set_printoptions(suppress=True, precision=6)
|
||||||
|
print("pytorch embeddings:");
|
||||||
|
print(python_emb)
|
||||||
|
print("llama.cpp embeddings:");
|
||||||
|
print(cpp_emb)
|
||||||
|
print(f"\n=== Prompt: '{prompt}' ===")
|
||||||
|
print(f"Tokens: {tokens}")
|
||||||
|
print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
|
||||||
|
|
||||||
|
n_tokens = len(tokens)
|
||||||
|
|
||||||
|
# 1. Direct embedding comparison
|
||||||
|
print(f"\n1. Raw Embedding Magnitude Comparison:")
|
||||||
|
# Check if the distance of each token embedding from the origin and compare
|
||||||
|
# if the vectors are on the same "sphere". This does not tell us about
|
||||||
|
# direction (meaning of the token embedding), just magnitude.
|
||||||
|
for i in range(n_tokens):
|
||||||
|
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
|
||||||
|
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
|
||||||
|
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
|
||||||
|
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
|
||||||
|
|
||||||
|
# 2. Cosine similarity between tokens within each model
|
||||||
|
# Here we check the direction of token embeddings to see if the have the
|
||||||
|
# same meaning (similarity). This is done by calculating cosine similarity
|
||||||
|
# of a pair of token embeddings within each model.
|
||||||
|
print(f"\n2. Within-Model Token Similarities:")
|
||||||
|
print(" Python model:")
|
||||||
|
for i in range(n_tokens):
|
||||||
|
for j in range(i+1, n_tokens):
|
||||||
|
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
|
||||||
|
print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
|
||||||
|
|
||||||
|
print(" llama.cpp model:")
|
||||||
|
for i in range(n_tokens):
|
||||||
|
for j in range(i+1, n_tokens):
|
||||||
|
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
|
||||||
|
print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
|
||||||
|
|
||||||
|
# 3. Cross-model similarity (same token position)
|
||||||
|
print(f"\n3. Cross-Model Same-Token Similarities:")
|
||||||
|
for i in range(n_tokens):
|
||||||
|
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
|
||||||
|
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
|
||||||
|
|
||||||
|
# 4. Similarity matrix comparison
|
||||||
|
print(f"\n4. Similarity Matrix Differences:")
|
||||||
|
py_sim_matrix = cosine_similarity(python_emb)
|
||||||
|
cpp_sim_matrix = cosine_similarity(cpp_emb)
|
||||||
|
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
|
||||||
|
|
||||||
|
print(f" Max difference: {np.max(diff_matrix):.4f}")
|
||||||
|
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
|
||||||
|
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
|
||||||
|
|
||||||
|
return {
|
||||||
|
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
|
||||||
|
'similarity_matrix_diff': diff_matrix,
|
||||||
|
'max_diff': np.max(diff_matrix),
|
||||||
|
'mean_diff': np.mean(diff_matrix),
|
||||||
|
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
|
||||||
|
}
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
|
||||||
|
parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
|
||||||
|
parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file')
|
||||||
|
parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
|
||||||
|
parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
|
||||||
|
parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
# Single prompt detailed comparison
|
||||||
|
print(f"\nTesting with prompt: '{args.prompt}'")
|
||||||
|
|
||||||
|
# Load the python model to get configuration information and also to load the tokenizer.
|
||||||
|
print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
|
||||||
|
config = AutoConfig.from_pretrained(args.model_path)
|
||||||
|
|
||||||
|
if unreleased_model_name:
|
||||||
|
model_name_lower = unreleased_model_name.lower()
|
||||||
|
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
|
||||||
|
if args.causal:
|
||||||
|
class_name = f"{unreleased_model_name}ForCausalLM"
|
||||||
|
else:
|
||||||
|
class_name = f"{unreleased_model_name}Model"
|
||||||
|
print(f"Model class: {class_name}")
|
||||||
|
print(f"Importing unreleased model module: {unreleased_module_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
|
||||||
|
model = model_class.from_pretrained(args.model_path)
|
||||||
|
except (ImportError, AttributeError) as e:
|
||||||
|
print(f"Failed to import or load model: {e}")
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
if args.causal:
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(args.model_path)
|
||||||
|
else:
|
||||||
|
model = AutoModel.from_pretrained(args.model_path)
|
||||||
|
|
||||||
|
encoded = tokenizer(args.prompt, return_tensors="pt")
|
||||||
|
tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
|
||||||
|
n_tokens = len(tokens)
|
||||||
|
print(f"n_tokens: {n_tokens}");
|
||||||
|
print(f"hidden_size: {model.config.hidden_size}")
|
||||||
|
|
||||||
|
# Load binary embeddings from data directory.
|
||||||
|
llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
|
||||||
|
python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)
|
||||||
|
|
||||||
|
# Run comparison
|
||||||
|
results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print(f"\n=== SUMMARY ===")
|
||||||
|
avg_cross_sim = np.mean(results['cross_model_similarities'])
|
||||||
|
print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
|
||||||
|
print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")
|
||||||
|
|
||||||
|
# Quality assessment
|
||||||
|
if avg_cross_sim > 0.95:
|
||||||
|
print("✅ EXCELLENT: Models are highly similar")
|
||||||
|
elif avg_cross_sim > 0.90:
|
||||||
|
print("✅ VERY GOOD: Models are very similar")
|
||||||
|
elif avg_cross_sim > 0.80:
|
||||||
|
print("⚠️ GOOD: Models are reasonably similar")
|
||||||
|
elif avg_cross_sim > 0.70:
|
||||||
|
print("⚠️ FAIR: Models have some differences")
|
||||||
|
else:
|
||||||
|
print("❌ POOR: Models are significantly different")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -11,5 +11,5 @@ See the following PRs for more info:
|
||||||
### Usage
|
### Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
|
llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ https://github.com/ggml-org/llama.cpp/pull/6193
|
||||||
`retrieval` example can be tested as follows:
|
`retrieval` example can be tested as follows:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
|
llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
|
||||||
```
|
```
|
||||||
|
|
||||||
This chunks and embeds all given files and starts a loop requesting query inputs:
|
This chunks and embeds all given files and starts a loop requesting query inputs:
|
||||||
|
|
|
||||||
|
|
@ -59,6 +59,8 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
|
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||||
|
|
||||||
common_init_result llama_init_dft = common_init_from_params(params);
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
|
|
||||||
//model_dft = llama_init_dft.model.get();
|
//model_dft = llama_init_dft.model.get();
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,8 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
|
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||||
|
|
||||||
common_init_result llama_init_dft = common_init_from_params(params);
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
|
|
||||||
model_dft = llama_init_dft.model.get();
|
model_dft = llama_init_dft.model.get();
|
||||||
|
|
@ -242,7 +244,7 @@ int main(int argc, char ** argv) {
|
||||||
// stochastic verification
|
// stochastic verification
|
||||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||||
|
|
||||||
auto & dist_tgt = *common_sampler_get_candidates(smpl);
|
auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
|
||||||
|
|
||||||
float p_tgt = 0.0f;
|
float p_tgt = 0.0f;
|
||||||
float p_dft = 0.0f;
|
float p_dft = 0.0f;
|
||||||
|
|
@ -491,7 +493,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
||||||
|
|
||||||
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
|
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
|
||||||
|
|
||||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||||
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
|
|
|
||||||
|
|
@ -18,8 +18,6 @@ if %errorlevel% neq 0 goto ERROR
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
|
||||||
:: make main
|
|
||||||
|
|
||||||
:: build all binary
|
:: build all binary
|
||||||
cmake --build . -j
|
cmake --build . -j
|
||||||
|
|
|
||||||
|
|
@ -15,15 +15,15 @@
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
params.escape = false;
|
params.escape = false;
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_FINETUNE)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.use_mmap) {
|
if (params.use_mmap) {
|
||||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
|
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
||||||
|
__func__);
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
}
|
}
|
||||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||||
|
|
@ -38,7 +38,6 @@ int main(int argc, char ** argv) {
|
||||||
common_init();
|
common_init();
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
llama_model_ptr & model = llama_init.model;
|
llama_model_ptr & model = llama_init.model;
|
||||||
|
|
@ -55,29 +54,30 @@ int main(int argc, char ** argv) {
|
||||||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr float val_split = 0.05f;
|
|
||||||
|
|
||||||
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
|
||||||
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
|
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get()) / 2);
|
||||||
|
|
||||||
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
|
struct lr_opt & lr = params.lr;
|
||||||
optimizer_params.adamw.alpha = 1e-7f; // learning rate
|
LOG_INF("-optimizer %s -lr0 %.2g -wd %.2g -lr-min %.2g -min-epochs %.2g -epochs %d -period %.2g -val %.2g\n",
|
||||||
|
ggml_opt_optimizer_name(params.optimizer), (double) lr.lr0, (double) lr.wd, (double) lr.lr_min, (double) lr.decay_epochs,
|
||||||
|
(unsigned) lr.epochs, (double) params.n_batch / params.n_ubatch, (double) params.val_split);
|
||||||
|
|
||||||
struct llama_opt_params lopt_params {
|
struct llama_opt_params lopt_params{
|
||||||
/*n_ctx_train =*/ 0,
|
/*n_ctx_train =*/0,
|
||||||
/*param_filter =*/ llama_opt_param_filter_all,
|
/*param_filter =*/llama_opt_param_filter_all,
|
||||||
/*param_filter_ud =*/ nullptr,
|
/*param_filter_ud =*/nullptr,
|
||||||
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
|
/*get_opt_pars =*/common_opt_lr_pars,
|
||||||
/*get_opt_pars_ud =*/ &optimizer_params,
|
/*get_opt_pars_ud =*/¶ms.lr,
|
||||||
|
/*optimizer_type =*/params.optimizer,
|
||||||
};
|
};
|
||||||
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
llama_opt_init(ctx.get(), model.get(), lopt_params);
|
||||||
|
|
||||||
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
|
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);
|
||||||
|
|
||||||
ggml_opt_result_t result_train = ggml_opt_result_init();
|
ggml_opt_result_t result_train = ggml_opt_result_init();
|
||||||
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
ggml_opt_result_t result_eval = ggml_opt_result_init();
|
||||||
|
|
||||||
for (int epoch = 0; epoch < 2; ++epoch) {
|
for (lr.epoch = 0; lr.epoch < lr.epochs; ++lr.epoch) {
|
||||||
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
|
||||||
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
||||||
ggml_opt_result_free(result_train);
|
ggml_opt_result_free(result_train);
|
||||||
ggml_opt_result_free(result_eval);
|
ggml_opt_result_free(result_eval);
|
||||||
|
|
||||||
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
|
llama_model_save_to_file(model.get(), params.out_file.c_str());
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -36,9 +36,6 @@
|
||||||
# ```
|
# ```
|
||||||
# nixConfig = {
|
# nixConfig = {
|
||||||
# extra-substituters = [
|
# extra-substituters = [
|
||||||
# # Populated by the CI in ggml-org/llama.cpp
|
|
||||||
# "https://llama-cpp.cachix.org"
|
|
||||||
#
|
|
||||||
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
# # A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||||
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
# # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||||
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
# # This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||||
|
|
@ -47,10 +44,8 @@
|
||||||
# ];
|
# ];
|
||||||
#
|
#
|
||||||
# # Verify these are the same keys as published on
|
# # Verify these are the same keys as published on
|
||||||
# # - https://app.cachix.org/cache/llama-cpp
|
|
||||||
# # - https://app.cachix.org/cache/cuda-maintainers
|
# # - https://app.cachix.org/cache/cuda-maintainers
|
||||||
# extra-trusted-public-keys = [
|
# extra-trusted-public-keys = [
|
||||||
# "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
|
||||||
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
# "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||||
# ];
|
# ];
|
||||||
# };
|
# };
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||||
project("ggml" C CXX)
|
project("ggml" C CXX ASM)
|
||||||
include(CheckIncludeFileCXX)
|
include(CheckIncludeFileCXX)
|
||||||
|
|
||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
@ -129,7 +129,9 @@ endif()
|
||||||
option(GGML_LASX "ggml: enable lasx" ON)
|
option(GGML_LASX "ggml: enable lasx" ON)
|
||||||
option(GGML_LSX "ggml: enable lsx" ON)
|
option(GGML_LSX "ggml: enable lsx" ON)
|
||||||
option(GGML_RVV "ggml: enable rvv" ON)
|
option(GGML_RVV "ggml: enable rvv" ON)
|
||||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
|
||||||
|
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
|
||||||
|
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
|
||||||
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
|
||||||
option(GGML_VXE "ggml: enable vxe" ON)
|
option(GGML_VXE "ggml: enable vxe" ON)
|
||||||
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
|
||||||
|
|
@ -158,7 +160,6 @@ option(GGML_CUDA "ggml: use CUDA"
|
||||||
option(GGML_MUSA "ggml: use MUSA" OFF)
|
option(GGML_MUSA "ggml: use MUSA" OFF)
|
||||||
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
||||||
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
||||||
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
|
||||||
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||||
"ggml: max. batch size for using peer access")
|
"ggml: max. batch size for using peer access")
|
||||||
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||||
|
|
@ -176,6 +177,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
|
||||||
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
|
||||||
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
|
||||||
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
|
option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
|
||||||
|
option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
|
||||||
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
|
||||||
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
|
||||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||||
|
|
@ -187,6 +189,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
|
||||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||||
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
|
option(GGML_WEBGPU "ggml: use WebGPU" OFF)
|
||||||
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
|
||||||
|
option(GGML_ZDNN "ggml: use zDNN" OFF)
|
||||||
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||||
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
|
||||||
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||||
|
|
|
||||||
|
|
@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)
|
||||||
|
|
||||||
find_library(GGML_LIBRARY ggml
|
find_library(GGML_LIBRARY ggml
|
||||||
REQUIRED
|
REQUIRED
|
||||||
HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
|
HINTS ${GGML_LIB_DIR}
|
||||||
NO_CMAKE_FIND_ROOT_PATH)
|
NO_CMAKE_FIND_ROOT_PATH)
|
||||||
|
|
||||||
add_library(ggml::ggml UNKNOWN IMPORTED)
|
add_library(ggml::ggml UNKNOWN IMPORTED)
|
||||||
|
|
@ -125,6 +125,7 @@ if(NOT TARGET ggml::ggml)
|
||||||
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
|
||||||
|
|
||||||
set(_ggml_all_targets "")
|
set(_ggml_all_targets "")
|
||||||
|
if (NOT GGML_BACKEND_DL)
|
||||||
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
|
||||||
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
|
||||||
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
|
||||||
|
|
@ -173,6 +174,7 @@ if(NOT TARGET ggml::ggml)
|
||||||
|
|
||||||
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
|
||||||
endforeach()
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
|
||||||
set_target_properties(ggml::ggml
|
set_target_properties(ggml::ggml
|
||||||
|
|
|
||||||
|
|
@ -307,6 +307,9 @@ extern "C" {
|
||||||
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||||
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
|
||||||
|
|
||||||
|
// Split graph without allocating it
|
||||||
|
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||||
|
|
||||||
// Allocate and compute graph on the backend scheduler
|
// Allocate and compute graph on the backend scheduler
|
||||||
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
|
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
|
||||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||||
|
|
|
||||||
|
|
@ -74,16 +74,26 @@ extern "C" {
|
||||||
GGML_OPT_BUILD_TYPE_OPT = 30,
|
GGML_OPT_BUILD_TYPE_OPT = 30,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum ggml_opt_optimizer_type {
|
||||||
|
GGML_OPT_OPTIMIZER_TYPE_ADAMW,
|
||||||
|
GGML_OPT_OPTIMIZER_TYPE_SGD,
|
||||||
|
|
||||||
|
GGML_OPT_OPTIMIZER_TYPE_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||||
struct ggml_opt_optimizer_params {
|
struct ggml_opt_optimizer_params {
|
||||||
// AdamW optimizer parameters
|
|
||||||
struct {
|
struct {
|
||||||
float alpha; // learning rate
|
float alpha; // learning rate
|
||||||
float beta1;
|
float beta1; // first AdamW momentum
|
||||||
float beta2;
|
float beta2; // second AdamW momentum
|
||||||
float eps; // epsilon for numerical stability
|
float eps; // epsilon for numerical stability
|
||||||
float wd; // weight decay for AdamW, use 0.0f to disable
|
float wd; // weight decay - 0.0f to disable
|
||||||
} adamw;
|
} adamw;
|
||||||
|
struct {
|
||||||
|
float alpha; // learning rate
|
||||||
|
float wd; // weight decay
|
||||||
|
} sgd;
|
||||||
};
|
};
|
||||||
|
|
||||||
// callback to calculate optimizer parameters prior to a backward pass
|
// callback to calculate optimizer parameters prior to a backward pass
|
||||||
|
|
@ -114,6 +124,9 @@ extern "C" {
|
||||||
|
|
||||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||||
|
|
||||||
|
// only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
|
||||||
|
enum ggml_opt_optimizer_type optimizer;
|
||||||
};
|
};
|
||||||
|
|
||||||
// get parameters for an optimization context with defaults set where possible
|
// get parameters for an optimization context with defaults set where possible
|
||||||
|
|
@ -142,6 +155,10 @@ extern "C" {
|
||||||
// get the gradient accumulator for a node from the forward graph
|
// get the gradient accumulator for a node from the forward graph
|
||||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||||
|
|
||||||
|
GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme
|
||||||
|
|
||||||
|
GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type);
|
||||||
|
|
||||||
// ====== Optimization Result ======
|
// ====== Optimization Result ======
|
||||||
|
|
||||||
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
|
||||||
|
|
@ -226,12 +243,14 @@ extern "C" {
|
||||||
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||||
|
enum ggml_opt_optimizer_type optimizer, // sgd or adamw
|
||||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||||
int64_t nepoch, // how many times the dataset should be iterated over
|
int64_t nepoch, // how many times the dataset should be iterated over
|
||||||
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
|
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
|
||||||
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
|
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
|
||||||
bool silent); // whether or not info prints to stderr should be suppressed
|
bool silent); // whether or not info prints to stderr should be suppressed
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
|
||||||
|
|
||||||
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
@ -241,7 +241,16 @@
|
||||||
#define GGML_ROPE_TYPE_MROPE 8
|
#define GGML_ROPE_TYPE_MROPE 8
|
||||||
#define GGML_ROPE_TYPE_VISION 24
|
#define GGML_ROPE_TYPE_VISION 24
|
||||||
|
|
||||||
|
#define GGML_MROPE_SECTIONS 4
|
||||||
|
|
||||||
#define GGML_UNUSED(x) (void)(x)
|
#define GGML_UNUSED(x) (void)(x)
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
template<typename... Args>
|
||||||
|
__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {}
|
||||||
|
#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__)
|
||||||
|
#else
|
||||||
|
#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0)
|
||||||
|
#endif // __CUDACC__
|
||||||
|
|
||||||
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
|
||||||
|
|
||||||
|
|
@ -502,7 +511,9 @@ extern "C" {
|
||||||
GGML_OP_CONV_TRANSPOSE_1D,
|
GGML_OP_CONV_TRANSPOSE_1D,
|
||||||
GGML_OP_IM2COL,
|
GGML_OP_IM2COL,
|
||||||
GGML_OP_IM2COL_BACK,
|
GGML_OP_IM2COL_BACK,
|
||||||
|
GGML_OP_IM2COL_3D,
|
||||||
GGML_OP_CONV_2D,
|
GGML_OP_CONV_2D,
|
||||||
|
GGML_OP_CONV_3D,
|
||||||
GGML_OP_CONV_2D_DW,
|
GGML_OP_CONV_2D_DW,
|
||||||
GGML_OP_CONV_TRANSPOSE_2D,
|
GGML_OP_CONV_TRANSPOSE_2D,
|
||||||
GGML_OP_POOL_1D,
|
GGML_OP_POOL_1D,
|
||||||
|
|
@ -540,6 +551,7 @@ extern "C" {
|
||||||
GGML_OP_CROSS_ENTROPY_LOSS,
|
GGML_OP_CROSS_ENTROPY_LOSS,
|
||||||
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
|
||||||
GGML_OP_OPT_STEP_ADAMW,
|
GGML_OP_OPT_STEP_ADAMW,
|
||||||
|
GGML_OP_OPT_STEP_SGD,
|
||||||
|
|
||||||
GGML_OP_GLU,
|
GGML_OP_GLU,
|
||||||
|
|
||||||
|
|
@ -1660,7 +1672,7 @@ extern "C" {
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
struct ggml_tensor * c,
|
struct ggml_tensor * c,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
int sections[4],
|
int sections[GGML_MROPE_SECTIONS],
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx_orig,
|
int n_ctx_orig,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
|
|
@ -1686,6 +1698,22 @@ extern "C" {
|
||||||
float beta_fast,
|
float beta_fast,
|
||||||
float beta_slow);
|
float beta_slow);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
|
int n_dims,
|
||||||
|
int sections[GGML_MROPE_SECTIONS],
|
||||||
|
int mode,
|
||||||
|
int n_ctx_orig,
|
||||||
|
float freq_base,
|
||||||
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow);
|
||||||
|
|
||||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
|
@ -1843,6 +1871,41 @@ extern "C" {
|
||||||
int d0, // dilation dimension 0
|
int d0, // dilation dimension 0
|
||||||
int d1); // dilation dimension 1
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_im2col_3d(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
int64_t IC,
|
||||||
|
int s0, // stride width
|
||||||
|
int s1, // stride height
|
||||||
|
int s2, // stride depth
|
||||||
|
int p0, // padding width
|
||||||
|
int p1, // padding height
|
||||||
|
int p2, // padding depth
|
||||||
|
int d0, // dilation width
|
||||||
|
int d1, // dilation height
|
||||||
|
int d2, // dilation depth
|
||||||
|
enum ggml_type dst_type);
|
||||||
|
|
||||||
|
// a: [OC*IC, KD, KH, KW]
|
||||||
|
// b: [N*IC, ID, IH, IW]
|
||||||
|
// result: [N*OC, OD, OH, OW]
|
||||||
|
GGML_API struct ggml_tensor * ggml_conv_3d(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
int64_t IC,
|
||||||
|
int s0, // stride width
|
||||||
|
int s1, // stride height
|
||||||
|
int s2, // stride depth
|
||||||
|
int p0, // padding width
|
||||||
|
int p1, // padding height
|
||||||
|
int p2, // padding depth
|
||||||
|
int d0, // dilation width
|
||||||
|
int d1, // dilation height
|
||||||
|
int d2 // dilation depth
|
||||||
|
);
|
||||||
|
|
||||||
// kernel size is a->ne[0] x a->ne[1]
|
// kernel size is a->ne[0] x a->ne[1]
|
||||||
// stride is equal to kernel size
|
// stride is equal to kernel size
|
||||||
// padding is zero
|
// padding is zero
|
||||||
|
|
@ -1914,6 +1977,23 @@ extern "C" {
|
||||||
int d0, // dilation dimension 0
|
int d0, // dilation dimension 0
|
||||||
int d1); // dilation dimension 1
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
||||||
|
struct ggml_tensor * b, // input [W, H, D, C * N]
|
||||||
|
int s0, // stride
|
||||||
|
int s1,
|
||||||
|
int s2,
|
||||||
|
int p0, // padding
|
||||||
|
int p1,
|
||||||
|
int p2,
|
||||||
|
int d0, // dilation
|
||||||
|
int d1,
|
||||||
|
int d2,
|
||||||
|
int n_channels,
|
||||||
|
int n_batch,
|
||||||
|
int n_channels_out);
|
||||||
|
|
||||||
enum ggml_op_pool {
|
enum ggml_op_pool {
|
||||||
GGML_OP_POOL_MAX,
|
GGML_OP_POOL_MAX,
|
||||||
GGML_OP_POOL_AVG,
|
GGML_OP_POOL_AVG,
|
||||||
|
|
@ -2004,6 +2084,19 @@ extern "C" {
|
||||||
int p2,
|
int p2,
|
||||||
int p3);
|
int p3);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_pad_ext(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int lp0,
|
||||||
|
int rp0,
|
||||||
|
int lp1,
|
||||||
|
int rp1,
|
||||||
|
int lp2,
|
||||||
|
int rp2,
|
||||||
|
int lp3,
|
||||||
|
int rp3
|
||||||
|
);
|
||||||
|
|
||||||
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
||||||
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
|
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
|
@ -2293,7 +2386,14 @@ extern "C" {
|
||||||
struct ggml_tensor * grad,
|
struct ggml_tensor * grad,
|
||||||
struct ggml_tensor * m,
|
struct ggml_tensor * m,
|
||||||
struct ggml_tensor * v,
|
struct ggml_tensor * v,
|
||||||
struct ggml_tensor * adamw_params); // parameters such a the learning rate
|
struct ggml_tensor * adamw_params); // parameters such as the learning rate
|
||||||
|
|
||||||
|
// stochastic gradient descent step (with weight decay)
|
||||||
|
GGML_API struct ggml_tensor * ggml_opt_step_sgd(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * grad,
|
||||||
|
struct ggml_tensor * sgd_params); // alpha, weight decay
|
||||||
|
|
||||||
//
|
//
|
||||||
// automatic differentiation
|
// automatic differentiation
|
||||||
|
|
|
||||||
|
|
@ -382,6 +382,7 @@ ggml_add_backend(RPC)
|
||||||
ggml_add_backend(SYCL)
|
ggml_add_backend(SYCL)
|
||||||
ggml_add_backend(Vulkan)
|
ggml_add_backend(Vulkan)
|
||||||
ggml_add_backend(WebGPU)
|
ggml_add_backend(WebGPU)
|
||||||
|
ggml_add_backend(zDNN)
|
||||||
ggml_add_backend(OpenCL)
|
ggml_add_backend(OpenCL)
|
||||||
|
|
||||||
foreach (target ggml-base ggml)
|
foreach (target ggml-base ggml)
|
||||||
|
|
|
||||||
|
|
@ -49,6 +49,10 @@
|
||||||
#include "ggml-webgpu.h"
|
#include "ggml-webgpu.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_ZDNN
|
||||||
|
#include "ggml-zdnn.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_OPENCL
|
#ifdef GGML_USE_OPENCL
|
||||||
#include "ggml-opencl.h"
|
#include "ggml-opencl.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -180,6 +184,9 @@ struct ggml_backend_registry {
|
||||||
#ifdef GGML_USE_WEBGPU
|
#ifdef GGML_USE_WEBGPU
|
||||||
register_backend(ggml_backend_webgpu_reg());
|
register_backend(ggml_backend_webgpu_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_ZDNN
|
||||||
|
register_backend(ggml_backend_zdnn_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_OPENCL
|
#ifdef GGML_USE_OPENCL
|
||||||
register_backend(ggml_backend_opencl_reg());
|
register_backend(ggml_backend_opencl_reg());
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -19,9 +19,8 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
|
@ -32,6 +31,7 @@
|
||||||
// backend buffer type
|
// backend buffer type
|
||||||
|
|
||||||
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
return buft->iface.get_name(buft);
|
return buft->iface.get_name(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -41,14 +41,17 @@ ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t
|
||||||
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(buft);
|
||||||
return buft->iface.alloc_buffer(buft, size);
|
return buft->iface.alloc_buffer(buft, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
return buft->iface.get_alignment(buft);
|
return buft->iface.get_alignment(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
// get_max_size is optional, defaults to SIZE_MAX
|
// get_max_size is optional, defaults to SIZE_MAX
|
||||||
if (buft->iface.get_max_size) {
|
if (buft->iface.get_max_size) {
|
||||||
return buft->iface.get_max_size(buft);
|
return buft->iface.get_max_size(buft);
|
||||||
|
|
@ -57,6 +60,7 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||||
if (buft->iface.get_alloc_size) {
|
if (buft->iface.get_alloc_size) {
|
||||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||||
|
|
@ -67,6 +71,7 @@ size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const s
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
if (buft->iface.is_host) {
|
if (buft->iface.is_host) {
|
||||||
return buft->iface.is_host(buft);
|
return buft->iface.is_host(buft);
|
||||||
}
|
}
|
||||||
|
|
@ -74,6 +79,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
ggml_backend_dev_t ggml_backend_buft_get_device(ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(buft);
|
||||||
return buft->device;
|
return buft->device;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -111,10 +117,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->size;
|
return buffer->size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// get_base is optional if the buffer is zero-sized
|
// get_base is optional if the buffer is zero-sized
|
||||||
if (buffer->size == 0) {
|
if (buffer->size == 0) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
@ -128,6 +136,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// init_tensor is optional
|
// init_tensor is optional
|
||||||
if (buffer->iface.init_tensor) {
|
if (buffer->iface.init_tensor) {
|
||||||
return buffer->iface.init_tensor(buffer, tensor);
|
return buffer->iface.init_tensor(buffer, tensor);
|
||||||
|
|
@ -136,6 +145,7 @@ enum ggml_status ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, s
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
// clear is optional if the buffer is zero-sized
|
// clear is optional if the buffer is zero-sized
|
||||||
if (buffer->size == 0) {
|
if (buffer->size == 0) {
|
||||||
return;
|
return;
|
||||||
|
|
@ -161,6 +171,7 @@ bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
buffer->usage = usage;
|
buffer->usage = usage;
|
||||||
|
|
||||||
// FIXME: add a generic callback to the buffer interface
|
// FIXME: add a generic callback to the buffer interface
|
||||||
|
|
@ -170,14 +181,17 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->usage;
|
return buffer->usage;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->buft;
|
return buffer->buft;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
if (buffer->iface.reset) {
|
if (buffer->iface.reset) {
|
||||||
buffer->iface.reset(buffer);
|
buffer->iface.reset(buffer);
|
||||||
}
|
}
|
||||||
|
|
@ -216,6 +230,7 @@ void ggml_backend_free(ggml_backend_t backend) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return ggml_backend_dev_buffer_type(backend->device);
|
return ggml_backend_dev_buffer_type(backend->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -232,6 +247,8 @@ size_t ggml_backend_get_max_size(ggml_backend_t backend) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||||
|
|
||||||
|
|
@ -243,6 +260,8 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor *
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||||
|
|
||||||
|
|
@ -284,6 +303,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||||
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
|
|
@ -299,6 +319,7 @@ void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_synchronize(ggml_backend_t backend) {
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
if (backend->iface.synchronize == NULL) {
|
if (backend->iface.synchronize == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -307,18 +328,21 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
|
||||||
|
|
||||||
return backend->iface.graph_plan_create(backend, cgraph);
|
return backend->iface.graph_plan_create(backend, cgraph);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
GGML_ASSERT(backend->iface.graph_plan_free != NULL);
|
||||||
|
|
||||||
backend->iface.graph_plan_free(backend, plan);
|
backend->iface.graph_plan_free(backend, plan);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
|
||||||
|
|
||||||
return backend->iface.graph_plan_compute(backend, plan);
|
return backend->iface.graph_plan_compute(backend, plan);
|
||||||
|
|
@ -331,22 +355,27 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return backend->iface.graph_compute(backend, cgraph);
|
return backend->iface.graph_compute(backend, cgraph);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return ggml_backend_dev_supports_op(backend->device, op);
|
return ggml_backend_dev_supports_op(backend->device, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return ggml_backend_dev_supports_buft(backend->device, buft);
|
return ggml_backend_dev_supports_buft(backend->device, buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return ggml_backend_dev_offload_op(backend->device, op);
|
return ggml_backend_dev_offload_op(backend->device, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
return backend->device;
|
return backend->device;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -382,6 +411,7 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(backend_dst);
|
||||||
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
if (backend_dst->iface.cpy_tensor_async != NULL) {
|
||||||
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
|
||||||
return;
|
return;
|
||||||
|
|
@ -413,18 +443,21 @@ void ggml_backend_event_free(ggml_backend_event_t event) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
GGML_ASSERT(backend->iface.event_record != NULL);
|
GGML_ASSERT(backend->iface.event_record != NULL);
|
||||||
|
|
||||||
backend->iface.event_record(backend, event);
|
backend->iface.event_record(backend, event);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
void ggml_backend_event_synchronize(ggml_backend_event_t event) {
|
||||||
|
GGML_ASSERT(event);
|
||||||
GGML_ASSERT(event->device->iface.event_synchronize);
|
GGML_ASSERT(event->device->iface.event_synchronize);
|
||||||
|
|
||||||
event->device->iface.event_synchronize(event->device, event);
|
event->device->iface.event_synchronize(event->device, event);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||||
|
GGML_ASSERT(backend);
|
||||||
GGML_ASSERT(backend->iface.event_wait != NULL);
|
GGML_ASSERT(backend->iface.event_wait != NULL);
|
||||||
|
|
||||||
backend->iface.event_wait(backend, event);
|
backend->iface.event_wait(backend, event);
|
||||||
|
|
@ -433,18 +466,22 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
|
||||||
// Backend device
|
// Backend device
|
||||||
|
|
||||||
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.get_name(device);
|
return device->iface.get_name(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
const char * ggml_backend_dev_description(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.get_description(device);
|
return device->iface.get_description(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
device->iface.get_memory(device, free, total);
|
device->iface.get_memory(device, free, total);
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.get_type(device);
|
return device->iface.get_type(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -454,18 +491,22 @@ void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_d
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->reg;
|
return device->reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.init_backend(device, params);
|
return device->iface.init_backend(device, params);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.get_buffer_type(device);
|
return device->iface.get_buffer_type(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
if (device->iface.get_host_buffer_type == NULL) {
|
if (device->iface.get_host_buffer_type == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
@ -474,18 +515,22 @@ ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
return device->iface.buffer_from_host_ptr(device, ptr, size, max_tensor_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.supports_op(device, op);
|
return device->iface.supports_op(device, op);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
return device->iface.supports_buft(device, buft);
|
return device->iface.supports_buft(device, buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op) {
|
||||||
|
GGML_ASSERT(device);
|
||||||
if (device->iface.offload_op != NULL) {
|
if (device->iface.offload_op != NULL) {
|
||||||
return device->iface.offload_op(device, op);
|
return device->iface.offload_op(device, op);
|
||||||
}
|
}
|
||||||
|
|
@ -496,18 +541,22 @@ bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_te
|
||||||
// Backend (reg)
|
// Backend (reg)
|
||||||
|
|
||||||
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
const char * ggml_backend_reg_name(ggml_backend_reg_t reg) {
|
||||||
|
GGML_ASSERT(reg);
|
||||||
return reg->iface.get_name(reg);
|
return reg->iface.get_name(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg) {
|
||||||
|
GGML_ASSERT(reg);
|
||||||
return reg->iface.get_device_count(reg);
|
return reg->iface.get_device_count(reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index) {
|
||||||
|
GGML_ASSERT(reg);
|
||||||
return reg->iface.get_device(reg, index);
|
return reg->iface.get_device(reg, index);
|
||||||
}
|
}
|
||||||
|
|
||||||
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||||
|
GGML_ASSERT(reg);
|
||||||
if (!reg->iface.get_proc_address) {
|
if (!reg->iface.get_proc_address) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
@ -522,6 +571,7 @@ struct ggml_backend_multi_buffer_context {
|
||||||
};
|
};
|
||||||
|
|
||||||
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||||
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
ggml_backend_buffer_free(ctx->buffers[i]);
|
ggml_backend_buffer_free(ctx->buffers[i]);
|
||||||
|
|
@ -532,6 +582,7 @@ static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||||
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
ggml_backend_buffer_clear(ctx->buffers[i], value);
|
||||||
|
|
@ -567,10 +618,12 @@ ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
|
||||||
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
|
||||||
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
for (size_t i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
|
@ -598,7 +651,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
|
||||||
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
|
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef GGML_SCHED_MAX_COPIES
|
#ifndef GGML_SCHED_MAX_COPIES
|
||||||
|
|
@ -849,7 +902,7 @@ static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, stru
|
||||||
}
|
}
|
||||||
|
|
||||||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||||
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
// reset splits
|
// reset splits
|
||||||
sched->n_splits = 0;
|
sched->n_splits = 0;
|
||||||
sched->n_graph_inputs = 0;
|
sched->n_graph_inputs = 0;
|
||||||
|
|
@ -1071,6 +1124,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if the node is still unassigned, assign it to the first backend that supports it
|
||||||
|
for (int b = 0; b < sched->n_backends && *cur_backend_id == -1; b++) {
|
||||||
|
ggml_backend_sched_set_if_supported(sched, node, b, cur_backend_id);
|
||||||
|
}
|
||||||
|
GGML_ASSERT(*cur_backend_id != -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// pass 5: split graph, find tensors that need to be copied
|
// pass 5: split graph, find tensors that need to be copied
|
||||||
|
|
@ -1098,7 +1156,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
|
|
||||||
const int node_backend_id = tensor_backend_id(node);
|
const int node_backend_id = tensor_backend_id(node);
|
||||||
|
|
||||||
assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback
|
||||||
|
|
||||||
// check if we should start a new split based on the sources of the current node
|
// check if we should start a new split based on the sources of the current node
|
||||||
bool need_new_split = false;
|
bool need_new_split = false;
|
||||||
|
|
@ -1156,7 +1214,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
|
|
||||||
size_t src_id = hash_id(src);
|
size_t src_id = hash_id(src);
|
||||||
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
|
||||||
assert(src_backend_id != -1); // all inputs should be assigned by now
|
GGML_ASSERT(src_backend_id != -1); // all inputs should be assigned by now
|
||||||
|
|
||||||
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
|
||||||
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
|
||||||
|
|
@ -1345,17 +1403,22 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
struct ggml_backend_sched_split * splits = sched->splits;
|
struct ggml_backend_sched_split * splits = sched->splits;
|
||||||
|
|
||||||
for (int i = 0; i < sched->n_splits; i++) {
|
ggml_tensor * prev_ids_tensor = nullptr;
|
||||||
struct ggml_backend_sched_split * split = &splits[i];
|
std::vector<int32_t> ids;
|
||||||
|
std::vector<ggml_bitset_t> used_ids;
|
||||||
|
|
||||||
|
for (int split_id = 0; split_id < sched->n_splits; split_id++) {
|
||||||
|
struct ggml_backend_sched_split * split = &splits[split_id];
|
||||||
int split_backend_id = split->backend_id;
|
int split_backend_id = split->backend_id;
|
||||||
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
ggml_backend_t split_backend = sched->backends[split_backend_id];
|
||||||
|
|
||||||
// copy the input tensors to the split backend
|
// copy the input tensors to the split backend
|
||||||
for (int j = 0; j < split->n_inputs; j++) {
|
for (int input_id = 0; input_id < split->n_inputs; input_id++) {
|
||||||
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
|
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]);
|
||||||
struct ggml_tensor * input = split->inputs[j];
|
struct ggml_tensor * input = split->inputs[input_id];
|
||||||
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
|
||||||
|
|
||||||
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
if (input->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||||
|
|
@ -1373,6 +1436,93 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_synchronize(split_backend);
|
ggml_backend_synchronize(split_backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
|
||||||
|
ggml_tensor * node = split->graph.nodes[0];
|
||||||
|
if (split->graph.n_nodes > 0 &&
|
||||||
|
ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
|
||||||
|
ggml_backend_buffer_is_host(input->buffer) && (
|
||||||
|
(node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
|
||||||
|
//|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
|
||||||
|
)) {
|
||||||
|
|
||||||
|
const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
|
||||||
|
const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];
|
||||||
|
|
||||||
|
ggml_backend_synchronize(input_backend);
|
||||||
|
|
||||||
|
// get the ids
|
||||||
|
ggml_tensor * ids_tensor = node->src[2];
|
||||||
|
ggml_backend_t ids_backend = split_backend;
|
||||||
|
|
||||||
|
// if the ids tensor is also an input of the split, it may not have been copied yet to the split backend
|
||||||
|
// in that case, we use the original ids tensor
|
||||||
|
for (int i = input_id + 1; i < split->n_inputs; i++) {
|
||||||
|
if (ids_tensor == tensor_copy(split->inputs[i], split_backend_id, sched->cur_copy)) {
|
||||||
|
ids_tensor = split->inputs[i];
|
||||||
|
ids_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[i]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ids_tensor != prev_ids_tensor) {
|
||||||
|
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
|
||||||
|
ggml_backend_tensor_get_async(ids_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
|
||||||
|
ggml_backend_synchronize(ids_backend);
|
||||||
|
|
||||||
|
// find the used experts
|
||||||
|
used_ids.clear();
|
||||||
|
used_ids.resize(ggml_bitset_size(n_expert));
|
||||||
|
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
|
||||||
|
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
|
||||||
|
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
|
||||||
|
GGML_ASSERT(id >= 0 && id < n_expert);
|
||||||
|
ggml_bitset_set(used_ids.data(), id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prev_ids_tensor = ids_tensor;
|
||||||
|
}
|
||||||
|
|
||||||
|
// group consecutive experts and copy them together
|
||||||
|
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
|
||||||
|
const size_t expert_offset = first_id * expert_size;
|
||||||
|
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
|
||||||
|
const size_t padding = std::min<size_t>(expert_size, 512);
|
||||||
|
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
|
||||||
|
|
||||||
|
ggml_backend_tensor_set_async(split_backend,
|
||||||
|
input_cpy,
|
||||||
|
(const uint8_t *)input->data + expert_offset, expert_offset,
|
||||||
|
// copy a bit extra at the to ensure there are no NaNs in the padding of the last expert
|
||||||
|
// this is necessary for MMQ in the CUDA backend
|
||||||
|
expert_size_copy + padding_end);
|
||||||
|
};
|
||||||
|
|
||||||
|
int id = 0;
|
||||||
|
while (!ggml_bitset_get(used_ids.data(), id)) {
|
||||||
|
id++;
|
||||||
|
}
|
||||||
|
int32_t first_id = id;
|
||||||
|
int32_t last_id = first_id;
|
||||||
|
|
||||||
|
for (++id; id < n_expert; ++id) {
|
||||||
|
if (!ggml_bitset_get(used_ids.data(), id)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (id == last_id + 1) {
|
||||||
|
last_id = id;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_experts(first_id, last_id);
|
||||||
|
|
||||||
|
first_id = id;
|
||||||
|
last_id = id;
|
||||||
|
}
|
||||||
|
copy_experts(first_id, last_id);
|
||||||
|
} else {
|
||||||
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
|
||||||
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
|
||||||
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
|
||||||
|
|
@ -1386,6 +1536,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!sched->callback_eval) {
|
if (!sched->callback_eval) {
|
||||||
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
|
||||||
|
|
@ -1521,6 +1672,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
// reset state for the next run
|
// reset state for the next run
|
||||||
if (!sched->is_reset) {
|
if (!sched->is_reset) {
|
||||||
ggml_hash_set_reset(&sched->hash_set);
|
ggml_hash_set_reset(&sched->hash_set);
|
||||||
|
|
@ -1532,8 +1684,11 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
|
||||||
|
|
||||||
|
ggml_backend_sched_reset(sched);
|
||||||
|
|
||||||
ggml_backend_sched_synchronize(sched);
|
ggml_backend_sched_synchronize(sched);
|
||||||
|
|
||||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||||
|
|
@ -1548,6 +1703,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
|
||||||
GGML_ASSERT(!sched->is_alloc);
|
GGML_ASSERT(!sched->is_alloc);
|
||||||
|
|
||||||
|
|
@ -1572,6 +1728,7 @@ enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, st
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
if (!sched->is_reset && !sched->is_alloc) {
|
if (!sched->is_reset && !sched->is_alloc) {
|
||||||
ggml_backend_sched_reset(sched);
|
ggml_backend_sched_reset(sched);
|
||||||
}
|
}
|
||||||
|
|
@ -1586,6 +1743,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sch
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
for (int i = 0; i < sched->n_backends; i++) {
|
for (int i = 0; i < sched->n_backends; i++) {
|
||||||
ggml_backend_synchronize(sched->backends[i]);
|
ggml_backend_synchronize(sched->backends[i]);
|
||||||
}
|
}
|
||||||
|
|
@ -1598,28 +1756,34 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
sched->callback_eval = callback;
|
sched->callback_eval = callback;
|
||||||
sched->callback_eval_user_data = user_data;
|
sched->callback_eval_user_data = user_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
return sched->n_splits;
|
return sched->n_splits;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
return sched->n_copies;
|
return sched->n_copies;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
return sched->n_backends;
|
return sched->n_backends;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
GGML_ASSERT(i >= 0 && i < sched->n_backends);
|
||||||
return sched->backends[i];
|
return sched->backends[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
|
|
||||||
|
|
@ -1627,6 +1791,7 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
int backend_index = ggml_backend_sched_backend_id(sched, backend);
|
||||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||||
tensor_backend_id(node) = backend_index;
|
tensor_backend_id(node) = backend_index;
|
||||||
|
|
@ -1635,6 +1800,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
|
||||||
|
GGML_ASSERT(sched);
|
||||||
int backend_index = tensor_backend_id(node);
|
int backend_index = tensor_backend_id(node);
|
||||||
if (backend_index == -1) {
|
if (backend_index == -1) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
@ -1645,6 +1811,7 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
|
|
@ -1656,6 +1823,7 @@ enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||||
}
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->data == NULL);
|
GGML_ASSERT(tensor->data == NULL);
|
||||||
GGML_ASSERT(tensor->view_src == NULL);
|
GGML_ASSERT(tensor->view_src == NULL);
|
||||||
|
|
@ -1729,6 +1897,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
||||||
|
GGML_ASSERT(graph);
|
||||||
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
|
||||||
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
||||||
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0]));
|
||||||
|
|
@ -1873,6 +2042,7 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
||||||
// CPU backend - buffer
|
// CPU backend - buffer
|
||||||
|
|
||||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
uintptr_t data = (uintptr_t)buffer->context;
|
uintptr_t data = (uintptr_t)buffer->context;
|
||||||
|
|
||||||
// align the buffer
|
// align the buffer
|
||||||
|
|
@ -1884,28 +2054,33 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
ggml_aligned_free(buffer->context, buffer->size);
|
ggml_aligned_free(buffer->context, buffer->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
memset((char *)tensor->data + offset, value, size);
|
memset((char *)tensor->data + offset, value, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
|
GGML_ASSERT(tensor);
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
|
GGML_ASSERT(src);
|
||||||
if (ggml_backend_buffer_is_host(src->buffer)) {
|
if (ggml_backend_buffer_is_host(src->buffer)) {
|
||||||
memcpy(dst->data, src->data, ggml_nbytes(src));
|
memcpy(dst->data, src->data, ggml_nbytes(src));
|
||||||
return true;
|
return true;
|
||||||
|
|
@ -1916,6 +2091,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
GGML_ASSERT(buffer);
|
||||||
memset(buffer->context, value, buffer->size);
|
memset(buffer->context, value, buffer->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -282,7 +282,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
||||||
|
|
||||||
ggml_backend_t backend = new ggml_backend {
|
ggml_backend_t backend = new ggml_backend {
|
||||||
/* .guid = */ ggml_backend_blas_guid(),
|
/* .guid = */ ggml_backend_blas_guid(),
|
||||||
/* .interface = */ blas_backend_i,
|
/* .iface = */ blas_backend_i,
|
||||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,13 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
||||||
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
||||||
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
||||||
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
||||||
|
option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
|
||||||
|
|
||||||
|
if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
|
||||||
|
message(FATAL_ERROR
|
||||||
|
"CANN Graph (ACL graph mode) is not supported on 310P devices. "
|
||||||
|
"Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (CANN_INSTALL_DIR)
|
if (CANN_INSTALL_DIR)
|
||||||
# Only Support Linux.
|
# Only Support Linux.
|
||||||
|
|
@ -68,6 +75,13 @@ if (CANN_INSTALL_DIR)
|
||||||
|
|
||||||
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||||
|
|
||||||
|
if (USE_ACL_GRAPH)
|
||||||
|
target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
|
||||||
|
message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
|
||||||
|
else()
|
||||||
|
message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
|
||||||
|
endif()
|
||||||
|
|
||||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||||
else()
|
else()
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue