Merge branch 'ggml-org:master' into set-default-subgroup-for-intel

This commit is contained in:
Masato Nakasaka 2025-12-04 17:51:41 +09:00 committed by GitHub
commit 565557ca09
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
358 changed files with 33418 additions and 11979 deletions

View File

@ -50,6 +50,7 @@ WORKDIR /app
RUN apt-get update \
&& apt-get install -y \
build-essential \
git \
python3 \
python3-pip \

View File

@ -1,120 +0,0 @@
name: Build on RISCV Linux Machine by Cloud-V
on:
pull_request:
workflow_dispatch:
workflow_call:
jobs:
debian-13-riscv64-native: # Bianbu 2.2
runs-on: [self-hosted, RISCV64]
steps:
- name: Install prerequisites
run: |
sudo apt-get update || true
sudo apt-get install -y libatomic1
- uses: actions/checkout@v4
- name: Setup Riscv
run: |
sudo apt-get update || true
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-riscv64-linux-gnu \
g++-14-riscv64-linux-gnu \
ccache \
cmake
- name: Setup ccache
run: |
mkdir -p $HOME/.ccache
ccache -M 5G -d $HOME/.ccache
export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
echo "$GITHUB_WORKSPACE"
echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
- name: Build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
# debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
# runs-on: [self-hosted, RISCV64]
# steps:
# - name: Install prerequisites
# run: |
# sudo apt-get update || true
# sudo apt-get install -y libatomic1
# - uses: actions/checkout@v4
# - name: Setup Riscv
# run: |
# sudo apt-get update || true
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# gcc-14-riscv64-linux-gnu \
# g++-14-riscv64-linux-gnu \
# ccache \
# cmake
# sudo apt-get upgrade binutils -y
# - name: Setup ccache
# run: |
# mkdir -p $HOME/.ccache
# ccache -M 5G -d $HOME/.ccache
# export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
# export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
# echo "$GITHUB_WORKSPACE"
# echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
# echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
# echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
# echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
# - name: Build
# run: |
# cmake -B build \
# -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
# -DCMAKE_C_COMPILER_LAUNCHER=ccache \
# -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
# -DGGML_RVV=ON \
# -DGGML_RV_ZFH=ON \
# -DGGML_RV_ZICBOP=ON \
# -DGGML_CPU_RISCV64_SPACEMIT=ON \
# -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
# cmake --build build --config Release -j $(nproc)

View File

@ -547,6 +547,46 @@ jobs:
# This is using llvmpipe and runs slower than other backends
ctest -L main --verbose --timeout 3600
ubuntu-24-wasm-webgpu:
runs-on: ubuntu-24.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: ubuntu-latest-wasm-webgpu
evict-old-files: 1d
- name: Install Emscripten
run: |
git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
./emsdk install latest
./emsdk activate latest
- name: Fetch emdawnwebgpu
run: |
DAWN_TAG="v20251027.212519"
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
echo "Downloading ${EMDAWN_PKG}"
curl -L -o emdawn.zip \
"https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
unzip emdawn.zip
- name: Build WASM WebGPU
run: |
source emsdk/emsdk_env.sh
emcmake cmake -B build-wasm \
-DGGML_WEBGPU=ON \
-DLLAMA_CURL=OFF \
-DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
cmake --build build-wasm --target test-backend-ops -j $(nproc)
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.1.2
@ -1642,6 +1682,337 @@ jobs:
run: |
GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
ubuntu-cpu-cmake-riscv64-native:
runs-on: RISCV64
steps:
- name: Install dependencies
run: |
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Check environment
run: |
uname -a
gcc --version
g++ --version
ldd --version
cmake --version
rustc --version
- name: Setup ccache
run: |
# Set unique cache directory for this job
export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
mkdir -p "$CCACHE_DIR"
# Configure ccache for optimal performance
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
# Enable more aggressive caching
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
# Export for subsequent steps
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=ON \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DGGML_RPC=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L 'main|curl' --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-cmake-sanitizer-riscv64-native:
runs-on: RISCV64
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
steps:
- name: Install dependencies
run: |
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
- name: GCC version check
run: |
gcc --version
g++ --version
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Setup ccache
run: |
# Unique cache directory per matrix combination
export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
mkdir -p "$CCACHE_DIR"
# Configure ccache
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
# Export for subsequent steps
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=ON \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-llguidance-riscv64-native:
runs-on: RISCV64
steps:
- name: Install dependencies
run: |
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
- name: GCC version check
run: |
gcc --version
g++ --version
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Setup ccache
run: |
export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
mkdir -p "$CCACHE_DIR"
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_LLGUIDANCE=ON \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
ubuntu-cmake-rpc-riscv64-native:
runs-on: RISCV64
continue-on-error: true
steps:
- name: Install dependencies
run: |
sudo apt-get update
# Install necessary packages
sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
# Set gcc-14 and g++-14 as the default compilers
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
# Install Rust stable version
rustup install stable
rustup default stable
- name: GCC version check
run: |
gcc --version
g++ --version
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Setup ccache
run: |
export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
mkdir -p "$CCACHE_DIR"
ccache --set-config=max_size=5G
ccache --set-config=compression=true
ccache --set-config=compression_level=6
ccache --set-config=cache_dir="$CCACHE_DIR"
ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
ccache --set-config=hash_dir=false
echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
- name: Build
id: cmake_build
run: |
cmake -B build \
-DLLAMA_CURL=OFF \
-DLLAMA_OPENSSL=ON \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=ON \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-DGGML_RPC=ON
cmake --build build --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ggml-ci-arm64-graviton4-kleidiai:
runs-on: ah-ubuntu_22_04-c8g_8x

View File

@ -66,14 +66,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
name: llama-bin-macos-arm64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
name: llama-bin-macos-arm64.tar.gz
macOS-x64:
runs-on: macos-15-intel
@ -120,14 +127,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
name: llama-bin-macos-x64.tar.gz
ubuntu-22-cpu:
strategy:
matrix:
@ -182,14 +196,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
name: llama-bin-ubuntu-${{ matrix.build }}.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
ubuntu-22-vulkan:
runs-on: ubuntu-22.04
@ -235,14 +256,21 @@ jobs:
id: pack_artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz -C ./build/bin .
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
name: llama-bin-ubuntu-vulkan-x64.zip
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
name: llama-bin-ubuntu-vulkan-x64.tar.gz
windows-cpu:
runs-on: windows-2025
@ -298,7 +326,7 @@ jobs:
run: |
Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -380,7 +408,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -434,7 +462,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -526,7 +554,7 @@ jobs:
cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-bin-win-sycl-x64.zip ./build/bin/*
7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload the release package
uses: actions/upload-artifact@v4
@ -632,7 +660,7 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
- name: Upload artifacts
uses: actions/upload-artifact@v4
@ -685,58 +713,20 @@ jobs:
- name: Pack artifacts
id: pack_artifacts
run: |
zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework
- name: Upload artifacts
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
name: llama-${{ steps.tag.outputs.name }}-xcframework
name: llama-${{ steps.tag.outputs.name }}-xcframework.zip
openEuler-cann:
strategy:
matrix:
arch: [x86, aarch64]
chip_type: ['910b', '310p']
build: ['Release']
runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Dependencies
run: |
yum update -y
yum install -y git gcc gcc-c++ make cmake libcurl-devel
git config --global --add safe.directory "$GITHUB_WORKSPACE"
- name: Build
run: |
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-DGGML_CANN=on \
-DSOC_TYPE=ascend${{ matrix.chip_type }}
cmake --build build -j $(nproc)
- name: Determine tag name
id: tag
uses: ./.github/actions/get-tag-name
- name: Pack artifacts
run: |
cp LICENSE ./build/bin/
zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
- name: Upload artifacts
- name: Upload artifacts (tar)
uses: actions/upload-artifact@v4
with:
path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -759,7 +749,6 @@ jobs:
- macOS-arm64
- macOS-x64
- ios-xcode-build
- openEuler-cann
steps:
- name: Clone
@ -814,6 +803,7 @@ jobs:
echo "Moving other artifacts..."
mv -v artifact/*.zip release
mv -v artifact/*.tar.gz release
- name: Create release
id: create_release
@ -822,6 +812,33 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: ${{ steps.tag.outputs.name }}
body: |
> [!WARNING]
> **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
<details open>
${{ github.event.head_commit.message }}
</details>
**macOS/iOS:**
- [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
- [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
- [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
**Linux:**
- [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
- [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
- [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
**Windows:**
- [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
- [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
- [Windows x64 (CUDA)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
- [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
- [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
- [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)
- name: Upload release
id: upload_release
@ -833,7 +850,7 @@ jobs:
const fs = require('fs');
const release_id = '${{ steps.create_release.outputs.id }}';
for (let file of await fs.readdirSync('./release')) {
if (path.extname(file) === '.zip') {
if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
console.log('uploadReleaseAsset', file);
await github.repos.uploadReleaseAsset({
owner: context.repo.owner,

View File

@ -9,6 +9,7 @@ jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
if: ${{ github.repository.owner.login == 'ggml-org' }}
steps:
- name: Install cargo binstall

2
.gitignore vendored
View File

@ -134,3 +134,5 @@ poetry.toml
# IDE
/*.code-workspace
/.windsurf/
# emscripten
a.out.*

View File

@ -33,10 +33,24 @@ endif()
option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
# Use 64-bit memory to support backend_get_memory queries
# TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
if (LLAMA_WASM_MEM64)
add_compile_options("-sMEMORY64=1")
add_link_options("-sMEMORY64=1")
endif()
add_link_options("-sALLOW_MEMORY_GROWTH=1")
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
if (LLAMA_BUILD_HTML)
set(CMAKE_EXECUTABLE_SUFFIX ".html")
endif()
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -58,6 +72,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
if (LLAMA_STANDALONE)
# enable parallel builds for msbuild
list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# build the library
#

View File

@ -7,16 +7,19 @@
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov @ericcurtin
/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/peg-parser.* @aldehir
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
/common/unicode.* @aldehir
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
@ -87,8 +90,7 @@
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/* @ngxson @ggerganov # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov

View File

@ -19,6 +19,7 @@ The project differentiates between 3 levels of contributors:
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
# Pull requests (for maintainers)

View File

@ -613,3 +613,4 @@ $ echo "source ~/.llama-completion.bash" >> ~/.bashrc
- [linenoise.cpp](./tools/run/linenoise.cpp/linenoise.cpp) - C++ library that provides readline-like line editing capabilities, used by `llama-run` - BSD 2-Clause License
- [curl](https://curl.se/) - Client-side URL transfer library, used by various tools/examples - [CURL License](https://curl.se/docs/copyright.html)
- [miniaudio.h](https://github.com/mackron/miniaudio) - Single-header audio format decoder, used by multimodal subsystem - Public domain
- [subprocess.h](https://github.com/sheredom/subprocess.h) - Single-header process launching solution for C and C++ - Public domain

View File

@ -65,4 +65,6 @@ However, If you have discovered a security vulnerability in this project, please
Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report.
A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.

View File

@ -45,7 +45,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@ -428,10 +428,10 @@ function gg_run_qwen3_0_6b {
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@ -523,8 +523,8 @@ function gg_run_embd_bge_small {
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
set +e
}
@ -564,7 +564,7 @@ function gg_run_rerank_tiny {
model_f16="${path_models}/ggml-model-f16.gguf"
# for this model, the SEP token is "</s>"
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
# sample output
# rerank score 0: 0.029

View File

@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
chat-parser.h
chat-parser-xml-toolcall.h
chat-parser-xml-toolcall.cpp
chat-peg-parser.cpp
chat-peg-parser.h
chat.cpp
chat.h
common.cpp
@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
log.h
ngram-cache.cpp
ngram-cache.h
peg-parser.cpp
peg-parser.h
regex-partial.cpp
regex-partial.h
sampling.cpp
sampling.h
speculative.cpp
speculative.h
unicode.cpp
unicode.h
)
if (BUILD_SHARED_LIBS)

View File

@ -30,6 +30,7 @@
#include <thread> // for hardware_concurrency
#include <vector>
#ifndef __EMSCRIPTEN__
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
@ -41,6 +42,8 @@
#else
#include <sys/syslimits.h>
#endif
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
using json = nlohmann::ordered_json;
@ -212,13 +215,13 @@ struct handle_model_result {
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool offline) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
model.name = model.docker_repo; // set name for consistency
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
@ -227,7 +230,8 @@ static handle_model_result common_params_handle_model(
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.name = model.hf_repo; // repo name with tag
model.hf_repo = auto_detected.repo; // repo name without tag
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
@ -257,8 +261,6 @@ static handle_model_result common_params_handle_model(
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.path.empty()) {
model.path = model_path_default;
}
}
@ -405,7 +407,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@ -415,12 +417,18 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
common_params_handle_model(params.mmproj, params.hf_token, params.offline);
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
common_params_handle_model(params.speculative.model, params.hf_token, params.offline);
common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
}
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
throw std::invalid_argument("error: --model is required\n");
}
if (params.escape) {
@ -694,6 +702,12 @@ static bool is_autoy(const std::string & value) {
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// default values specific to example
// note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
if (ex == LLAMA_EXAMPLE_SERVER) {
params.use_jinja = true;
}
// load dynamic backends
ggml_backend_load_all();
@ -974,7 +988,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_SPLIT"));
).set_env("LLAMA_ARG_KV_UNIFIED"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@ -1215,7 +1229,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@ -2084,11 +2098,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-m", "--model"}, "FNAME",
ex == LLAMA_EXAMPLE_EXPORT_LORA
? std::string("model path from which to load base model")
: string_format(
"model path (default: `models/$filename` with filename from `--hf-file` "
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
),
? "model path from which to load base model"
: "model path to load",
[](common_params & params, const std::string & value) {
params.model.path = value;
}
@ -2480,19 +2491,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"path to save slot kv cache (default: disabled)",
[](common_params & params, const std::string & value) {
params.slot_save_path = value;
if (!fs_is_directory(params.slot_save_path)) {
throw std::invalid_argument("not a directory: " + value);
}
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
params.slot_save_path += DIRECTORY_SEPARATOR;
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--media-path"}, "PATH",
"directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
[](common_params & params, const std::string & value) {
params.media_path = value;
if (!fs_is_directory(params.media_path)) {
throw std::invalid_argument("not a directory: " + value);
}
// if doesn't end with DIRECTORY_SEPARATOR, add it
if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
params.media_path += DIRECTORY_SEPARATOR;
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--models-dir"}, "PATH",
"directory containing models for the router server (default: disabled)",
[](common_params & params, const std::string & value) {
params.models_dir = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--models-max"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
[](common_params & params, int value) {
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
[](common_params & params) {
params.models_autoload = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2626,7 +2682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params &, const std::string & value) {
common_log_set_file(common_log_main(), value.c_str());
}
));
).set_env("LLAMA_LOG_FILE"));
add_opt(common_arg(
{"--log-colors"}, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@ -2661,7 +2717,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_OFFLINE"));
add_opt(common_arg(
{"-lv", "--verbosity", "--log-verbosity"}, "N",
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
" - 0: generic output\n"
" - 1: error\n"
" - 2: warning\n"
" - 3: info\n"
" - 4: debug\n"
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);

File diff suppressed because it is too large Load Diff

114
common/chat-peg-parser.cpp Normal file
View File

@ -0,0 +1,114 @@
#include "chat-peg-parser.h"
#include <nlohmann/json.hpp>
using json = nlohmann::json;
static std::string_view trim_trailing_space(std::string_view sv) {
while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
sv.remove_suffix(1);
}
return sv;
}
void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
arena.visit(result, [this](const common_peg_ast_node & node) {
map(node);
});
}
void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
bool is_content = node.tag == common_chat_peg_builder::CONTENT;
if (is_reasoning) {
result.reasoning_content = std::string(trim_trailing_space(node.text));
}
if (is_content) {
result.content = std::string(trim_trailing_space(node.text));
}
}
void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
common_chat_peg_mapper::map(node);
bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
if (is_tool_open) {
result.tool_calls.emplace_back();
current_tool = &result.tool_calls.back();
}
if (is_tool_id && current_tool) {
current_tool->id = std::string(trim_trailing_space(node.text));
}
if (is_tool_name && current_tool) {
current_tool->name = std::string(trim_trailing_space(node.text));
}
if (is_tool_args && current_tool) {
current_tool->arguments = std::string(trim_trailing_space(node.text));
}
}
void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
common_chat_peg_mapper::map(node);
bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
if (is_tool_open) {
result.tool_calls.emplace_back();
current_tool = &result.tool_calls.back();
arg_count = 0;
}
if (is_tool_name) {
current_tool->name = std::string(node.text);
current_tool->arguments = "{";
}
if (is_arg_open) {
needs_closing_quote = false;
}
if (is_arg_name && current_tool) {
if (arg_count > 0) {
current_tool->arguments += ",";
}
current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
++arg_count;
}
if (is_arg_string && current_tool) {
// Serialize to JSON, but exclude the end quote
std::string dumped = json(node.text).dump();
current_tool->arguments += dumped.substr(0, dumped.size() - 1);
needs_closing_quote = true;
}
if (is_arg_close && current_tool) {
if (needs_closing_quote) {
current_tool->arguments += "\"";
}
}
if (is_arg_json && current_tool) {
current_tool->arguments += std::string(trim_trailing_space(node.text));
}
if (is_tool_close && current_tool) {
current_tool->arguments += "}";
}
}

105
common/chat-peg-parser.h Normal file
View File

@ -0,0 +1,105 @@
#pragma once
#include "chat.h"
#include "peg-parser.h"
class common_chat_peg_builder : public common_peg_parser_builder {
public:
static constexpr const char * REASONING_BLOCK = "reasoning-block";
static constexpr const char * REASONING = "reasoning";
static constexpr const char * CONTENT = "content";
common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
};
inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
common_chat_peg_builder builder;
builder.set_root(fn(builder));
return builder.build();
}
class common_chat_peg_mapper {
public:
common_chat_msg & result;
common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
virtual void map(const common_peg_ast_node & node);
};
class common_chat_peg_native_builder : public common_chat_peg_builder {
public:
static constexpr const char * TOOL = "tool";
static constexpr const char * TOOL_OPEN = "tool-open";
static constexpr const char * TOOL_CLOSE = "tool-close";
static constexpr const char * TOOL_ID = "tool-id";
static constexpr const char * TOOL_NAME = "tool-name";
static constexpr const char * TOOL_ARGS = "tool-args";
common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
};
class common_chat_peg_native_mapper : public common_chat_peg_mapper {
common_chat_tool_call * current_tool;
public:
common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
void map(const common_peg_ast_node & node) override;
};
inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
common_chat_peg_native_builder builder;
builder.set_root(fn(builder));
return builder.build();
}
class common_chat_peg_constructed_builder : public common_chat_peg_builder {
public:
static constexpr const char * TOOL = "tool";
static constexpr const char * TOOL_OPEN = "tool-open";
static constexpr const char * TOOL_CLOSE = "tool-close";
static constexpr const char * TOOL_NAME = "tool-name";
static constexpr const char * TOOL_ARG = "tool-arg";
static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
};
class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
common_chat_tool_call * current_tool;
int arg_count = 0;
bool needs_closing_quote = false;
public:
common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
void map(const common_peg_ast_node & node) override;
};
inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
common_chat_peg_constructed_builder builder;
builder.set_root(fn(builder));
return builder.build();
}

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,7 @@
#pragma once
#include "common.h"
#include "peg-parser.h"
#include <functional>
#include <chrono>
#include <string>
@ -76,7 +77,7 @@ struct common_chat_msg_diff {
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);
bool operator==(const common_chat_msg_diff & other) const {
return content_delta == other.content_delta
@ -124,6 +125,11 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_APRIEL_1_5,
COMMON_CHAT_FORMAT_XIAOMI_MIMO,
// These are intended to be parsed by the PEG parser
COMMON_CHAT_FORMAT_PEG_SIMPLE,
COMMON_CHAT_FORMAT_PEG_NATIVE,
COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@ -154,6 +160,7 @@ struct common_chat_params {
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<std::string> preserved_tokens;
std::vector<std::string> additional_stops;
std::string parser;
};
struct common_chat_syntax {
@ -163,6 +170,7 @@ struct common_chat_syntax {
bool reasoning_in_content = false;
bool thinking_forced_open = false;
bool parse_tool_calls = true;
common_peg_arena parser = {};
};
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@ -206,6 +214,7 @@ const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

View File

@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
bool fs_validate_filename(const std::string & filename) {
bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
if (!filename.length()) {
// Empty filename invalid
return false;
@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
|| (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
|| c == 0xFFFD // Replacement Character (UTF-8)
|| c == 0xFEFF // Byte Order Mark (BOM)
|| c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
|| c == ':' || c == '*' // Illegal characters
|| c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
return false;
}
if (!allow_subdirs && (c == '/' || c == '\\')) {
// Subdirectories not allowed, reject path separators
return false;
}
}
// Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
@ -859,6 +863,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
#endif // _WIN32
}
bool fs_is_directory(const std::string & path) {
std::filesystem::path dir(path);
return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
}
std::string fs_get_cache_directory() {
std::string cache_directory = "";
auto ensure_trailing_slash = [](std::string p) {
@ -893,6 +902,8 @@ std::string fs_get_cache_directory() {
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
cache_directory = std::getenv("LOCALAPPDATA");
#elif defined(__EMSCRIPTEN__)
GGML_ABORT("not implemented on this platform");
#else
# error Unknown architecture
#endif
@ -912,7 +923,7 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
std::vector<common_file_info> fs_list_files(const std::string & path) {
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories) {
std::vector<common_file_info> files;
if (path.empty()) return files;
@ -927,14 +938,22 @@ std::vector<common_file_info> fs_list_files(const std::string & path) {
const auto & p = entry.path();
if (std::filesystem::is_regular_file(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.path = p.string();
info.name = p.filename().string();
info.is_dir = false;
try {
info.size = static_cast<size_t>(std::filesystem::file_size(p));
} catch (const std::filesystem::filesystem_error &) {
info.size = 0;
}
files.push_back(std::move(info));
} else if (include_directories && std::filesystem::is_directory(p)) {
common_file_info info;
info.path = p.string();
info.name = p.filename().string();
info.size = 0; // Directories have no size
info.is_dir = true;
files.push_back(std::move(info));
}
} catch (const std::filesystem::filesystem_error &) {
// skip entries we cannot inspect

View File

@ -12,6 +12,10 @@
#include <vector>
#include <map>
#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0A00
#endif
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
@ -26,8 +30,6 @@
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();
@ -223,6 +225,7 @@ struct common_params_model {
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};
struct common_params_speculative {
@ -369,7 +372,7 @@ struct common_params {
std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t verbosity = 3; // LOG_LEVEL_INFO
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
bool offline = false;
@ -478,9 +481,15 @@ struct common_params {
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;
std::string slot_save_path;
std::string media_path; // path to directory for loading media files
float slot_prompt_similarity = 0.1f;
@ -631,8 +640,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
// Filesystem utils
//
bool fs_validate_filename(const std::string & filename);
bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
bool fs_create_directory_with_parents(const std::string & path);
bool fs_is_directory(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
@ -641,8 +651,9 @@ struct common_file_info {
std::string path;
std::string name;
size_t size = 0; // in bytes
bool is_dir = false;
};
std::vector<common_file_info> fs_list_files(const std::string & path);
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
//
// Model utils

View File

@ -24,6 +24,7 @@
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
@ -35,6 +36,8 @@
#else
#include <sys/syslimits.h>
#endif
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
// isatty
@ -430,7 +433,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 0L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
auto data_vec = static_cast<std::vector<char> *>(data);
@ -517,16 +520,18 @@ static bool common_pull_file(httplib::Client & cli,
headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
}
std::atomic<size_t> downloaded{existing_size};
const char * func = __func__; // avoid __func__ inside a lambda
size_t downloaded = existing_size;
size_t progress_step = 0;
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
if (existing_size > 0 && response.status != 206) {
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
return false;
}
if (existing_size == 0 && response.status != 200) {
LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
return false;
}
if (total_size == 0 && response.has_header("Content-Length")) {
@ -534,7 +539,7 @@ static bool common_pull_file(httplib::Client & cli,
size_t content_length = std::stoull(response.get_header_value("Content-Length"));
total_size = existing_size + content_length;
} catch (const std::exception &e) {
LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
}
}
return true;
@ -542,11 +547,16 @@ static bool common_pull_file(httplib::Client & cli,
[&](const char *data, size_t len) {
ofs.write(data, len);
if (!ofs) {
LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
return false;
}
downloaded += len;
print_progress(downloaded, total_size);
progress_step += len;
if (progress_step >= total_size / 1000 || downloaded == total_size) {
print_progress(downloaded, total_size);
progress_step = 0;
}
return true;
},
nullptr
@ -1047,7 +1057,7 @@ std::string common_docker_resolve_model(const std::string &) {
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();
const std::vector<common_file_info> files = fs_list_files(cache_dir);
const std::vector<common_file_info> files = fs_list(cache_dir, false);
for (const auto & file : files) {
if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
common_cached_model_info model_info;

View File

@ -14,8 +14,10 @@ struct common_cached_model_info {
std::string model;
std::string tag;
size_t size = 0; // GGUF size in bytes
// return string representation like "user/model:tag"
// if tag is "latest", it will be omitted
std::string to_string() const {
return user + "/" + model + ":" + tag;
return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
}
};

View File

@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
}
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
};
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
@ -974,7 +974,7 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());

View File

@ -443,8 +443,22 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
static int common_get_verbosity(enum ggml_log_level level) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO;
case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN;
case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO
case GGML_LOG_LEVEL_NONE:
default:
return LOG_LEVEL_OUTPUT;
}
}
void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
auto verbosity = common_get_verbosity(level);
if (verbosity <= common_log_verbosity_thold) {
common_log_add(common_log_main(), level, "%s", text);
}
}

View File

@ -21,8 +21,14 @@
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
#define LOG_LEVEL_DEBUG 4
#define LOG_LEVEL_INFO 3
#define LOG_LEVEL_WARN 2
#define LOG_LEVEL_ERROR 1
#define LOG_LEVEL_OUTPUT 0 // output data from tools
#define LOG_DEFAULT_DEBUG LOG_LEVEL_DEBUG
#define LOG_DEFAULT_LLAMA LOG_LEVEL_INFO
enum log_colors {
LOG_COLORS_AUTO = -1,
@ -67,10 +73,11 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error (stderr, V = 0)
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
// I - info (stdout, V = LOG_DEFAULT_INFO)
// W - warning (stderr, V = LOG_DEFAULT_WARN)
// E - error (stderr, V = LOG_DEFAULT_ERROR)
// O - output (stdout, V = LOG_DEFAULT_OUTPUT)
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
@ -95,14 +102,14 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps); // w
} \
} while (0)
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, LOG_LEVEL_OUTPUT, __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__)
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)

1712
common/peg-parser.cpp Normal file

File diff suppressed because it is too large Load Diff

459
common/peg-parser.h Normal file
View File

@ -0,0 +1,459 @@
#pragma once
#include <nlohmann/json_fwd.hpp>
#include <memory>
#include <unordered_map>
#include <string>
#include <string_view>
#include <functional>
#include <vector>
#include <variant>
struct common_grammar_builder;
class common_peg_parser_builder;
using common_peg_parser_id = size_t;
constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
using common_peg_ast_id = size_t;
constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
// Lightweight wrapper around common_peg_parser_id for convenience
class common_peg_parser {
common_peg_parser_id id_;
common_peg_parser_builder & builder_;
public:
common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
common_peg_parser & operator=(const common_peg_parser & other);
common_peg_parser & operator+=(const common_peg_parser & other);
common_peg_parser & operator|=(const common_peg_parser & other);
operator common_peg_parser_id() const { return id_; }
common_peg_parser_id id() const { return id_; }
common_peg_parser_builder & builder() const { return builder_; }
// Creates a sequence
common_peg_parser operator+(const common_peg_parser & other) const;
// Creates a sequence separated by spaces.
common_peg_parser operator<<(const common_peg_parser & other) const;
// Creates a choice
common_peg_parser operator|(const common_peg_parser & other) const;
common_peg_parser operator+(const char * str) const;
common_peg_parser operator+(const std::string & str) const;
common_peg_parser operator<<(const char * str) const;
common_peg_parser operator<<(const std::string & str) const;
common_peg_parser operator|(const char * str) const;
common_peg_parser operator|(const std::string & str) const;
};
common_peg_parser operator+(const char * str, const common_peg_parser & p);
common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
common_peg_parser operator<<(const char * str, const common_peg_parser & p);
common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
common_peg_parser operator|(const char * str, const common_peg_parser & p);
common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
enum common_peg_parse_result_type {
COMMON_PEG_PARSE_RESULT_FAIL = 0,
COMMON_PEG_PARSE_RESULT_SUCCESS = 1,
COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
};
const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
struct common_peg_ast_node {
common_peg_ast_id id;
std::string rule;
std::string tag;
size_t start;
size_t end;
std::string_view text;
std::vector<common_peg_ast_id> children;
bool is_partial = false;
};
struct common_peg_parse_result;
using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
class common_peg_ast_arena {
std::vector<common_peg_ast_node> nodes_;
public:
common_peg_ast_id add_node(
const std::string & rule,
const std::string & tag,
size_t start,
size_t end,
std::string_view text,
std::vector<common_peg_ast_id> children,
bool is_partial = false
) {
common_peg_ast_id id = nodes_.size();
nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
return id;
}
const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
size_t size() const { return nodes_.size(); }
void clear() { nodes_.clear(); }
void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
};
struct common_peg_parse_result {
common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
size_t start = 0;
size_t end = 0;
std::vector<common_peg_ast_id> nodes;
common_peg_parse_result() = default;
common_peg_parse_result(common_peg_parse_result_type type, size_t start)
: type(type), start(start), end(start) {}
common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
: type(type), start(start), end(end) {}
common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
: type(type), start(start), end(end), nodes(std::move(nodes)) {}
bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
};
struct common_peg_parse_context {
std::string input;
bool is_partial;
common_peg_ast_arena ast;
int parse_depth;
common_peg_parse_context()
: is_partial(false), parse_depth(0) {}
common_peg_parse_context(const std::string & input)
: input(input), is_partial(false), parse_depth(0) {}
common_peg_parse_context(const std::string & input, bool is_partial)
: input(input), is_partial(is_partial), parse_depth(0) {}
};
class common_peg_arena;
// Parser variants
struct common_peg_epsilon_parser {};
struct common_peg_start_parser {};
struct common_peg_end_parser {};
struct common_peg_literal_parser {
std::string literal;
};
struct common_peg_sequence_parser {
std::vector<common_peg_parser_id> children;
};
struct common_peg_choice_parser {
std::vector<common_peg_parser_id> children;
};
struct common_peg_repetition_parser {
common_peg_parser_id child;
int min_count;
int max_count; // -1 for unbounded
};
struct common_peg_and_parser {
common_peg_parser_id child;
};
struct common_peg_not_parser {
common_peg_parser_id child;
};
struct common_peg_any_parser {};
struct common_peg_space_parser {};
struct common_peg_chars_parser {
struct char_range {
uint32_t start;
uint32_t end;
bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
};
std::string pattern;
std::vector<char_range> ranges;
bool negated;
int min_count;
int max_count; // -1 for unbounded
};
struct common_peg_json_string_parser {};
struct common_peg_until_parser {
std::vector<std::string> delimiters;
};
struct common_peg_schema_parser {
common_peg_parser_id child;
std::string name;
std::shared_ptr<nlohmann::ordered_json> schema;
// Indicates if the GBNF should accept a raw string that matches the schema.
bool raw;
};
struct common_peg_rule_parser {
std::string name;
common_peg_parser_id child;
bool trigger;
};
struct common_peg_ref_parser {
std::string name;
};
struct common_peg_atomic_parser {
common_peg_parser_id child;
};
struct common_peg_tag_parser {
common_peg_parser_id child;
std::string tag;
};
// Variant holding all parser types
using common_peg_parser_variant = std::variant<
common_peg_epsilon_parser,
common_peg_start_parser,
common_peg_end_parser,
common_peg_literal_parser,
common_peg_sequence_parser,
common_peg_choice_parser,
common_peg_repetition_parser,
common_peg_and_parser,
common_peg_not_parser,
common_peg_any_parser,
common_peg_space_parser,
common_peg_chars_parser,
common_peg_json_string_parser,
common_peg_until_parser,
common_peg_schema_parser,
common_peg_rule_parser,
common_peg_ref_parser,
common_peg_atomic_parser,
common_peg_tag_parser
>;
class common_peg_arena {
std::vector<common_peg_parser_variant> parsers_;
std::unordered_map<std::string, common_peg_parser_id> rules_;
common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
public:
const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
size_t size() const { return parsers_.size(); }
bool empty() const { return parsers_.empty(); }
common_peg_parser_id get_rule(const std::string & name) const;
bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
common_peg_parser_id root() const { return root_; }
void set_root(common_peg_parser_id id) { root_ = id; }
common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
void resolve_refs();
void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
std::string dump(common_peg_parser_id id) const;
nlohmann::json to_json() const;
static common_peg_arena from_json(const nlohmann::json & j);
std::string save() const;
void load(const std::string & data);
friend class common_peg_parser_builder;
private:
common_peg_parser_id add_parser(common_peg_parser_variant parser);
void add_rule(const std::string & name, common_peg_parser_id id);
common_peg_parser_id resolve_ref(common_peg_parser_id id);
};
class common_peg_parser_builder {
common_peg_arena arena_;
common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
public:
common_peg_parser_builder();
// Match nothing, always succeed.
// S -> ε
common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
// Matches the start of the input.
// S -> ^
common_peg_parser start() { return add(common_peg_start_parser{}); }
// Matches the end of the input.
// S -> $
common_peg_parser end() { return add(common_peg_end_parser{}); }
// Matches an exact literal string.
// S -> "hello"
common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
// Matches a sequence of parsers in order, all must succeed.
// S -> A B C
common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
// Matches the first parser that succeeds from a list of alternatives.
// S -> A | B | C
common_peg_parser choice() { return add(common_peg_choice_parser{}); }
common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
// Matches one or more repetitions of a parser.
// S -> A+
common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
// Matches zero or more repetitions of a parser, always succeeds.
// S -> A*
common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
// Matches zero or one occurrence of a parser, always succeeds.
// S -> A?
common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
// Positive lookahead: succeeds if child parser succeeds, consumes no input.
// S -> &A
common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
// Negative lookahead: succeeds if child parser fails, consumes no input.
// S -> !A
common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
// Matches any single character.
// S -> .
common_peg_parser any() { return add(common_peg_any_parser{}); }
// Matches between min and max repetitions of characters from a character class.
// S -> [a-z]{m,n}
//
// Use -1 for max to represent unbounded repetition (equivalent to {m,})
common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
// Creates a lightweight reference to a named rule (resolved during build()).
// Use this for forward references in recursive grammars.
// expr_ref -> expr
common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
// Matches zero or more whitespace characters (space, tab, newline).
// S -> [ \t\n]*
common_peg_parser space() { return add(common_peg_space_parser{}); }
// Matches all characters until a delimiter is found (delimiter not consumed).
// S -> (!delim .)*
common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
// Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
// S -> (!delim .)*
common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
// Matches everything
// S -> .*
common_peg_parser rest() { return until_one_of({}); }
// Matches between min and max repetitions of a parser (inclusive).
// S -> A{m,n}
// Use -1 for max to represent unbounded repetition (equivalent to {m,})
common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
// Matches exactly n repetitions of a parser.
// S -> A{n}
common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
// Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
// value -> object | array | string | number | true | false | null
common_peg_parser json();
common_peg_parser json_object();
common_peg_parser json_string();
common_peg_parser json_array();
common_peg_parser json_number();
common_peg_parser json_bool();
common_peg_parser json_null();
// Matches JSON string content without the surrounding quotes.
// Useful for extracting content within a JSON string.
common_peg_parser json_string_content();
// Matches a JSON object member with a key and associated parser as the
// value.
common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
// Wraps a parser with JSON schema metadata for grammar generation.
// Used internally to convert JSON schemas to GBNF grammar rules.
common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
// Creates a named rule, stores it in the grammar, and returns a ref.
// If trigger=true, marks this rule as an entry point for lazy grammar generation.
// auto json = p.rule("json", json_obj | json_arr | ...)
common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
// Creates a named rule using a builder function, and returns a ref.
// If trigger=true, marks this rule as an entry point for lazy grammar generation.
// auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
// Creates a trigger rule. When generating a lazy grammar from the parser,
// only trigger rules and descendents are emitted.
common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
// Creates an atomic parser. Atomic parsers do not create an AST node if
// the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
// intended for situations where partial output is undesirable.
common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
// Tags create nodes in the generated AST for semantic purposes.
// Unlike rules, you can tag multiple nodes with the same tag.
common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
void set_root(const common_peg_parser & p);
common_peg_arena build();
};
// Helper function for building parsers
common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);

64
common/unicode.cpp Normal file
View File

@ -0,0 +1,64 @@
#include "unicode.h"
// implementation adopted from src/unicode.cpp
size_t utf8_sequence_length(unsigned char first_byte) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
return lookup[highbits];
}
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
if (offset >= input.size()) {
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
}
// ASCII fast path
if (!(input[offset] & 0x80)) {
return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
}
// Invalid: continuation byte as first byte
if (!(input[offset] & 0x40)) {
return utf8_parse_result(utf8_parse_result::INVALID);
}
// 2-byte sequence
if (!(input[offset] & 0x20)) {
if (offset + 1 >= input.size()) {
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
}
if ((input[offset + 1] & 0xc0) != 0x80) {
return utf8_parse_result(utf8_parse_result::INVALID);
}
auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
}
// 3-byte sequence
if (!(input[offset] & 0x10)) {
if (offset + 2 >= input.size()) {
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
}
if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
return utf8_parse_result(utf8_parse_result::INVALID);
}
auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
}
// 4-byte sequence
if (!(input[offset] & 0x08)) {
if (offset + 3 >= input.size()) {
return utf8_parse_result(utf8_parse_result::INCOMPLETE);
}
if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
return utf8_parse_result(utf8_parse_result::INVALID);
}
auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
}
// Invalid first byte
return utf8_parse_result(utf8_parse_result::INVALID);
}

22
common/unicode.h Normal file
View File

@ -0,0 +1,22 @@
#pragma once
#include <cstdint>
#include <string_view>
// UTF-8 parsing utilities for streaming-aware unicode support
struct utf8_parse_result {
uint32_t codepoint; // Decoded codepoint (only valid if status == SUCCESS)
size_t bytes_consumed; // How many bytes this codepoint uses (1-4)
enum status { SUCCESS, INCOMPLETE, INVALID } status;
utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
: codepoint(cp), bytes_consumed(bytes), status(s) {}
};
// Determine the expected length of a UTF-8 sequence from its first byte
// Returns 0 for invalid first bytes
size_t utf8_sequence_length(unsigned char first_byte);
// Parse a single UTF-8 codepoint from input
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);

View File

@ -1581,10 +1581,27 @@ class MmprojModel(ModelBase):
# load preprocessor config
self.preprocessor_config = {}
if not self.is_mistral_format:
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
# prefer preprocessor_config.json if possible
preprocessor_config_path = self.dir_model / "preprocessor_config.json"
if preprocessor_config_path.is_file():
with open(preprocessor_config_path, "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
# prefer processor_config.json if possible
processor_config_path = self.dir_model / "processor_config.json"
if processor_config_path.is_file():
with open(processor_config_path, "r", encoding="utf-8") as f:
cfg = json.load(f)
# move image_processor to root level for compat
if "image_processor" in cfg:
cfg = {
**cfg,
**cfg["image_processor"],
}
# merge configs
self.preprocessor_config = {**self.preprocessor_config, **cfg}
def get_vision_config(self) -> dict[str, Any] | None:
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
return self.global_config.get(config_name)
@ -2797,9 +2814,38 @@ class Llama4VisionModel(MmprojModel):
@ModelBase.register("Mistral3ForConditionalGeneration")
class Mistral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_arch = gguf.MODEL_ARCH.MISTRAL3
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# for compatibility, we use LLAMA arch for older models
# TODO: remove this once everyone has migrated to newer version of llama.cpp
if self.hparams.get("model_type") != "ministral3":
self.model_arch = gguf.MODEL_ARCH.LLAMA
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_params = self.hparams.get("rope_parameters")
if self.hparams.get("model_type") == "ministral3":
assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
# TODO: probably not worth supporting quantized weight, as official BF16 is also available
if name.endswith("weight_scale_inv"):
raise ValueError("This is a quantized weight, please use BF16 weight instead")
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
return []
@ -4183,6 +4229,36 @@ class Qwen3MoeModel(Qwen2MoeModel):
super().set_vocab()
@ModelBase.register("Qwen3NextForCausalLM")
class Qwen3NextModel(Qwen2MoeModel):
model_arch = gguf.MODEL_ARCH.QWEN3NEXT
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_ssm_conv_kernel(self.hparams["linear_conv_kernel_dim"])
self.gguf_writer.add_ssm_state_size(self.hparams["linear_key_head_dim"])
self.gguf_writer.add_ssm_group_count(self.hparams["linear_num_key_heads"])
self.gguf_writer.add_ssm_time_step_rank(self.hparams["linear_num_value_heads"])
self.gguf_writer.add_ssm_inner_size(self.hparams["linear_value_head_dim"] * self.hparams["linear_num_value_heads"])
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("mtp"):
return [] # ignore MTP layers for now
if name.endswith(".A_log"):
data_torch = -torch.exp(data_torch)
elif name.endswith(".dt_bias"):
name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
elif "conv1d" in name:
data_torch = data_torch.squeeze()
elif name.endswith("norm.weight") and not name.endswith("linear_attn.norm.weight"):
data_torch = data_torch + 1
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("RND1")
class RND1Model(Qwen2MoeModel):
model_arch = gguf.MODEL_ARCH.RND1
@ -9779,12 +9855,22 @@ class ApertusModel(LlamaModel):
class MistralModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_arch = gguf.MODEL_ARCH.MISTRAL3
model_name = "Mistral"
hf_arch = ""
is_mistral_format = True
undo_permute = False
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# for compatibility, we use LLAMA arch for older models
# TODO: remove this once everyone migrates to newer version of llama.cpp
if "llama_4_scaling" not in self.hparams:
self.model_arch = gguf.MODEL_ARCH.LLAMA
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@ -9824,6 +9910,20 @@ class MistralModel(LlamaModel):
return template
def set_gguf_parameters(self):
super().set_gguf_parameters()
if "yarn" in self.hparams:
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
if "llama_4_scaling" in self.hparams:
self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
class PixtralModel(LlavaVisionModel):
model_name = "Pixtral"

View File

@ -42,6 +42,9 @@ The following releases are verified and recommended:
## News
- 2025.11
- Support malloc memory on device more than 4GB.
- 2025.2
- Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
|GPU|Base tokens/s|Increased tokens/s|Percent|
@ -789,6 +792,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
## Known Issues
@ -835,6 +840,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
| The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|
- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
You need to enable to support 4GB memory malloc by:
```
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
```
### **GitHub contribution**:
Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

View File

@ -431,11 +431,22 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren
### For Linux users:
#### Using the LunarG Vulkan SDK
First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.
> [!IMPORTANT]
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.
#### Using system packages
On Debian / Ubuntu, you can install the required dependencies using:
```sh
sudo apt-get install libvulkan-dev glslc
```
#### Common steps
Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:
```bash
vulkaninfo

288
docs/development/parsing.md Normal file
View File

@ -0,0 +1,288 @@
# Parsing Model Output
The `common` library contains a PEG parser implementation suitable for parsing
model output.
Types with the prefix `common_peg_*` are intended for general use and may have
applications beyond parsing model output, such as parsing user-provided regex
patterns.
Types with the prefix `common_chat_peg_*` are specialized helpers for model
output.
The parser features:
- Partial parsing of streaming input
- Built-in JSON parsers
- AST generation with semantics via "tagged" nodes
## Example
Below is a contrived example demonstrating how to use the PEG parser to parse
output from a model that emits arguments as JSON.
```cpp
auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
// Build a choice of all available tools
auto tool_choice = p.choice();
for (const auto & tool : tools) {
const auto & function = tool.at("function");
std::string name = function.at("name");
const auto & schema = function.at("parameters");
auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));
tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
}
// Define the tool call structure: <tool_call>[{tool}]</tool_call>
auto tool_call = p.trigger_rule("tool-call",
p.sequence({
p.literal("<tool_call>["),
tool_choice,
p.literal("]</tool_call>")
})
);
// Parser accepts content, optionally followed by a tool call
return p.sequence({
p.content(p.until("<tool_call>")),
p.optional(tool_call),
p.end()
});
});
```
For a more complete example, see `test_example_native()` in
[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).
## Parsers/Combinators
### Basic Matchers
- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
- **`start()`** - Matches the start of input (anchor `^`)
- **`end()`** - Matches the end of input (anchor `$`)
- **`literal(string)`** - Matches an exact literal string
- **`any()`** - Matches any single character (`.`)
### Combinators
- **`sequence(...)`** - Matches parsers in order; all must succeed
- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
- **`optional(p)`** - Matches zero or one occurrence (`?`)
- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
- **`repeat(p, n)`** - Matches exactly n repetitions
### Lookahead
- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)
### Character Classes & Utilities
- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
- **`until(delimiter)`** - Matches characters until delimiter is found (delimiter not consumed)
- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
- **`rest()`** - Matches everything remaining (`.*`)
### JSON Parsers
- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
- **`json_object()`** - JSON object parser
- **`json_array()`** - JSON array parser
- **`json_string()`** - JSON string parser
- **`json_number()`** - JSON number parser
- **`json_bool()`** - JSON boolean parser
- **`json_null()`** - JSON null parser
- **`json_string_content()`** - JSON string content without surrounding quotes
- **`json_member(key, p)`** - JSON object member with specific key and value parser
### Grammar Building
- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
### AST Control
- **`atomic(p)`** - Prevents AST node creation for partial parses
- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)
## GBNF Grammar Generation
The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
some exceptions.
```cpp
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(params.tools, [&](const json & fn) {
builder.resolve_refs(fn.at("parameters"));
});
parser.build_grammar(builder, data.grammar_lazy);
});
```
The notable exception is the `negate(p)` lookahead parser, which cannot be
defined as a CFG grammar and therefore does not produce a rule. Its usage
should be limited and preferably hidden behind a `schema()` parser. In many
cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.
Another limitation is that the PEG parser requires an unambiguous grammar. In
contrast, the `llama-grammar` implementation can support ambiguous grammars,
though they are difficult to parse.
### Lazy Grammars
During lazy grammar generation, only rules reachable from a `trigger_rule(p)`
are emitted in the grammar. All trigger rules are added as alternations in the
root rule. It is still necessary to define trigger patterns, as the parser has
no interaction with the grammar sampling.
### JSON Schema
The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
implementation to generate the grammar instead of the underlying parser.
The `raw` option emits a grammar suitable for a raw string instead of a JSON
string. In other words, it won't be wrapped in quotes or require escaping
quotes. It should only be used when `type == "string"`.
The downside is that it can potentially lead to ambiguous grammars. For
example, if a user provides the pattern `^.*$`, the following grammar may be
generated:
```
root ::= "<arg>" .* "</arg>"
```
This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
help mitigate this, if `.*` is found in the pattern, the grammar from the
underlying parser will be emitted instead.
## Common AST Shapes for Chat Parsing
Most model output can be placed in one of the following categories:
- Content only
- Tool calling with arguments emitted as a single JSON object
- Tool calling with arguments emitted as separate entities, either XML
(Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)
To provide broad coverage,
[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
mappers that help create parsers and visitors/extractors for these types. They
require parsers to tag nodes to conform to an AST "shape". This normalization
makes it easy to extract information and generalize parsing.
### Simple
The `common_chat_peg_builder` builds a `simple` parser that supports
content-only models with optional reasoning.
- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
- **`content(p)`** - Tag node for extracting `content`
```cpp
build_chat_peg_parser([&](common_chat_peg_parser & p) {
return p.sequence({
p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
p.content(p.until("<tool_call>")),
p.end()
});
});
```
Use `common_chat_peg_mapper` to extract the content. Note that this is already
done for you in `common_chat_peg_parser` when
`chat_format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.
```cpp
auto result = parser.parse(ctx);
common_chat_msg msg;
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
```
### Native
The `common_chat_peg_native_builder` builds a `native` parser suitable for
models that emit tool arguments as a direct JSON object.
- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_id(p)`** - Tag the tool call ID (optional)
- **`tool_name(p)`** - Tag the tool name
- **`tool_args(p)`** - Tag the tool arguments
```cpp
build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
auto get_weather_tool = p.tool(p.sequence({
p.tool_open(p.literal("{")),
p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
p.literal(","),
p.json_member("arguments", p.tool_args(p.json())),
p.tool_close(p.literal("}"))
}));
return p.sequence({
p.content(p.until("<tool_call>")),
p.literal("<tool_call>"),
get_weather_tool,
p.literal("</tool_call>"),
p.end()
});
});
```
### Constructed
The `common_chat_peg_constructed_builder` builds a `constructed` parser
suitable for models that emit tool arguments as separate entities, such as XML
tags.
- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_name(p)`** - Tag the tool name
- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
- **`tool_arg_open(p)`** - Tag start of a tool argument
- **`tool_arg_close(p)`** - Tag end of a tool argument
- **`tool_arg_name(p)`** - Tag the argument name
- **`tool_arg_string_value(p)`** - Tag string value for the argument
- **`tool_arg_json_value(p)`** - Tag JSON value for the argument
```cpp
build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
auto location_arg = p.tool_arg(
p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
p.tool_arg_string_value(p.until("</parameter>")),
p.tool_arg_close(p.literal("</parameter>"))
);
auto get_weather_tool = p.tool(p.sequence({
p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
location_arg,
p.tool_close(p.literal("</function>"))
}));
return p.sequence({
p.content(p.until("<tool_call>")),
p.literal("<tool_call>"),
get_weather_tool,
p.literal("</tool_call>"),
p.end()
});
});
```

View File

@ -21,11 +21,11 @@ Legend:
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
@ -36,10 +36,10 @@ Legend:
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
@ -102,7 +102,7 @@ Legend:
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
@ -115,7 +115,8 @@ Legend:
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |

View File

@ -5005,8 +5005,8 @@
"Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=f32,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=f16,ne=[10,10,5,1],permute=[1,0,2,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","0","no","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","0","no","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[0,2,1,3]","support","1","yes","Vulkan"
"Vulkan0","DUP","type=i16,ne=[10,8,3,1],permute=[1,2,0,3]","support","1","yes","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=1","support","0","no","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=2","support","0","no","Vulkan"
"Vulkan0","SET","type_src=f32,type_dst=f32,ne=[6,5,4,3],dim=3","support","0","no","Vulkan"
@ -5032,14 +5032,14 @@
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan"
@ -5271,7 +5271,7 @@
"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan"
@ -5415,21 +5415,49 @@
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","0","no","Vulkan"
"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,3,5,7]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,3,5]","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,3,5,7]","support","0","no","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=1","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=i32,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=f16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,1,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,1,3,5],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[2,3,5,7],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan"
"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
@ -5655,6 +5683,7 @@
"Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1","support","1","yes","Vulkan"
"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan"
"Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan"
@ -8644,9 +8673,13 @@
"Vulkan0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","Vulkan"
"Vulkan0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","0","no","Vulkan"
"Vulkan0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","Vulkan"
"Vulkan0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
@ -8666,9 +8699,13 @@
"Vulkan0","CLAMP","type=f32,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","Vulkan"
"Vulkan0","LEAKY_RELU","type=f32,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","FLOOR","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","CEIL","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","ROUND","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f32,ne=[7,1,5,3]","support","1","yes","Vulkan"
"Vulkan0","TRUNC","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,1,1],n_past=5","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,1],n_past=5","support","1","yes","Vulkan"
"Vulkan0","DIAG_MASK_INF","type=f32,ne=[10,10,3,2],n_past=5","support","1","yes","Vulkan"
@ -9411,28 +9448,405 @@
"Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","Vulkan"
"Vulkan0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
"Vulkan0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[15,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[31,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[63,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[64,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[127,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[128,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[255,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[256,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[511,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[512,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4095,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[4096,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8191,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[8192,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16383,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32767,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[32768,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65535,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[65536,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131071,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[131072,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262143,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[262144,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524287,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[524288,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048575,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1048576,1,1,1],order=0","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16,10,10,10],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[60,10,10,10],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[16384,1,1,1],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","0","no","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Vulkan"
"Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","0","no","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Vulkan"
@ -9445,6 +9859,10 @@
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=bicubic,transpose=1","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bicubic,flags=none","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bicubic,flags=none","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=0","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=513,transpose=1","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=none","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[5,7,11,13],ne_tgt=[2,5,7,11],mode=bilinear,flags=none","support","0","no","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[1,4,3,2],ne_tgt=[2,8,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
"Vulkan0","UPSCALE","type=f32,ne=[4,1,3,2],ne_tgt=[1,1,3,2],mode=bilinear,flags=align_corners","support","1","yes","Vulkan"
@ -9479,23 +9897,37 @@
"Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan"
"Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan"
"Vulkan0","ARANGE","type=f32,start=0.000000,stop=10.000000,step=1.000000","support","1","yes","Vulkan"
"Vulkan0","ARANGE","type=f32,start=0.000000,stop=1048576.000000,step=1.000000","support","1","yes","Vulkan"
"Vulkan0","TIMESTEP_EMBEDDING","type=f32,ne_a=[2,1,1,1],dim=320,max_period=10000","support","1","yes","Vulkan"
"Vulkan0","LEAKY_RELU","type=f32,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[127,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[128,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[255,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[256,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[511,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[512,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[1023,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[1024,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[2047,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[2048,5,4,3]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[242004,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","Vulkan"
"Vulkan0","XIELU","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","Vulkan"
"Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Vulkan"
"Vulkan0","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Vulkan"
"Vulkan0","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","1","yes","Vulkan"
"Vulkan0","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Vulkan"
"Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Vulkan"
"Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","1","yes","Vulkan"

Can't render this file because it is too large.

View File

@ -104,12 +104,16 @@ int main(int argc, char ** argv) {
params.embedding = true;
// get max number of sequences per batch
const int n_seq_max = llama_max_parallel_sequences();
// if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
// --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
// in order to support any number of prompts
if (params.n_parallel == 1) {
LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
params.kv_unified = true;
params.n_parallel = n_seq_max;
}
// utilize the full context
@ -123,9 +127,6 @@ int main(int argc, char ** argv) {
params.n_ubatch = params.n_batch;
}
// get max number of sequences per batch
const int n_seq_max = llama_max_parallel_sequences();
llama_backend_init();
llama_numa_init(params.numa);

View File

@ -231,9 +231,9 @@ DOT = '[^\\x0A\\x0D]'
RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]', '\\': '\\\\'}
NON_LITERAL_SET = set('|.()[]{}*+?')
ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')

View File

@ -4,6 +4,11 @@ set -e
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
if [ -z "$MODEL_TESTING_PROMPT"]; then
MODEL_TESTING_PROMPT="Hello, my name is"
fi
# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
@ -14,7 +19,8 @@ if [ -z "$CONVERTED_MODEL" ]; then
fi
echo $CONVERTED_MODEL
echo $MODEL_TESTING_PROMPT
cmake --build ../../build --target llama-logits -j8
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "Hello, my name is"
../../build/bin/llama-logits -m "$CONVERTED_MODEL" "$MODEL_TESTING_PROMPT"

View File

@ -184,8 +184,12 @@ model_name = os.path.basename(model_path)
# of using AutoModelForCausalLM.
print(f"Model class: {model.__class__.__name__}")
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
device = next(model.parameters()).device
if os.getenv("MODEL_TESTING_PROMPT"):
prompt = os.getenv("MODEL_TESTING_PROMPT")
else:
prompt = "Hello, my name is"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
print(f"Input tokens: {input_ids}")
print(f"Input text: {repr(prompt)}")

View File

@ -15,6 +15,9 @@ MODEL_FILE=models/llama-2-7b.Q4_0.gguf
NGL=99
CONTEXT=4096
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "use $GGML_SYCL_DEVICE as main GPU"

View File

@ -6,7 +6,7 @@
# If you want more control, DPC++ Allows selecting a specific device through the
# following environment variable
#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
export ONEAPI_DEVICE_SELECTOR="level_zero:0"
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
@ -18,11 +18,14 @@ MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
CONTEXT=4096
#support malloc device memory more than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "Using $GGML_SYCL_DEVICE as the main GPU"
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
fi

View File

@ -5,5 +5,7 @@
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0

View File

@ -5,5 +5,7 @@
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: support malloc device memory more than 4GB.
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -s 0 -e -ngl 99

View File

@ -175,14 +175,10 @@ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requi
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (MINGW)
set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
option(GGML_SCHED_NO_REALLOC "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@ -225,7 +221,7 @@ option(GGML_WEBGPU "ggml: use WebGPU"
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@ -407,62 +403,67 @@ if (MSVC)
/wd4996 # Disable POSIX deprecation warnings
/wd4702 # Unreachable code warnings
)
function(disable_msvc_warnings target_name)
set(MSVC_COMPILE_OPTIONS
"$<$<COMPILE_LANGUAGE:C>:/utf-8>"
"$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
)
function(configure_msvc_target target_name)
if(TARGET ${target_name})
target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
endif()
endfunction()
disable_msvc_warnings(ggml-base)
disable_msvc_warnings(ggml)
disable_msvc_warnings(ggml-cpu)
disable_msvc_warnings(ggml-cpu-x64)
disable_msvc_warnings(ggml-cpu-sse42)
disable_msvc_warnings(ggml-cpu-sandybridge)
disable_msvc_warnings(ggml-cpu-haswell)
disable_msvc_warnings(ggml-cpu-skylakex)
disable_msvc_warnings(ggml-cpu-icelake)
disable_msvc_warnings(ggml-cpu-alderlake)
configure_msvc_target(ggml-base)
configure_msvc_target(ggml)
configure_msvc_target(ggml-cpu)
configure_msvc_target(ggml-cpu-x64)
configure_msvc_target(ggml-cpu-sse42)
configure_msvc_target(ggml-cpu-sandybridge)
configure_msvc_target(ggml-cpu-haswell)
configure_msvc_target(ggml-cpu-skylakex)
configure_msvc_target(ggml-cpu-icelake)
configure_msvc_target(ggml-cpu-alderlake)
if (GGML_BUILD_EXAMPLES)
disable_msvc_warnings(common-ggml)
disable_msvc_warnings(common)
configure_msvc_target(common-ggml)
configure_msvc_target(common)
disable_msvc_warnings(mnist-common)
disable_msvc_warnings(mnist-eval)
disable_msvc_warnings(mnist-train)
configure_msvc_target(mnist-common)
configure_msvc_target(mnist-eval)
configure_msvc_target(mnist-train)
disable_msvc_warnings(gpt-2-ctx)
disable_msvc_warnings(gpt-2-alloc)
disable_msvc_warnings(gpt-2-backend)
disable_msvc_warnings(gpt-2-sched)
disable_msvc_warnings(gpt-2-quantize)
disable_msvc_warnings(gpt-2-batched)
configure_msvc_target(gpt-2-ctx)
configure_msvc_target(gpt-2-alloc)
configure_msvc_target(gpt-2-backend)
configure_msvc_target(gpt-2-sched)
configure_msvc_target(gpt-2-quantize)
configure_msvc_target(gpt-2-batched)
disable_msvc_warnings(gpt-j)
disable_msvc_warnings(gpt-j-quantize)
configure_msvc_target(gpt-j)
configure_msvc_target(gpt-j-quantize)
disable_msvc_warnings(magika)
disable_msvc_warnings(yolov3-tiny)
disable_msvc_warnings(sam)
configure_msvc_target(magika)
configure_msvc_target(yolov3-tiny)
configure_msvc_target(sam)
disable_msvc_warnings(simple-ctx)
disable_msvc_warnings(simple-backend)
configure_msvc_target(simple-ctx)
configure_msvc_target(simple-backend)
endif()
if (GGML_BUILD_TESTS)
disable_msvc_warnings(test-mul-mat)
disable_msvc_warnings(test-arange)
disable_msvc_warnings(test-backend-ops)
disable_msvc_warnings(test-cont)
disable_msvc_warnings(test-conv-transpose)
disable_msvc_warnings(test-conv-transpose-1d)
disable_msvc_warnings(test-conv1d)
disable_msvc_warnings(test-conv2d)
disable_msvc_warnings(test-conv2d-dw)
disable_msvc_warnings(test-customop)
disable_msvc_warnings(test-dup)
disable_msvc_warnings(test-opt)
disable_msvc_warnings(test-pool)
configure_msvc_target(test-mul-mat)
configure_msvc_target(test-arange)
configure_msvc_target(test-backend-ops)
configure_msvc_target(test-cont)
configure_msvc_target(test-conv-transpose)
configure_msvc_target(test-conv-transpose-1d)
configure_msvc_target(test-conv1d)
configure_msvc_target(test-conv2d)
configure_msvc_target(test-conv2d-dw)
configure_msvc_target(test-customop)
configure_msvc_target(test-dup)
configure_msvc_target(test-opt)
configure_msvc_target(test-pool)
endif ()
endif()

View File

@ -8,7 +8,7 @@ extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 0
#define RPC_PROTO_MINOR_VERSION 5
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

View File

@ -204,6 +204,10 @@
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#if defined(_WIN32) && !defined(_WIN32_WINNT)
# define _WIN32_WINNT 0x0A00
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
@ -2148,7 +2152,8 @@ extern "C" {
};
enum ggml_scale_flag {
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
GGML_SCALE_FLAG_ANTIALIAS = (1 << 9),
};
// interpolate
@ -2278,7 +2283,7 @@ extern "C" {
float stop,
float step);
#define GGML_KQ_MASK_PAD 64
#define GGML_KQ_MASK_PAD 1
// q: [n_embd_k, n_batch, n_head, ne3 ]
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]

View File

@ -127,10 +127,6 @@ if (NOT MSVC)
endif()
endif()
if (MINGW)
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# POSIX conformance
#
@ -221,6 +217,10 @@ if (GGML_BACKEND_DL)
target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
endif()
if (GGML_SCHED_NO_REALLOC)
target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
endif()
add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
@ -270,10 +270,13 @@ function(ggml_add_backend_library backend)
endif()
# Set versioning properties for all backend libraries
set_target_properties(${backend} PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}
)
# Building a MODULE library with a version is not supported on macOS (https://gitlab.kitware.com/cmake/cmake/-/issues/20782)
if (NOT (APPLE AND GGML_BACKEND_DL))
set_target_properties(${backend} PROPERTIES
VERSION ${GGML_VERSION}
SOVERSION ${GGML_VERSION_MAJOR}
)
endif()
if(NOT GGML_AVAILABLE_BACKENDS)
set(GGML_AVAILABLE_BACKENDS "${backend}"

View File

@ -921,10 +921,15 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
if (realloc) {
#ifndef NDEBUG
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
{
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
if (cur_size > 0) {
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n",
__func__, ggml_backend_buft_name(galloc->bufts[i]),
cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}
}
#endif
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i] == NULL) {

View File

@ -723,6 +723,12 @@ struct ggml_backend_sched {
bool op_offload;
int debug;
// used for debugging graph reallocations [GGML_SCHED_DEBUG_REALLOC]
// ref: https://github.com/ggml-org/llama.cpp/pull/17617
int debug_realloc;
int debug_graph_size;
int debug_prev_graph_size;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
@ -1234,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
}
if (sched->n_copies > 1) {
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy");
}
@ -1289,6 +1293,11 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
}
int graph_size = std::max(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
// remember the actual graph_size for performing reallocation checks later [GGML_SCHED_DEBUG_REALLOC]
sched->debug_prev_graph_size = sched->debug_graph_size;
sched->debug_graph_size = graph_size;
if (sched->graph.size < graph_size) {
sched->graph.size = graph_size;
sched->graph.nodes = (ggml_tensor **) realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
@ -1395,14 +1404,27 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
// allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
if (sched->debug_realloc > 0) {
// we are interested only in situations where the graph was reallocated even though its size remained the same [GGML_SCHED_DEBUG_REALLOC]
// example: https://github.com/ggml-org/llama.cpp/pull/17143
const bool unexpected = !backend_ids_changed && sched->debug_prev_graph_size == sched->debug_graph_size;
if (unexpected || sched->debug_realloc > 1) {
GGML_ABORT("%s: unexpected graph reallocation (graph size = %d, nodes = %d, leafs = %d), debug_realloc = %d\n", __func__,
sched->debug_graph_size, sched->graph.n_nodes, sched->graph.n_leafs, sched->debug_realloc);
}
}
// the re-allocation may cause the split inputs to be moved to a different address
// synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
@ -1614,6 +1636,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
sched->debug_realloc = 0;
#ifdef GGML_SCHED_NO_REALLOC
sched->debug_realloc = 1;
#endif
const char * GGML_SCHED_DEBUG_REALLOC = getenv("GGML_SCHED_DEBUG_REALLOC");
sched->debug_realloc = GGML_SCHED_DEBUG_REALLOC ? atoi(GGML_SCHED_DEBUG_REALLOC) : sched->debug_realloc;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
@ -1630,6 +1660,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->prev_node_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = (int *) calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->debug_graph_size = 0;
sched->debug_prev_graph_size = 0;
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = (char *) malloc(sched->context_buffer_size);

View File

@ -2207,78 +2207,120 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
}
/**
* @brief Initializes and caches sine/cosine positional encoding values
* (used in RoPE, Rotary Position Embedding) for attention layers.
* @brief Initializes and caches all intermediate tensors required for RoPE
* (Rotary Position Embedding), including support for Yarn, mRoPE,
* i-mRoPE, Neox repeat strategy, independent sectors, frequency factors
* and multi-section rotary groups.
*
* This function computes and caches the sin/cos values of
* θ = position * theta_scale for RoPE encoding. The cache is shared
* across attention layers, and only the first attention layer will
* trigger initialization. The cache includes repeated sin/cos values
* with different repeat methods depending on the @param is_neox flag.
* This function computes and caches the per-dimension θ coefficients used for
* Q/K rotary embedding. The cache is shared across layers, and recomputed only
* when any dependent parameter changes.
*
* Steps performed by this function:
* 1. Identify whether the target tensor belongs to Q/K in attention
* and restrict computation to the first layer only.
* 2. Initialize the theta scale array (arange power freq scaling).
* 3. Allocate sin/cos caches if the max prompt length increases.
* 4. Compute θ = position * theta_scale.
* 5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
* 6. Expand sin/cos values by repeat or repeat_interleave depending
* on whether @param is_neox is enabled.
* The function now supports:
* - Yarn RoPE extrapolation (via @param corr_dims and @param ext_factor)
* - Per-dimension independent sector exponent rules (indep_sects + sections[])
* - Multi-section RoPE (mRoPE) index mapping (mrope_used + is_imrope)
* - Frequency factor division (src2)
* - Neox / normal repeat expansion modes
*
* @param ctx The CANN backend context, holding memory pool,
* stream, and persistent buffers for rope init/cache.
* @param dst The destination ggml_tensor whose computation
* depends on the RoPE values (usually Qcur/Kcur).
* @param theta_scale Scalar exponent base for computing theta scale values.
* @param freq_scale Frequency scaling factor, applied to theta scale.
* @param attn_factor Attention scaling factor, applied to sin/cos.
* @param is_neox Whether to use Neox-style repeat strategy
* (dim expansion vs repeat_interleave).
* @param ctx CANN backend context, containing memory pool,
* cached buffers, and runtime stream.
* @param dst Destination ggml_tensor whose computation
* depends on RoPE (typically Qcur or Kcur).
* @param corr_dims [low, high] Yarn correction range.
* @param ext_factor Yarn extrapolation strength. 0 = disabled.
* @param theta_scale Base multiplier for per-dimension θ exponent.
* @param freq_scale Global frequency scaling factor.
* @param attn_factor Optional scaling applied to sin/cos (if needed).
* @param is_neox Whether to use Neox-style dimension interleave.
* @param sections 4-way sector sizes for independent-section RoPE
* and multi-section mRoPE (t/h/w/e).
* @param mrope_used Whether to enable multi-section rotary embedding.
* @param is_imrope Whether to apply interleaved mRoPE rules.
* @param indep_sects Whether each dimension runs independent exponent
* resets based on @p sections.
*/
static void aclnn_cache_init(ggml_backend_cann_context & ctx,
ggml_tensor * dst,
float * corr_dims,
float ext_factor,
float theta_scale,
float freq_scale,
float attn_factor,
bool is_neox) {
static void aclnn_rope_cache_init(ggml_backend_cann_context & ctx,
ggml_tensor * dst,
float * corr_dims,
float ext_factor,
float theta_scale,
float freq_scale,
float attn_factor,
bool is_neox,
int sections[4],
bool mrope_used,
bool is_imrope,
bool indep_sects) {
ggml_tensor * src0 = dst->src[0]; // input
ggml_tensor * src1 = dst->src[1]; // position
ggml_tensor * src2 = dst->src[2]; // freq_factors
if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor &&
ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale &&
ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) {
int64_t theta_scale_length = src0->ne[0] / 2;
int64_t position_length = dst->ne[2];
// TODO: check theta_scale_length and position_length.
if (src2 == nullptr && ctx.rope_cache.cached &&
ctx.rope_cache.equal(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor,
is_neox, indep_sects, mrope_used, is_imrope, sections)) {
// use cache.
return;
}
int64_t theta_scale_length = src0->ne[0] / 2;
int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
size_t theta_scale_nb[] = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) };
// Step0: calculate tensor shape.
int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
size_t theta_scale_nb[] = { sizeof(float), theta_scale_length * sizeof(float), theta_scale_length * sizeof(float),
theta_scale_length * sizeof(float) };
GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0];
int64_t position_ne[] = { 1, 1, position_length, 1 };
size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
int64_t position_ne[] = { 1, 1, position_length, 1 };
size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 };
size_t theta_nb[GGML_MAX_DIMS];
theta_nb[0] = sizeof(float);
int64_t cache_ne[] = { theta_scale_length, 1, position_length, 1 };
size_t cache_nb[GGML_MAX_DIMS];
cache_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
cache_nb[i] = cache_nb[i - 1] * cache_ne[i - 1];
}
// theta_scale arange, [0,1,...,ne00/2 - 1]
// Step1: Compute the coefficient of theta. During the cache_init process, aside from
// (1) multiplying by the position,
// (2) dividing by freq_factors,
// (3) computing the sine and cosine,
// the other parameters used in the computation generally do not change in most scenarios.
// Therefore, we can first compute this part of the result and then cache it.
// Step1.1: prepare theta_scale exponent. if this exponent updated, should update theta_scale_tensor.
acl_tensor_ptr acl_theta_scale_tensor;
// cache theta scale
if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
// theta_scale and freq_scale should not change during the current token inference process,
// so we can directly use == here instead of comparing the absolute difference.
ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) {
ctx.rope_cache.theta_scale_length = theta_scale_length;
bool theta_scale_updated = false;
if (ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.theta_scale != theta_scale ||
ctx.rope_cache.indep_sects != indep_sects) {
theta_scale_updated = true;
if (ctx.rope_cache.theta_scale_exp_host != nullptr) {
free(ctx.rope_cache.theta_scale_exp_host);
}
ctx.rope_cache.theta_scale_exp_host = (float *) malloc(theta_scale_length * sizeof(float));
GGML_ASSERT(ctx.rope_cache.theta_scale_exp_host != nullptr);
if (!indep_sects) {
ctx.rope_cache.theta_scale_exp_host[0] = 1;
for (int i = 1; i < theta_scale_length; i++) {
ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
}
} else {
int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
int sec_w = sections[1] + sections[0];
int sec_e = sections[2] + sec_w;
ctx.rope_cache.theta_scale_exp_host[0] = 1;
for (int i = 1; i < theta_scale_length; i++) {
int sector = i % sect_dims;
if (sector == 0 || sector == sections[0] || sector == sec_w || sector == sec_e) {
ctx.rope_cache.theta_scale_exp_host[i] = 1;
continue;
}
ctx.rope_cache.theta_scale_exp_host[i] = ctx.rope_cache.theta_scale_exp_host[i - 1] * theta_scale;
}
}
if (ctx.rope_cache.theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
@ -2286,74 +2328,138 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
ctx.rope_cache.theta_scale_exp_host, theta_scale_length * sizeof(float),
ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, 1);
}
float start = 0;
float step = 1;
float stop = theta_scale_length;
float n_elements = theta_scale_length;
aclnn_arange(ctx, acl_theta_scale_tensor.get(), start, stop, step, n_elements);
// Step1.2: prepare rope_yarn_ramp, if this part updated, should update theta_scale_tensor.
bool yarn_ramp_tensor_updated = false;
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
acl_tensor_ptr acl_yarn_ramp_tensor;
if (ext_factor != 0 &&
// TODO: check more parameter.
(ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.freq_scale != freq_scale)) {
yarn_ramp_tensor_updated = true;
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
acl_tensor_ptr acl_yarn_ramp_tensor;
if (ext_factor != 0) {
// -rope_yarn_ramp
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor =
ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
acl_scalar_ptr zero = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr one = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr denom_safe = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr ext_factor_sc = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
// -rope_yarn_ramp
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor =
ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
acl_scalar_ptr zero = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr one = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr denom_safe = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
acl_scalar_ptr ext_factor_sc = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor.get(), low.get(), one.get(),
acl_yarn_ramp_tensor.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
aclnn_arange(ctx, acl_yarn_ramp_tensor.get(), 0, theta_scale_length, 1, theta_scale_length);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), low.get(), one.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
// theta_interp = freq_scale * theta_extrap;
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
//
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
// cache freq_scale + (freq_scale - 1) * ramp_mix
float freq_scale_1 = freq_scale - 1;
acl_scalar_ptr freq_scale_sc = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
}
// theta_interp = freq_scale * theta_extrap;
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
//
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
// cache freq_scale + (freq_scale - 1) * ramp_mix
float freq_scale_1 = freq_scale - 1;
acl_scalar_ptr freq_scale_sc = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
}
// power
acl_scalar_ptr acl_theta_scale = ggml_cann_create_scalar(&theta_scale, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale.get(), acl_theta_scale_tensor.get(),
acl_theta_scale_tensor.get());
if (ext_factor != 0) {
// Step 1.3: update theta_scale_tensor according to ext_factor or freq_scale.
if (ext_factor != 0) {
if (theta_scale_updated || yarn_ramp_tensor_updated) {
theta_scale_updated = true;
aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
} else if (freq_scale != 1) {
aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
}
} else {
// use cache
if (freq_scale != 1 && (ctx.rope_cache.freq_scale != freq_scale || theta_scale_updated)) {
theta_scale_updated = true;
aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
}
}
// Nothing changed, use cache.
if (!theta_scale_updated) {
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
}
// Step 1.4: prepare select index if mrope
acl_tensor_ptr position_select_index_tensor;
if (mrope_used) {
if (ctx.rope_cache.sections[0] != sections[0] || ctx.rope_cache.sections[1] != sections[1] ||
ctx.rope_cache.sections[2] != sections[2] || ctx.rope_cache.sections[3] != sections[3] ||
ctx.rope_cache.theta_scale_length != theta_scale_length || ctx.rope_cache.is_imrope != is_imrope) {
if (ctx.rope_cache.position_select_index_host != nullptr) {
free(ctx.rope_cache.position_select_index_host);
}
ctx.rope_cache.position_select_index_host = (int *) malloc(theta_scale_length * sizeof(int));
GGML_ASSERT(ctx.rope_cache.position_select_index_host != nullptr);
int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
int sec_w = sections[1] + sections[0];
int sec_e = sections[2] + sec_w;
// t,h,w,e
for (int i = 0; i < theta_scale_length; i++) {
int sector = i % sect_dims;
if (is_imrope) { // qwen3vl apply interleaved mrope
if (sector % 3 == 1 && sector < 3 * sections[1]) {
ctx.rope_cache.position_select_index_host[i] = 1;
} else if (sector % 3 == 2 && sector < 3 * sections[2]) {
ctx.rope_cache.position_select_index_host[i] = 2;
} else if (sector % 3 == 0 && sector < 3 * sections[0]) {
ctx.rope_cache.position_select_index_host[i] = 0;
} else {
ctx.rope_cache.position_select_index_host[i] = 3;
}
} else {
if (sector >= sections[0] && sector < sec_w) {
ctx.rope_cache.position_select_index_host[i] = 1;
} else if (sector >= sec_w && sector < sec_e) {
ctx.rope_cache.position_select_index_host[i] = 2;
} else if (sector >= sec_e) {
ctx.rope_cache.position_select_index_host[i] = 3;
} else {
ctx.rope_cache.position_select_index_host[i] = 0;
}
}
}
if (ctx.rope_cache.position_select_index != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.position_select_index));
}
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMemcpyAsync(ctx.rope_cache.position_select_index, theta_scale_length * sizeof(int),
ctx.rope_cache.position_select_index_host, theta_scale_length * sizeof(int),
ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream()));
}
position_select_index_tensor = ggml_cann_create_tensor(ctx.rope_cache.position_select_index, ACL_INT32,
sizeof(int), theta_scale_ne, theta_scale_nb, 1);
}
// Step2: divide by freq_factors
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
// freq_factors
if (src2) {
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
void * freq_fac_res_ptr = freq_fac_res_allocator.get();
@ -2366,6 +2472,85 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
}
// Step3: prepare position_tensor
acl_tensor_ptr acl_position_tensor;
ggml_cann_pool_alloc mrope_position_acllocator(ctx.pool());
if (mrope_used) {
// Step3.1: select current position;
// position :
// pos1: [[0, 1 ,2 ,3 ],
// pos2: [4, 5 ,6 ,7 ],
// pos3: [8, 9 ,10,11],
// pos4: [12,13,14,15] ]
//
// select index = [0, 1, 2, 2, 1, 0]
//
// selected_tensor:
// [[0, 1 ,2 ,3 ],
// [4, 5 ,6 ,7 ],
// [8, 9 ,10,11],
// [8, 9 ,10,11],
// [4, 5 ,6 ,7 ],
// [0, 1 ,2 ,3 ]]
//
// transpose, from [seq_len:dims] to [dims:seq_len]
// [0, 4, 8 ,8 ,4, 0],
// [1, 5, 9, 9, 5, 1],
// [2, 6, 10,10,6 ,2],
// [3, 7, 11,11,7 3 ]]
//
// multipy by theta_scale_tensor
// [theta_scale^0, theta_scale^1, ..., theta_scale ^ n]
int64_t mrope_position_ne[] = { position_length, 4 };
size_t mrope_position_nb[] = { sizeof(int), position_length * sizeof(int) };
acl_tensor_ptr mrope_position =
ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
mrope_position_ne, mrope_position_nb, 2);
// selected position tensor's shape is a transpose of cache tensor.
int64_t selected_position_ne[] = { position_length, theta_scale_length };
size_t selected_position_nb[] = { sizeof(float), position_length * sizeof(float) };
mrope_position_acllocator.alloc(theta_scale_length * position_length * sizeof(float));
void * mrope_position_buffer = mrope_position_acllocator.get();
acl_position_tensor =
ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
ggml_type_size(src1->type), selected_position_ne, selected_position_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, mrope_position.get(), 0, position_select_index_tensor.get(),
acl_position_tensor.get());
// transpose
int64_t transposed_ne[] = { position_length, 1, theta_scale_length, 1 };
size_t transposed_nb[GGML_MAX_DIMS];
transposed_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
transposed_nb[i] = transposed_nb[i - 1] * transposed_ne[i - 1];
}
std::swap(transposed_ne[0], transposed_ne[2]);
std::swap(transposed_nb[0], transposed_nb[2]);
acl_position_tensor =
ggml_cann_create_tensor(mrope_position_buffer, ggml_cann_type_mapping(src1->type),
ggml_type_size(src1->type), transposed_ne, transposed_nb, GGML_MAX_DIMS);
} else {
// auto bcast.
acl_position_tensor =
ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
position_ne, position_nb, GGML_MAX_DIMS);
}
// Step4: multiply by the position
int64_t theta_length = theta_scale_length * position_length;
ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
void * theta_buffer = theta_allocator.get();
acl_tensor_ptr acl_theta_tensor =
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
// Step5: calculate sin cos.
// init sin_repeat && cos_repeat, only to accelerate first layer on each device
if (position_length > ctx.rope_cache.position_length) {
ctx.rope_cache.position_length = position_length;
@ -2382,44 +2567,30 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
}
// position
acl_tensor_ptr acl_position_tensor =
ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne,
position_nb, GGML_MAX_DIMS);
// power * position
int64_t theta_length = theta_scale_length * position_length;
ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
void * theta_buffer = theta_allocator.get();
acl_tensor_ptr acl_theta_tensor =
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
// sin/cos
ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
void * sin_buffer = sin_allocator.get();
acl_tensor_ptr acl_sin_tensor =
ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());
ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
void * cos_buffer = cos_allocator.get();
acl_tensor_ptr acl_cos_tensor =
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), cache_ne, cache_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());
if (ext_factor != 0) {
attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
// attn_factor
// Step 5: multiply by attn_factor
if (attn_factor != 1) {
aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
}
int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 };
int64_t sin_reshape_ne[4] = { src0->ne[0], 1, dst->ne[2], 1 };
size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
@ -2430,8 +2601,9 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
// repeat
// Step 6: repeat
if (is_neox) {
// [sinθ1, sinθ1, sinθ2, sinθ2, ..., sinθn, sinθn]
int64_t repeatsArray[] = { 1, 1, 1, 2 };
aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
@ -2439,17 +2611,15 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
int64_t num_repeats = 2;
int64_t dim = 3;
int64_t output_size = theta_scale_length * num_repeats;
// [sinθ1, sinθ2, ..., sinθn, sinθ1, sinθ2, ..., sinθn]
aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
}
// Other layers use cache except first layer.
ctx.rope_cache.cached = true;
ctx.rope_cache.ext_factor = ext_factor;
ctx.rope_cache.theta_scale = theta_scale;
ctx.rope_cache.freq_scale = freq_scale;
ctx.rope_cache.attn_factor = attn_factor;
ctx.rope_cache.is_neox = is_neox;
// Update cached value.
ctx.rope_cache.cached = true;
ctx.rope_cache.set(theta_scale_length, position_length, ext_factor, theta_scale, freq_scale, attn_factor, is_neox,
indep_sects, mrope_used, is_imrope, sections);
}
#ifdef __cplusplus
@ -2475,6 +2645,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// param
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4];
// const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
@ -2483,12 +2654,13 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
GGML_TENSOR_UNARY_OP_LOCALS
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
// TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0);
@ -2499,10 +2671,25 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
float corr_dims[2];
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; // qwen3vl apply interleaved mrope
const bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, note: also true for vision (24 & 8 == true) and for imrope
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
if (mrope_used) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
}
if (is_vision) {
GGML_ASSERT(n_dims == ne0/2);
}
if (is_imrope || mrope_used) {
is_neox = true;
}
// init ctx.rope_cos/rope_sin cache
aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox);
aclnn_rope_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox, sections, mrope_used, is_imrope, is_vision);
int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
size_t sin_reshape_nb[GGML_MAX_DIMS];
@ -2658,8 +2845,7 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
return;
#endif
// ggml_mode = 0 --> aclnn_model = 1
int64_t acl_mode = mode == 0 ? 1 : mode;
int64_t acl_mode = is_neox ? 0 : 1;
switch (src0->type) {
case GGML_TYPE_F32:

View File

@ -300,30 +300,92 @@ struct ggml_cann_graph_lru_cache {
struct ggml_cann_rope_cache {
~ggml_cann_rope_cache() {
if (theta_scale_cache != nullptr) {
if (theta_scale_cache) {
ACL_CHECK(aclrtFree(theta_scale_cache));
}
if (sin_cache != nullptr) {
if (sin_cache) {
ACL_CHECK(aclrtFree(sin_cache));
}
if (cos_cache != nullptr) {
if (cos_cache) {
ACL_CHECK(aclrtFree(cos_cache));
}
if (position_select_index) {
ACL_CHECK(aclrtFree(position_select_index));
}
if (theta_scale_exp_host) {
free(theta_scale_exp_host);
}
if(position_select_index_host) {
free(position_select_index_host);
}
}
void * theta_scale_cache = nullptr;
int64_t theta_scale_length = 0;
bool equal(int64_t theta_scale_length,
int64_t position_length,
float ext_factor,
float theta_scale,
float freq_scale,
float attn_factor,
bool is_neox,
bool indep_sects,
bool mrope_used,
bool is_imrope,
int sections[4]) {
return this->theta_scale_length == theta_scale_length && this->position_length == position_length &&
this->ext_factor == ext_factor && this->theta_scale == theta_scale && this->freq_scale == freq_scale &&
this->attn_factor == attn_factor && this->is_neox == is_neox && this->indep_sects == indep_sects &&
this->mrope_used == mrope_used && this->is_imrope == is_imrope && this->sections[0] == sections[0] &&
this->sections[1] == sections[1] && this->sections[2] == sections[2] && this->sections[3] == sections[3];
}
void set(int64_t theta_scale_length,
int64_t position_length,
float ext_factor,
float theta_scale,
float freq_scale,
float attn_factor,
bool is_neox,
bool indep_sects,
bool mrope_used,
bool is_imrope,
int sections[4]) {
this->theta_scale_length = theta_scale_length;
this->position_length = position_length;
this->ext_factor = ext_factor;
this->theta_scale = theta_scale;
this->freq_scale = freq_scale;
this->attn_factor = attn_factor;
this->is_neox = is_neox;
this->indep_sects = indep_sects;
this->mrope_used = mrope_used;
this->is_imrope = is_imrope;
this->sections[0] = sections[0];
this->sections[1] = sections[1];
this->sections[2] = sections[2];
this->sections[3] = sections[3];
}
// memory cache, prepare before inferencing.
void * theta_scale_cache = nullptr;
float * theta_scale_exp_host = nullptr;
int * position_select_index_host = nullptr;
void * position_select_index = nullptr;
// sin/cos cache, used only to accelerate first layer on each device
void * sin_cache = nullptr;
void * cos_cache = nullptr;
int64_t position_length = 0;
void * sin_cache = nullptr;
void * cos_cache = nullptr;
// Properties to check before reusing the sincos cache
bool cached = false;
float ext_factor = 0.0f;
float theta_scale = 0.0f;
float freq_scale = 0.0f;
float attn_factor = 0.0f;
bool is_neox = false;
int64_t theta_scale_length = 0;
int64_t position_length = 0;
bool cached = false;
float ext_factor = 0.0f;
float theta_scale = 0.0f;
float freq_scale = 0.0f;
float attn_factor = 0.0f;
bool is_neox = false;
bool indep_sects = false;
bool mrope_used = false;
int sections[4] = { 0, 0, 0, 0 };
bool is_imrope = false;
};
struct ggml_cann_tensor_cache {

View File

@ -2480,13 +2480,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
return false;
}
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
if (op->src[0]->ne[0] > 896) {
return false;
}
@ -2507,6 +2500,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
return false;
}
if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) {
return false;
}
return true;
}
case GGML_OP_POOL_2D:
@ -2568,6 +2564,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
return true;
case GGML_OP_OUT_PROD:
{
#ifdef ASCEND_310P
// Ger is not supported on 310p device
return false;
#endif
switch (op->src[0]->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:

View File

@ -224,7 +224,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
include(CheckCXXSourceCompiles)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}")
string(REPLACE ";" " " ARCH_FLAGS_STR "${ARCH_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS_STR}")
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
set(ARM_FEATURE "HAVE_${feature}")
check_cxx_source_compiles(

View File

@ -33,10 +33,12 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@ -44,12 +46,14 @@
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#define ggml_gemm_iq4_nl_8x8_q8_0_generic ggml_gemm_iq4_nl_8x8_q8_0
#elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM) || defined(_M_ARM64)
// repack.cpp
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
@ -58,11 +62,14 @@
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
#elif defined(__POWERPC__) || defined(__powerpc__)
// ref: https://github.com/ggml-org/llama.cpp/pull/14146#issuecomment-2972561679
@ -74,10 +81,12 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@ -85,6 +94,7 @@
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@ -99,10 +109,12 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@ -110,6 +122,7 @@
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@ -132,15 +145,18 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
#define ggml_gemv_iq4_nl_8x8_q8_0_generic ggml_gemv_iq4_nl_8x8_q8_0
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@ -161,10 +177,12 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@ -172,6 +190,7 @@
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0
@ -194,10 +213,12 @@
// repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
#define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4
#define ggml_quantize_mat_q8_K_4x8_generic ggml_quantize_mat_q8_K_4x8
#define ggml_gemv_q4_0_4x4_q8_0_generic ggml_gemv_q4_0_4x4_q8_0
#define ggml_gemv_q4_0_4x8_q8_0_generic ggml_gemv_q4_0_4x8_q8_0
#define ggml_gemv_q4_0_8x8_q8_0_generic ggml_gemv_q4_0_8x8_q8_0
#define ggml_gemv_q4_K_8x4_q8_K_generic ggml_gemv_q4_K_8x4_q8_K
#define ggml_gemv_q4_K_8x8_q8_K_generic ggml_gemv_q4_K_8x8_q8_K
#define ggml_gemv_q2_K_8x8_q8_K_generic ggml_gemv_q2_K_8x8_q8_K
#define ggml_gemv_iq4_nl_4x4_q8_0_generic ggml_gemv_iq4_nl_4x4_q8_0
@ -205,6 +226,7 @@
#define ggml_gemm_q4_0_4x4_q8_0_generic ggml_gemm_q4_0_4x4_q8_0
#define ggml_gemm_q4_0_4x8_q8_0_generic ggml_gemm_q4_0_4x8_q8_0
#define ggml_gemm_q4_0_8x8_q8_0_generic ggml_gemm_q4_0_8x8_q8_0
#define ggml_gemm_q4_K_8x4_q8_K_generic ggml_gemm_q4_K_8x4_q8_K
#define ggml_gemm_q4_K_8x8_q8_K_generic ggml_gemm_q4_K_8x8_q8_K
#define ggml_gemm_q2_K_8x8_q8_K_generic ggml_gemm_q2_K_8x8_q8_K
#define ggml_gemm_iq4_nl_4x4_q8_0_generic ggml_gemm_iq4_nl_4x4_q8_0

View File

@ -8,6 +8,10 @@
#include <sys/sysctl.h>
#endif
#if !defined(HWCAP2_SVE2)
#define HWCAP2_SVE2 (1 << 1)
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM (1 << 13)
#endif

View File

@ -497,6 +497,140 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemv_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int col_groups = ncols_interleaved / 4; // 0123 and 4567
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 1x8 tile = 2 x 4
float32x4_t acc_f32[col_groups];
const block_q8_K * GGML_RESTRICT q8_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < col_groups; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
float32x4_t q4_d_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d)); // d0 d1 d2 d3
float32x4_t q4_d_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4)); // d4 d5 d6 d7
float32x4_t q8_d = vdupq_n_f32(q8_ptr[b].d);
float32x4_t sb_scale_0123 = vmulq_f32(q4_d_0, q8_d);
float32x4_t sb_scale_4567 = vmulq_f32(q4_d_1, q8_d);
float32x4_t q4_dmin_0 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin)); // dmin 0..3
float32x4_t q4_dmin_1 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4)); // dmin 4..7
float32x4_t sb_min_0123 = vmulq_f32(q4_dmin_0, q8_d);
float32x4_t sb_min_4567 = vmulq_f32(q4_dmin_1, q8_d);
// interleaved bias_acc: [0]->r0 0123, [1]->r0 4567
int32x4_t bias_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
int32x4_t acc_lo[col_groups];
int32x4_t acc_hi[col_groups];
// Each bsum is 16 elements, pairwise add leaves us with the 8 bsums of the entire block
const int16x8_t bsums = vpaddq_s16(vld1q_s16(q8_ptr[b].bsums), vld1q_s16(q8_ptr[b].bsums + 8));
int16_t bsums_arr[8];
vst1q_s16(bsums_arr, bsums);
for (int sb = 0; sb < QK_K / 64; sb++) {
for (int i = 0; i < col_groups; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q4sb_mins[2];
int16x8_t q4sb_scales[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q4sb[8];
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
}
int8x16_t q8_qs[64 / 16];
for (int i = 0; i < 64 / 16; i++) {
q8_qs[i] = vld1q_s8(q8_ptr[b].qs + sb * 64 + i * 16);
}
for (int c = 0; c < col_groups; c++) {
uint8x16_t q4_cols[8];
for (int i = 0; i < 8; i++) {
q4_cols[i] = vld1q_u8(q4_ptr[b].qs + sb * QK_K + i * 32 + 16 * c);
}
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[0], m4b)), q8_qs[0], 0);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[1], m4b)), q8_qs[0], 1);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[2], m4b)), q8_qs[0], 2);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[3], m4b)), q8_qs[0], 3);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[4], m4b)), q8_qs[1], 0);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[5], m4b)), q8_qs[1], 1);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[6], m4b)), q8_qs[1], 2);
acc_lo[c] = vdotq_laneq_s32(acc_lo[c], vreinterpretq_s8_u8(vandq_u8(q4_cols[7], m4b)), q8_qs[1], 3);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[0], 4)), q8_qs[2], 0);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[1], 4)), q8_qs[2], 1);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[2], 4)), q8_qs[2], 2);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[3], 4)), q8_qs[2], 3);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[4], 4)), q8_qs[3], 0);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[5], 4)), q8_qs[3], 1);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[6], 4)), q8_qs[3], 2);
acc_hi[c] = vdotq_laneq_s32(acc_hi[c], vreinterpretq_s8_u8(vshrq_n_u8(q4_cols[7], 4)), q8_qs[3], 3);
}
// Scales
// row c0123 blk0 and blk1
const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
const float32x4_t sumf_0123 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[0]),
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[0])));
acc_f32[0] = vfmaq_f32(acc_f32[0], sb_scale_0123, sumf_0123);
// row c4567 blk0 and blk1
const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
const float32x4_t sumf_4567 = vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[1]),
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[1])));
acc_f32[1] = vfmaq_f32(acc_f32[1], sb_scale_4567, sumf_4567);
// Bias Correction
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[2 * sb + 0]);
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[2 * sb + 1]);
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[0] = vmlal_s16(bias_acc[0], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[1] = vmlal_s16(bias_acc[1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
} // for sb
acc_f32[0] = vmlsq_f32(acc_f32[0], vcvtq_f32_s32(bias_acc[0]), sb_min_0123);
acc_f32[1] = vmlsq_f32(acc_f32[1], vcvtq_f32_s32(bias_acc[1]), sb_min_4567);
} // for b
int base = x * ncols_interleaved;
vst1q_f32(s + base, acc_f32[0]);
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // #if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemv_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,
@ -518,7 +652,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON)
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int col_pairs = ncols_interleaved / 2;
const uint8x16_t m4b = vdupq_n_u8(0x0f);
@ -615,7 +749,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
float32x4_t sb_scale = p == 0 ? sb_scale_0 : sb_scale_1;
// 0123 or 4567
// TODO: Single superblock mul at the end of the superblock
float32x4_t sumf_0 =
vcvtq_f32_s32(vmulq_s32(vmovl_s16(group_scales_lo), vpaddq_s32(acc_lo[p], acc_lo[p + 1])));
acc_f32[i] = vfmaq_f32(acc_f32[i], sb_scale, sumf_0);
@ -649,7 +782,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
vst1q_f32(s + base + 4, acc_f32[1]);
} // for x
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON)
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemv_q4_K_8x8_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
@ -2069,6 +2202,206 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
ggml_gemm_iq4_nl_4x4_q8_0_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
constexpr int qk = QK_K;
const int nb = n / qk;
constexpr int ncols_interleaved = 8;
constexpr int blocklen = 4;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
constexpr int q8_k_blocklen = 4;
constexpr int acc_size = 2 * 4; // 2 row pairs × 4 col pairs
const uint8x16_t m4b = vdupq_n_u8(0x0f);
// 8 accumulators: 2 row pairs × 4 col pairs
float32x4_t acc_f32[acc_size];
for (int y = 0; y < nr / q8_k_blocklen; y++) {
const block_q8_Kx4 * GGML_RESTRICT q8_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * GGML_RESTRICT q4_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int i = 0; i < acc_size; i++) {
acc_f32[i] = vdupq_n_f32(0);
}
for (int b = 0; b < nb; b++) {
// d4 0 1 2 3, 4 5 6 7
float32x4_t q4_d_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d));
float32x4_t q4_d_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].d + 4));
// d8 0 1 2 3
float32x4_t q8_d_0123 = vld1q_f32(q8_ptr[b].d);
// mins
float32x4_t q4_dmin_0123 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin));
float32x4_t q4_dmin_4567 = vcvt_f32_f16(vld1_f16((const __fp16 *) q4_ptr[b].dmin + 4));
// Precomputation of scales and mins
float32x4_t sbd_scale_0123[q8_k_blocklen];
float32x4_t sbd_scale_4567[q8_k_blocklen];
float32x4_t sbd_min_0123[q8_k_blocklen];
float32x4_t sbd_min_4567[q8_k_blocklen];
sbd_scale_0123[0] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 0);
sbd_scale_4567[0] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 0);
sbd_min_0123[0] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 0);
sbd_min_4567[0] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 0);
sbd_scale_0123[1] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 1);
sbd_scale_4567[1] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 1);
sbd_min_0123[1] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 1);
sbd_min_4567[1] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 1);
sbd_scale_0123[2] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 2);
sbd_scale_4567[2] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 2);
sbd_min_0123[2] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 2);
sbd_min_4567[2] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 2);
sbd_scale_0123[3] = vmulq_laneq_f32(q4_d_0123, q8_d_0123, 3);
sbd_scale_4567[3] = vmulq_laneq_f32(q4_d_4567, q8_d_0123, 3);
sbd_min_0123[3] = vmulq_laneq_f32(q4_dmin_0123, q8_d_0123, 3);
sbd_min_4567[3] = vmulq_laneq_f32(q4_dmin_4567, q8_d_0123, 3);
// Precomputation of bsums, each vpaddq calcs all the bsums for each row
const int16x8_t bsums[q8_k_blocklen] = {
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 2), vld1q_s16(q8_ptr[b].bsums + 16 * 2 + 8)),
vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 3), vld1q_s16(q8_ptr[b].bsums + 16 * 3 + 8)),
};
int16_t bsums_arr[QK_K / 64][8];
for (int q8_row = 0; q8_row < 4; q8_row++) {
vst1q_s16(bsums_arr[q8_row], bsums[q8_row]);
}
// interleaved bias_acc: [0]->r0 0123, [1]->r1 0123, .., [4]->r0 4567, [5]->r1 4567 ..
int32x4_t bias_acc[acc_size];
for (int i = 0; i < acc_size; i++) {
bias_acc[i] = vdupq_n_s32(0);
}
for (int sb = 0; sb < QK_K / 64; sb++) {
// Int accumulators for qs vecdot (4 row x 2 col quartets)
int32x4_t acc_lo[acc_size];
int32x4_t acc_hi[acc_size];
for (int i = 0; i < acc_size; i++) {
acc_lo[i] = vdupq_n_s32(0);
acc_hi[i] = vdupq_n_s32(0);
}
// Need scales for the low and high nibbles
// 2 * 12 = 24 bytes per subblock, 4 sbs -> 4 * 24 = 96 bytes total
int16x8_t q4sb_scales[2];
int16x8_t q4sb_mins[2];
for (int i = 0; i < 2; i++) {
int8_t aux_q4sb[8];
const int offset = sb * 24 + i * 12;
decode_q4_Kx8_scales_mins(&q4_ptr[b].scales[offset], &q4sb_mins[i], aux_q4sb);
q4sb_scales[i] = vmovl_s8(vld1_s8(aux_q4sb));
}
constexpr int reads_per_sb = 8; // 8 * 16 bytes each => 32 qs * 4 rows
for (int k = 0; k < reads_per_sb; k++) {
const int8x16_t q8_blk0 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k);
const int8x16_t q8_blk1 = vld1q_s8(q8_ptr[b].qs + sb * 256 + 16 * k + 128);
// 0..3 & 32..35
const uint8x16_t q4_0123 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k);
const uint8x16_t q4_4567 = vld1q_u8(q4_ptr[b].qs + sb * QK_K + 32 * k + 16);
const int8x16_t q4_0123_lo = vreinterpretq_s8_u8(vandq_u8(q4_0123, m4b));
const int8x16_t q4_0123_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_0123, 4));
acc_lo[0] = vdotq_laneq_s32(acc_lo[0], q4_0123_lo, q8_blk0, 0); // 0..3 r0 c0123
acc_lo[1] = vdotq_laneq_s32(acc_lo[1], q4_0123_lo, q8_blk0, 1); // 0..3 r1 c0123
acc_lo[2] = vdotq_laneq_s32(acc_lo[2], q4_0123_lo, q8_blk0, 2); // 0..3 r2 c0123
acc_lo[3] = vdotq_laneq_s32(acc_lo[3], q4_0123_lo, q8_blk0, 3); // 0..3 r3 c0123
acc_hi[0] = vdotq_laneq_s32(acc_hi[0], q4_0123_hi, q8_blk1, 0); // 32..35 r0 c0123
acc_hi[1] = vdotq_laneq_s32(acc_hi[1], q4_0123_hi, q8_blk1, 1); // 32..35 r1 c0123
acc_hi[2] = vdotq_laneq_s32(acc_hi[2], q4_0123_hi, q8_blk1, 2); // 32..35 r2 c0123
acc_hi[3] = vdotq_laneq_s32(acc_hi[3], q4_0123_hi, q8_blk1, 3); // 32..35 r3 c0123
const int8x16_t q4_4567_lo = vreinterpretq_s8_u8(vandq_u8(q4_4567, m4b));
const int8x16_t q4_4567_hi = vreinterpretq_s8_u8(vshrq_n_u8(q4_4567, 4));
acc_lo[4] = vdotq_laneq_s32(acc_lo[4], q4_4567_lo, q8_blk0, 0); // 0..3 r0 c4567
acc_lo[5] = vdotq_laneq_s32(acc_lo[5], q4_4567_lo, q8_blk0, 1); // 0..3 r1 c4567
acc_lo[6] = vdotq_laneq_s32(acc_lo[6], q4_4567_lo, q8_blk0, 2); // 0..3 r2 c4567
acc_lo[7] = vdotq_laneq_s32(acc_lo[7], q4_4567_lo, q8_blk0, 3); // 0..3 r3 c4567
acc_hi[4] = vdotq_laneq_s32(acc_hi[4], q4_4567_hi, q8_blk1, 0); // 32..35 r0 c4567
acc_hi[5] = vdotq_laneq_s32(acc_hi[5], q4_4567_hi, q8_blk1, 1); // 32..35 r1 c4567
acc_hi[6] = vdotq_laneq_s32(acc_hi[6], q4_4567_hi, q8_blk1, 2); // 32..35 r2 c4567
acc_hi[7] = vdotq_laneq_s32(acc_hi[7], q4_4567_hi, q8_blk1, 3); // 32..35 r3 c4567
}
// Scale and bias application
// acc is stored interleaved to match output layout
const int16x4_t sc_0123_lo = vget_low_s16(q4sb_scales[0]);
const int16x4_t sc_4567_lo = vget_high_s16(q4sb_scales[0]);
const int16x4_t sc_0123_hi = vget_low_s16(q4sb_scales[1]);
const int16x4_t sc_4567_hi = vget_high_s16(q4sb_scales[1]);
for (int row = 0; row < q8_k_blocklen; row++) {
// Bias correction
// row c0123 blk0 and blk1
const float32x4_t sumf_0123 =
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_0123_lo), acc_lo[row]),
vmulq_s32(vmovl_s16(sc_0123_hi), acc_hi[row])));
acc_f32[2 * row] = vfmaq_f32(acc_f32[2 * row], sbd_scale_0123[row], sumf_0123);
// row c4567 blk0 and blk1
const float32x4_t sumf_4567 =
vcvtq_f32_s32(vaddq_s32(vmulq_s32(vmovl_s16(sc_4567_lo), acc_lo[row + 4]),
vmulq_s32(vmovl_s16(sc_4567_hi), acc_hi[row + 4])));
acc_f32[2 * row + 1] = vfmaq_f32(acc_f32[2 * row + 1], sbd_scale_4567[row], sumf_4567);
// Bias
const int16x4_t bsums_vec_lo = vdup_n_s16(bsums_arr[sb][row * 2]);
const int16x4_t bsums_vec_hi = vdup_n_s16(bsums_arr[sb][row * 2 + 1]);
// row c0123 blk0 and blk1
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_lo, vget_low_s16(q4sb_mins[0]));
bias_acc[2 * row] = vmlal_s16(bias_acc[2 * row], bsums_vec_hi, vget_low_s16(q4sb_mins[1]));
// row c4567 blk0 and blk1
bias_acc[2 * row + 1] =
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_lo, vget_high_s16(q4sb_mins[0]));
bias_acc[2 * row + 1] =
vmlal_s16(bias_acc[2 * row + 1], bsums_vec_hi, vget_high_s16(q4sb_mins[1]));
}
} // for sb
for (int row = 0; row < q8_k_blocklen; row++) {
acc_f32[2 * row] = vmlsq_f32(acc_f32[2 * row], vcvtq_f32_s32(bias_acc[2 * row]), sbd_min_0123[row]);
acc_f32[2 * row + 1] =
vmlsq_f32(acc_f32[2 * row + 1], vcvtq_f32_s32(bias_acc[2 * row + 1]), sbd_min_4567[row]);
}
} // for b
for (int i = 0; i < q8_k_blocklen; i++) {
int row = y * q8_k_blocklen + i;
for (int j = 0; j < 2; j++) {
int col = x * ncols_interleaved + j * 4;
int offset = row * bs + col;
vst1q_f32(s + offset, acc_f32[2 * i + j]);
}
}
} // for x
} // for y
return;
#endif // defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
ggml_gemm_q4_K_8x4_q8_K_generic(n, s, bs, vx, vy, nr, nc);
}
void ggml_gemm_q4_K_8x8_q8_K(int n,
float * GGML_RESTRICT s,
size_t bs,

View File

@ -1,20 +1,23 @@
#include "ggml-backend-impl.h"
#if defined(__riscv) && __riscv_xlen == 64
#include <sys/auxv.h>
//https://github.com/torvalds/linux/blob/master/arch/riscv/include/uapi/asm/hwcap.h#L24
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
#include <asm/hwprobe.h>
#include <asm/unistd.h>
#include <unistd.h>
struct riscv64_features {
bool has_rvv = false;
riscv64_features() {
uint32_t hwcap = getauxval(AT_HWCAP);
struct riscv_hwprobe probe;
probe.key = RISCV_HWPROBE_KEY_IMA_EXT_0;
probe.value = 0;
has_rvv = !!(hwcap & COMPAT_HWCAP_ISA_V);
int ret = syscall(__NR_riscv_hwprobe, &probe, 1, 0, NULL, 0);
if (0 == ret) {
has_rvv = !!(probe.value & RISCV_HWPROBE_IMA_V);
}
}
};

View File

@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
}
#if defined(__ARM_ARCH)
#if defined(__linux__) && defined(__aarch64__)
#include <sys/auxv.h>
#endif
static void ggml_init_arm_arch_features(void) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
#if defined(__linux__)
ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
#else
// TODO: add support of SVE for non-linux systems
#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
#endif
#endif
#include <arm_sve.h>
static void ggml_init_arm_arch_features(void) {
ggml_arm_arch_features.sve_cnt = svcntb();
}
#else
static void ggml_init_arm_arch_features(void) {}
#endif
#endif // __ARM_ARCH
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
@ -2706,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
}
#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
// Emscripten without pthreads support can only use a single thread
n_threads = 1;
#endif
size_t work_size = 0;
struct ggml_cplan cplan;

View File

@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
const int64_t iih = ioh*s1 + ikh*d1 - p1;
const int64_t iid = iod*s2 + ikd*d2 - p2;
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
} else {
const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
@ -7420,6 +7420,65 @@ static void ggml_compute_forward_upscale_f32(
}
}
}
} else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
auto triangle_filter = [](float x) -> float {
return std::max(1.0f - fabsf(x), 0.0f);
};
// support and invscale, minimum 1 pixel for bilinear
const float support1 = std::max(1.0f, 1.0f / sf1);
const float invscale1 = 1.0f / support1;
const float support0 = std::max(1.0f, 1.0f / sf0);
const float invscale0 = 1.0f / support0;
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
const int64_t i02 = i2 / sf2;
for (int64_t i1 = 0; i1 < ne1; i1++) {
const float y = ((float) i1 + pixel_offset) / sf1;
for (int64_t i0 = 0; i0 < ne0; i0++) {
const float x = ((float) i0 + pixel_offset) / sf0;
// the range of source pixels that contribute
const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
// bilinear filter with antialiasing
float val = 0.0f;
float total_weight = 0.0f;
for (int64_t sy = y_min; sy < y_max; sy++) {
const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
for (int64_t sx = x_min; sx < x_max; sx++) {
const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
const float weight = weight_x * weight_y;
if (weight <= 0.0f) {
continue;
}
const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
val += pixel * weight;
total_weight += weight;
}
}
if (total_weight > 0.0f) {
val /= total_weight;
}
float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
*dst_ptr = val;
}
}
}
}
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
for (int64_t i3 = 0; i3 < ne3; i3++) {
const int64_t i03 = i3 / sf3;
@ -9766,7 +9825,8 @@ static void ggml_compute_forward_solve_tri_f32(const struct ggml_compute_params
}
const float diag = A_batch[i00 * n + i00];
GGML_ASSERT(diag != 0.0f && "Zero diagonal in triangular matrix");
assert(diag != 0.0f && "Zero diagonal in triangular matrix");
X_batch[i00 * k + i01] = (B_batch[i00 * k + i01] - sum) / diag;
}
}

View File

@ -124,6 +124,58 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG
}
}
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK_K == 256);
assert(k % QK_K == 0);
const int nb = k / QK_K;
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
// scalar
const int blck_size_interleave = 4;
float srcv[4][QK_K];
float iscale[4];
for (int i = 0; i < nb; i++) {
for (int row_iter = 0; row_iter < 4; row_iter++) {
float amax = 0.0f; // absolute max
float max = 0;
for (int j = 0; j < QK_K; j++) {
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
// Update the maximum value of the corresponding super block
if(amax < fabsf(srcv[row_iter][j])) {
amax = fabsf(srcv[row_iter][j]);
max = srcv[row_iter][j];
}
}
iscale[row_iter] = amax ? -127.f/max : 0;
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
}
for (int j = 0; j < QK_K / 4; j++) {
y[i].bsums[j] = 0;
}
// Quants values are interleaved in sequence of four bytes from corresponding super blocks
// Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
// i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
for (int j = 0; j < QK_K * 4; j++) {
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
src_offset += (j % blck_size_interleave);
int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
float x0 = srcv[src_id][src_offset] * iscale[src_id];
y[i].qs[j] = nearest_int(x0);
y[i].bsums[index] += y[i].qs[j];
}
}
}
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK_K == 256);
assert(k % QK_K == 0);
@ -192,6 +244,12 @@ template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTR
ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
}
template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
assert(nrow == 4);
UNUSED(nrow);
ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
}
template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
assert(nrow == 4);
UNUSED(nrow);
@ -333,6 +391,77 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 4;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(bs);
UNUSED(nr);
float sumf[8];
float sum_minf[8];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
const block_q8_K * a_ptr = (const block_q8_K *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) {
sumf[j] = 0.0;
sum_minf[j] = 0.0;
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for (int j = 0; j < ncols_interleaved; j++) {
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) {
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
}
}
}
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
@ -727,6 +856,89 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
}
}
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
const int ncols_interleaved = 8;
const int blocklen = 4;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
assert (n % qk == 0);
assert (nr % 4 == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
float sumf[4][8];
float sum_minf[4][8];
uint32_t utmp[32];
int sumi1;
int sumi2;
int sumi;
for (int y = 0; y < nr / 4; y++) {
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumf[m][j] = 0.0;
sum_minf[m][j] = 0.0;
}
}
for (int l = 0; l < nb; l++) {
for (int sb = 0; sb < 8; sb++) {
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
utmp[sb * 4 + 2] = uaux_0;
utmp[sb * 4 + 0] &= kmask1;
}
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi1 = 0;
sumi2 = 0;
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
sumi1 = sumi1 * scales_0[j];
sumi2 = sumi2 * scales_1[j];
sumi += sumi1 + sumi2;
}
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
}
}
}
for (int sb = 0; sb < 8; sb++) {
uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
for(int m = 0; m < 4; m++) {
const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
for(int j = 0; j < ncols_interleaved; j++) {
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
}
}
}
}
for (int m = 0; m < 4; m++) {
for (int j = 0; j < ncols_interleaved; j++) {
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
}
}
}
}
}
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
const int qk = QK_K;
const int nb = n / qk;
@ -1228,9 +1440,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
GGML_UNUSED(data_size);
}
static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
GGML_ASSERT(interleave_block == 8);
GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
constexpr int nrows_interleaved = 8;
block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
@ -1468,6 +1681,10 @@ template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * da
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}
template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
}
template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}
@ -1501,6 +1718,10 @@ template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}
@ -1529,6 +1750,10 @@ template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
}
template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
@ -1731,12 +1956,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
}
if (nth == 1 || nchunk0 < nth || disable_chunking) {
int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
// Only increase nchunk0 to nth if it won't make chunks too small
if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
nchunk0 = nth;
dr0 = (nr0 + nchunk0 - 1) / nchunk0;
}
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
// Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
// This prevents creating too many tiny chunks that could overlap after alignment
const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
@ -1930,6 +2156,9 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
// instance for Q4_K
static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
// instance for Q2
@ -1966,6 +2195,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
return &q4_K_8x8_q8_K;
}
}
if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
if (cur->ne[1] % 8 == 0) {
return &q4_K_8x4_q8_K;
}
}
} else if (cur->type == GGML_TYPE_Q2_K) {
if (ggml_cpu_has_avx512()) {
if (cur->ne[1] % 8 == 0) {

View File

@ -80,10 +80,12 @@ extern "C" {
void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@ -91,6 +93,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@ -99,10 +102,12 @@ void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
// Native implementations
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
@ -110,6 +115,7 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

View File

@ -397,119 +397,118 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
}
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = svcntb() * 8;
const int ggml_f16_epr = sve_register_length / 16;
const int ggml_f16_step = 8 * ggml_f16_epr;
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
const int sve_register_length = svcntb() * 8;
const int ggml_f16_epr = sve_register_length / 16;
const int ggml_f16_step = 8 * ggml_f16_epr;
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
const int np= (n & ~(ggml_f16_step - 1));
int np = (n & ~(ggml_f16_step - 1));
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
for (int i = 0; i < np; i += ggml_f16_step) {
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
for (int i = 0; i < np; i += ggml_f16_step) {
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
}
const int np2 = (n & ~(ggml_f16_epr - 1));
for (int k = np; k < np2; k += ggml_f16_epr) {
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
GGML_F16x_VEC_STORE(y + k, ry, 0);
}
if (np2 < n) {
svbool_t pg = svwhilelt_b16(np2, n);
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
hy = svmad_f16_x(pg, hx, vx, hy);
svst1_f16(pg, (__fp16 *)(y + np2), hy);
}
np = n;
#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
const int np = n;
_Float16 hv = (_Float16)v;
for (int i = 0, avl; i < n; i += avl) {
avl = __riscv_vsetvl_e16m8(n - i);
vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
__riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
}
#elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
GGML_F16_VEC ax[GGML_F16_ARR];
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
}
const int np2 = (n & ~(ggml_f16_epr - 1));
for (int k = np; k < np2; k += ggml_f16_epr) {
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
GGML_F16x_VEC_STORE(y + k, ry, 0);
}
if (np2 < n) {
svbool_t pg = svwhilelt_b16(np2, n);
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
hy = svmad_f16_x(pg, hx, vx, hy);
svst1_f16(pg, (__fp16 *)(y + np2), hy);
}
#elif defined(__riscv_v_intrinsic)
// todo: RVV impl
// scalar
for (int i = 0; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#else
const int np = (n & ~(GGML_F16_STEP - 1));
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
GGML_F16_VEC ax[GGML_F16_ARR];
GGML_F16_VEC ay[GGML_F16_ARR];
for (int i = 0; i < np; i += GGML_F16_STEP) {
for (int j = 0; j < GGML_F16_ARR; j++) {
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#endif
}
#else
// scalar
for (int i = 0; i < n; ++i) {
const int np = 0;
#endif
// leftovers
for (int i = np; i < n; ++i) {
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
}
#endif
}
// xs and vs are byte strides of x and v

View File

@ -44,7 +44,7 @@ static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const dim3 offset_grid((nrows + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream);
CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
size_t temp_storage_bytes = 0;

View File

@ -21,10 +21,12 @@
#include "ggml-common.h"
#include <array>
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>
#if defined(GGML_USE_HIP)
@ -84,12 +86,12 @@
#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD
#define GGML_CUDA_CC_PH1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // MTT S5000
#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
#define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_PH1)
#define GGML_CUDA_CC_IS_PH1(cc) (cc >= GGML_CUDA_CC_PH1)
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
# define GGML_CUDA_USE_CUB
@ -212,9 +214,9 @@ static const char * cu_get_error_str(CUresult err) {
#define GGML_USE_VMM
#endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
#if defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#endif // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
@ -250,12 +252,14 @@ static const char * cu_get_error_str(CUresult err) {
#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220)
static bool fp16_available(const int cc) {
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
}
static bool fast_fp16_available(const int cc) {
return GGML_CUDA_CC_IS_AMD(cc) ||
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610);
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && ggml_cuda_highest_compiled_arch(cc) != 610) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && fp16_available(cc));
}
// To be used for feature selection of external libraries, e.g. cuBLAS.
@ -272,7 +276,9 @@ static bool fp16_mma_hardware_available(const int cc) {
}
static bool bf16_mma_hardware_available(const int cc) {
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) ||
GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3 ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_PH1);
}
static bool fp32_mma_hardware_available(const int cc) {
@ -558,8 +564,12 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
acc += v.y*u.y;
}
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
#define V_DOT2_F32_F16_AVAILABLE
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#ifdef V_DOT2_F32_F16_AVAILABLE
asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
#else
#ifdef FAST_FP16_AVAILABLE
@ -571,7 +581,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
acc += tmpv.x * tmpu.x;
acc += tmpv.y * tmpu.y;
#endif // FAST_FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
#endif // V_DOT2_F32_F16_AVAILABLE
}
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
@ -972,6 +982,157 @@ struct ggml_cuda_graph {
#endif
};
struct ggml_cuda_concurrent_event {
std::vector<cudaEvent_t> join_events;
cudaEvent_t fork_event = nullptr;
int n_streams = 0;
std::unordered_map<const ggml_tensor *, int> stream_mapping;
// Original order of nodes in this concurrent region (before interleaving)
// Used to restore grouping for fusion within streams
std::vector<const ggml_tensor *> original_order;
const ggml_tensor * join_node;
ggml_cuda_concurrent_event() = default;
ggml_cuda_concurrent_event(const ggml_cuda_concurrent_event &) = delete;
ggml_cuda_concurrent_event & operator=(const ggml_cuda_concurrent_event &) = delete;
explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) {
join_events.resize(n_streams);
for (size_t i = 0; i < join_events.size(); ++i) {
CUDA_CHECK(cudaEventCreateWithFlags(&join_events[i], cudaEventDisableTiming));
}
CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
}
ggml_cuda_concurrent_event(ggml_cuda_concurrent_event && other) noexcept
: join_events(std::move(other.join_events))
, fork_event(other.fork_event)
, n_streams(other.n_streams)
, stream_mapping(std::move(other.stream_mapping))
, original_order(std::move(other.original_order))
, join_node(other.join_node) {
other.fork_event = nullptr;
}
// 1. check if any branches write to overlapping memory ranges (except the join node)
// 2. check whether all srcs are either within the branch or outside the nodes covered by ggml_cuda_concurrent_event
// we assume all nodes have the same buffer
bool is_valid() const {
std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;
write_ranges.resize(n_streams);
// get join_node's memory range to exclude from overlap checking.
// multiple nodes can use join_node's buffer; we synchronize on the join node.
const ggml_tensor * join_t = join_node->view_src ? join_node->view_src : join_node;
const int64_t join_start = (int64_t) join_t->data;
const int64_t join_end = join_start + ggml_nbytes(join_t);
for (const auto & [tensor, stream] : stream_mapping) {
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
// skip tensors that overlap with join_node's buffer.
if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) {
continue;
}
// concurrent streams begin from 1
write_ranges[stream - 1].emplace_back(t_start, t_end);
}
for (int i = 0; i < n_streams; ++i) {
// sorts first by start then by end of write range
std::sort(write_ranges[i].begin(), write_ranges[i].end());
}
bool writes_overlap = false;
bool dependent_srcs = false;
for (const auto & [tensor, stream] : stream_mapping) {
const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
const int64_t t_start = (int64_t) t->data;
const int64_t t_end = t_start + ggml_nbytes(t);
// skip tensors that overlap with join_node's buffer
if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) {
continue;
}
// check if this buffer's write data overlaps with another stream's
std::pair<int64_t, int64_t> data_range = std::make_pair(t_start, t_end);
for (int i = 0; i < n_streams; ++i) {
if (i == stream - 1) {
continue;
}
auto it = std::lower_bound(write_ranges[i].begin(), write_ranges[i].end(), data_range);
if (it != write_ranges[i].end()) {
const std::pair<int64_t, int64_t> & other = *it;
// std::lower_bound returns the first element where other >= data_range (lexicographically).
// This guarantees other.first >= data_range.first.
// Therefore, overlap occurs iff other.first < data_range.second
// (i.e., the other range starts before this range ends).
if (other.first < data_range.second) {
GGML_LOG_DEBUG("Writes overlap for %s", tensor->name);
writes_overlap = true;
break;
}
}
}
//check if all srcs are either in branch or don't have a branch
for (int i = 0; i < GGML_MAX_SRC; ++i) {
if (!tensor->src[i]) {
continue;
}
auto it = stream_mapping.find(tensor->src[i]);
if (it == stream_mapping.end()) {
continue;
}
if (it->second != stream) {
dependent_srcs = true;
break;
}
}
if (dependent_srcs || writes_overlap) {
break;
}
}
return !writes_overlap && !dependent_srcs;
}
~ggml_cuda_concurrent_event() {
if (fork_event != nullptr) {
CUDA_CHECK(cudaEventDestroy(fork_event));
}
for (cudaEvent_t e : join_events) {
if (e != nullptr) {
CUDA_CHECK(cudaEventDestroy(e));
}
}
}
};
struct ggml_cuda_stream_context {
std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> concurrent_events;
void reset() {
concurrent_events.clear();
}
};
struct ggml_backend_cuda_context {
int device;
std::string name;
@ -982,11 +1143,15 @@ struct ggml_backend_cuda_context {
std::unique_ptr<ggml_cuda_graph> cuda_graph;
int curr_stream_no = 0;
explicit ggml_backend_cuda_context(int device) :
device(device),
name(GGML_CUDA_NAME + std::to_string(device)) {
}
ggml_cuda_stream_context concurrent_stream_context;
~ggml_backend_cuda_context();
cudaStream_t stream(int device, int stream) {
@ -997,9 +1162,9 @@ struct ggml_backend_cuda_context {
return streams[device][stream];
}
cudaStream_t stream() {
return stream(device, 0);
}
cudaStream_t stream() { return stream(device, curr_stream_no); }
ggml_cuda_stream_context & stream_context() { return concurrent_stream_context; }
cublasHandle_t cublas_handle(int device) {
if (cublas_handles[device] == nullptr) {
@ -1015,15 +1180,15 @@ struct ggml_backend_cuda_context {
}
// pool
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
ggml_cuda_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(device);
if (pools[device][curr_stream_no] == nullptr) {
pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
}
return *pools[device];
return *pools[device][curr_stream_no];
}
ggml_cuda_pool & pool() {

View File

@ -86,6 +86,9 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
}
}
}
GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
nb12, nb13);
}
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
@ -202,7 +205,7 @@ static void ggml_cpy_scalar_cuda(
ne00n = ne00;
ne01n = ne01;
ne02n = ne02;
} else if (nb00 > nb02) {
} else {
ne00n = ne00;
ne01n = ne01*ne02;
ne02n = 1;

View File

@ -25,7 +25,7 @@ typedef void (* fattn_kernel_t)(
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -55,11 +55,11 @@ static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
#pragma unroll
for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
ggml_cuda_mad(sum, tmp[k_KQ_1] , ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#else
ggml_cuda_mad(sum, __half22float2(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#endif // FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
}
}
@ -621,7 +621,8 @@ static __global__ void flash_attn_mask_to_KV_max(
template<int D, int ncols1, int ncols2> // D == head size
__launch_bounds__(D, 1)
static __global__ void flash_attn_stream_k_fixup(
float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) {
float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11,
const int nbatch_fa) {
constexpr int ncols = ncols1*ncols2;
const int bidx0 = blockIdx.x;
@ -632,8 +633,8 @@ static __global__ void flash_attn_stream_k_fixup(
const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols);
const int iter_k = ne11 / FATTN_KQ_STRIDE;
const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;
const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
@ -765,7 +766,7 @@ static __global__ void flash_attn_combine_results(
template <int DV, int ncols1, int ncols2>
void launch_fattn(
ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
const int nbatch_fa, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
) {
constexpr int ncols = ncols1 * ncols2;
@ -790,8 +791,6 @@ void launch_fattn(
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t main_stream = ctx.stream();
@ -915,7 +914,7 @@ void launch_fattn(
dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
} else {
const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
// parallel_blocks must not be larger than what the tensor size allows:
parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
@ -970,6 +969,9 @@ void launch_fattn(
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// TODO other tensor dimensions after removal of WMMA kernel:
const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
GGML_ASSERT(block_dim.x % warp_size == 0);
fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
(const char *) Q->data,
@ -980,7 +982,7 @@ void launch_fattn(
KV_max.ptr,
!stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
Q->ne[0], ne01, Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
nb21, nb22, nb23,
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
@ -995,7 +997,7 @@ void launch_fattn(
flash_attn_stream_k_fixup<DV, ncols1, ncols2>
<<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]);
((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], nbatch_fa);
}
} else if (parallel_blocks > 1) {
const dim3 block_dim_combine(DV, 1, 1);

File diff suppressed because it is too large Load Diff

View File

@ -501,6 +501,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
const half2 * const __restrict__ K_h2,
const half2 * const __restrict__ V_h2,
const half * const __restrict__ mask,
const uint3 ne01,
const float logit_softcap,
const float slope,
T_KQ * const KQ,
@ -512,7 +513,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
float * const KQ_sum,
T_acc * const VKQ,
const int k_VKQ_0,
const int k_VKQ_max) {
const int k_VKQ_max,
const int col_Q_0) {
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4;
@ -556,7 +558,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
// Apply logit softcap + mask, update KQ_max:
#pragma unroll
for (int jc0 = 0; jc0 < cpw; ++jc0) {
const int j = (jc0 + (threadIdx.y / np)*cpw)/ncols2;
const int j = fastmodulo(col_Q_0 + (jc0 + (threadIdx.y / np)*cpw)/ncols2, ne01);
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
@ -609,7 +611,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
float KQ_sum_add = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < nbatch_fa; i0 += np*warp_size) {
const float val = !oob_check || i0 + (threadIdx.y % np)*warp_size + threadIdx.x < k_VKQ_sup ?
const float val = !oob_check || i0 + (threadIdx.y % np)*warp_size + threadIdx.x < static_cast<uint32_t>(k_VKQ_sup) ?
expf(KQ_acc[(i0/(np*warp_size))*cpw + jc] - KQ_max[jc]) : 0.0f;
KQ_sum_add += val;
tmp[i0/(np*warp_size)][jc1] = val;
@ -736,7 +738,7 @@ static __global__ void flash_attn_tile(
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -781,11 +783,11 @@ static __global__ void flash_attn_tile(
const int sequence = blockIdx.z / (ne02/ncols2);
const int head0 = blockIdx.z*ncols2 - sequence*ne02; // == blockIdx.z % (ne02/ncols2)
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float * Q_f = (const float *) (Q + nb03*sequence + nb02* head0 + nb01*col_Q_0);
const float * Q_f = (const float *) (Q + nb03*sequence + nb02* head0);
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33) + nb31*col_Q_0) : nullptr;
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33)) : nullptr;
const int stride_K2 = nb11 / sizeof(half2);
const int stride_V2 = nb21 / sizeof(half2);
@ -842,11 +844,9 @@ static __global__ void flash_attn_tile(
for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
float tmp_f[cpy_ne_D] = {0.0f};
if (ncols1 == 1 || col_Q_0 + j < ne01) {
ggml_cuda_memcpy_1<sizeof(tmp_f)>
(tmp_f, &Q_f[c*(nb02/sizeof(float)) + j*(nb01/sizeof(float))
+ i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
}
ggml_cuda_memcpy_1<sizeof(tmp_f)>
(tmp_f, &Q_f[c*(nb02/sizeof(float)) + fastmodulo(col_Q_0 + j, ne01)*(nb01/sizeof(float))
+ i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
#pragma unroll
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
@ -881,23 +881,23 @@ static __global__ void flash_attn_tile(
while (k_VKQ_0 < k_VKQ_max - nbatch_fa) {
constexpr bool oob_check = false;
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
(Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
k_VKQ_0 += gridDim.y*nbatch_fa;
}
if (k_VKQ_0 < k_VKQ_max) {
constexpr bool oob_check = true;
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
(Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
}
} else {
// Branch without out-of-bounds checks.
for (int k_VKQ_0 = blockIdx.y*nbatch_fa; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nbatch_fa) {
constexpr bool oob_check = false;
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
(Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
}
}
@ -1010,13 +1010,13 @@ static __global__ void flash_attn_tile(
const int j = jc / ncols2;
const int c = jc % ncols2;
if (ncols1 > 1 && col_Q_0 + j >= ne01) {
if (ncols1 > 1 && col_Q_0 + j >= int(ne01.z)) {
return;
}
const float scale = gridDim.y == 1 ? 1.0f/KQ_sum[jc0] : 1.0f;
const int j_dst_unrolled = ((sequence*ne01 + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;
const int j_dst_unrolled = ((sequence*int(ne01.z) + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;
#ifdef FAST_FP16_AVAILABLE
constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;

View File

@ -33,7 +33,7 @@ static __global__ void flash_attn_ext_vec(
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
@ -86,11 +86,11 @@ static __global__ void flash_attn_ext_vec(
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
#else
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, float, V_rows_per_thread>();
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
@ -112,13 +112,13 @@ static __global__ void flash_attn_ext_vec(
constexpr int ne_KQ = ncols*D;
constexpr int ne_combine = nwarps*V_cols_per_iter*D;
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
half2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ half KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#else
float2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
float KQ_max[ncols];
float KQ_sum[ncols];
@ -129,11 +129,11 @@ static __global__ void flash_attn_ext_vec(
}
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
half2 Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
#else
float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
if constexpr (Q_q8_1) {
@ -150,12 +150,12 @@ static __global__ void flash_attn_ext_vec(
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 1 && ic0 + j >= ne01) {
if (ncols > 1 && ic0 + j >= int(ne01.z)) {
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE <= D/sizeof(int) || i < D/sizeof(int)) {
if (i0 + WARP_SIZE <= int(D/sizeof(int)) || i < int(D/sizeof(int))) {
tmp_q_i32[i] = 0;
}
}
@ -191,7 +191,7 @@ static __global__ void flash_attn_ext_vec(
__syncthreads();
} else {
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
const half2 scale_h2 = make_half2(scale, scale);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
@ -201,7 +201,7 @@ static __global__ void flash_attn_ext_vec(
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
if (ncols == 1 || ic0 + j < ne01) {
if (ncols == 1 || ic0 + j < int(ne01.z)) {
ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
}
@ -222,7 +222,7 @@ static __global__ void flash_attn_ext_vec(
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
if (ncols == 1 || ic0 + j < ne01) {
if (ncols == 1 || ic0 + j < int(ne01.z)) {
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ], &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
}
@ -233,7 +233,7 @@ static __global__ void flash_attn_ext_vec(
Q_reg[j][k].y *= scale;
}
}
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
}
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
@ -266,13 +266,13 @@ static __global__ void flash_attn_ext_vec(
sum = logit_softcap*tanhf(sum);
}
if (mask) {
if (mask && (ncols == 1 || ic0 + j < int(ne01.z))) {
sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
}
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == i_KQ_0) {
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == uint32_t(i_KQ_0)) {
KQ_reg[j] = sum;
}
}
@ -291,7 +291,7 @@ static __global__ void flash_attn_ext_vec(
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
KQ[j*nthreads + tid] = KQ_reg[j];
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
@ -303,7 +303,7 @@ static __global__ void flash_attn_ext_vec(
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
}
#ifndef GGML_USE_HIP
@ -314,7 +314,7 @@ static __global__ void flash_attn_ext_vec(
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
half2 KQ_k[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
@ -353,7 +353,7 @@ static __global__ void flash_attn_ext_vec(
}
}
}
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
}
}
@ -374,7 +374,7 @@ static __global__ void flash_attn_ext_vec(
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
@ -386,7 +386,7 @@ static __global__ void flash_attn_ext_vec(
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
}
}
@ -412,7 +412,7 @@ static __global__ void flash_attn_ext_vec(
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 1 && ic0 + j_VKQ >= ne01) {
if (ncols > 1 && ic0 + j_VKQ >= int(ne01.z)) {
break;
}
@ -421,7 +421,7 @@ static __global__ void flash_attn_ext_vec(
const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
KQ_max[j_VKQ] = kqmax_new;
#ifdef FAST_FP16_AVAILABLE
#ifdef V_DOT2_F32_F16_AVAILABLE
half2 * VKQ_tmp = (half2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
@ -452,7 +452,7 @@ static __global__ void flash_attn_ext_vec(
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ + V_rows_per_thread/4, &VKQ[j_VKQ][i_VKQ_0/nthreads_V + V_rows_per_thread/4]);
}
#endif // FAST_FP16_AVAILABLE
#endif // V_DOT2_F32_F16_AVAILABLE
KQ_sum[j_VKQ] *= kqmax_scale;
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
@ -479,7 +479,7 @@ static __global__ void flash_attn_ext_vec(
if (gridDim.y == 1) {
dst_val /= KQ_sum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
dst[(((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
}
}
@ -489,8 +489,8 @@ static __global__ void flash_attn_ext_vec(
}
if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < int(ne01.z))) {
dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,

View File

@ -38,14 +38,14 @@ static __global__ void flash_attn_ext_f16(
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
@ -149,7 +149,7 @@ static __global__ void flash_attn_ext_f16(
if (i0 + warp_size > D && i >= D) {
break;
}
KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
KQ[j*D_padded + i] = ic0 + j < int(ne01.z) ? Q_f[j*stride_Q + i] * scale : 0.0f;
}
}
@ -218,7 +218,8 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/warp_size] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_f_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ?
__half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
}
KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);
@ -270,7 +271,7 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/warp_size] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ2_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/warp_size]);
}
KQ_max_new = __half2half2(warp_reduce_max<warp_size>(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@ -431,7 +432,7 @@ static __global__ void flash_attn_ext_f16(
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j_VKQ = j0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
if (ic0 + j_VKQ >= int(ne01.z)) {
return;
}
@ -442,7 +443,7 @@ static __global__ void flash_attn_ext_f16(
KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
}
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
const int j_dst_unrolled = ((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D; i0 += warp_size) {
@ -481,7 +482,7 @@ static __global__ void flash_attn_ext_f16(
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
}
constexpr int get_max_power_of_2(int x) {

View File

@ -2,9 +2,9 @@
#include "common.cuh"
#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#if defined(GGML_USE_MUSA)
#define GGML_USE_WMMA_FATTN
#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
#endif // defined(GGML_USE_MUSA)
#if defined(GGML_HIP_ROCWMMA_FATTN)
#if defined(CDNA) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)

View File

@ -12,13 +12,13 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
const ggml_tensor * Q = dst->src[0];
if constexpr (ncols2 <= 8) {
if (Q->ne[1] <= 8/ncols2) {
if (turing_mma_available(cc) && Q->ne[1] <= 8/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
return;
}
}
if (Q->ne[1] <= 16/ncols2) {
if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
return;
}
@ -41,7 +41,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
const bool use_gqa_opt = mask && max_bias == 0.0f;
const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];
@ -275,8 +275,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// If Turing tensor cores available, use them:
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
// If Turing tensor cores are available, use them:
if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@ -297,7 +297,21 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_VEC;
}
}
return BEST_FATTN_KERNEL_MMA_F16;
}
if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
int gqa_ratio_eff = 1;
const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
gqa_ratio_eff *= 2;
}
if (can_use_vector_kernel && Q->ne[1] * gqa_ratio_eff <= 2) {
return BEST_FATTN_KERNEL_VEC;
}
if (Q->ne[1] * gqa_ratio_eff <= 16) {
return BEST_FATTN_KERNEL_TILE; // On Volta tensor cores are only faster for sufficiently large matrices.
}
return BEST_FATTN_KERNEL_MMA_F16;
}

View File

@ -53,6 +53,7 @@
#include "ggml-cuda/set.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml-cuda/solve_tri.cuh"
#include "ggml.h"
#include <algorithm>
@ -521,7 +522,8 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
};
#endif // defined(GGML_USE_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device,
[[maybe_unused]] int stream_no) {
#if defined(GGML_USE_VMM)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
@ -2717,6 +2719,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_OPT_STEP_SGD:
ggml_cuda_opt_step_sgd(ctx, dst);
break;
case GGML_OP_SOLVE_TRI:
ggml_cuda_op_solve_tri(ctx, dst);
break;
default:
return false;
}
@ -3046,7 +3051,12 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
std::initializer_list<enum ggml_op> topk_moe_ops_delayed_softmax =
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);
if (ops.size() == topk_moe_ops_with_norm.size() &&
const auto is_equal = [](const std::initializer_list<enum ggml_op> & list1,
const std::initializer_list<enum ggml_op> & list2) {
return std::equal(list1.begin(), list1.end(), list2.begin(), list2.end());
};
if (is_equal(topk_moe_ops_with_norm, ops) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
@ -3056,8 +3066,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == topk_moe_ops.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
if (is_equal(topk_moe_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
@ -3065,7 +3074,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
if (is_equal(topk_moe_ops_delayed_softmax, ops) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];
@ -3081,9 +3090,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };
if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}))) {
if ((is_equal(mul_mat_bias_glu_ops, ops) || is_equal(mul_mat_id_bias_glu_ops, ops)) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 4 })) {
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
@ -3095,9 +3103,8 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}))) {
if ((is_equal(mul_mat_id_glu_ops, ops) || is_equal(mul_mat_glu_ops, ops)) &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
const ggml_tensor * glu = cgraph->nodes[node_idx + 2];
@ -3107,7 +3114,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}
if (ops.size() == 3 && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
std::initializer_list<enum ggml_op> rope_set_rows_ops = { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS };
if (is_equal(rope_set_rows_ops, ops) && ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2 })) {
const ggml_tensor * rope = cgraph->nodes[node_idx];
const ggml_tensor * view = cgraph->nodes[node_idx + 1];
const ggml_tensor * set_rows = cgraph->nodes[node_idx + 2];
@ -3192,27 +3201,141 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
// flag used to determine whether it is an integrated_gpu
const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
bool is_concurrent_event_active = false;
ggml_cuda_concurrent_event * concurrent_event = nullptr;
bool should_launch_concurrent_events = false;
const auto try_launch_concurrent_event = [&](const ggml_tensor * node) {
if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
concurrent_event = &stream_ctx.concurrent_events[node];
is_concurrent_event_active = true;
GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0
GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
for (int i = 1; i <= concurrent_event->n_streams; ++i) {
cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
}
}
};
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
// With the use of CUDA graphs, the execution will be performed by the graph launch.
if (!use_cuda_graph || cuda_graph_update_required) {
[[maybe_unused]] int prev_i = 0;
if (stream_ctx.concurrent_events.size() > 0) {
should_launch_concurrent_events = true;
for (const auto & [tensor, event] : stream_ctx.concurrent_events) {
should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
}
}
if (should_launch_concurrent_events) {
// Restore original node order within each concurrent region to enable fusion within streams
std::unordered_map<const ggml_tensor *, int> node_to_idx;
node_to_idx.reserve(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; ++i) {
node_to_idx[cgraph->nodes[i]] = i;
}
for (auto & [fork_node, event] : stream_ctx.concurrent_events) {
// Find positions of all nodes from this event in the current graph
std::vector<int> positions;
positions.reserve(event.original_order.size());
bool all_found = true;
for (const ggml_tensor * orig_node : event.original_order) {
auto it = node_to_idx.find(orig_node);
if (it != node_to_idx.end()) {
positions.push_back(it->second);
} else {
all_found = false;
break;
}
}
if (!all_found || positions.size() != event.original_order.size()) {
continue;
}
// Sort positions to get contiguous range
std::vector<int> sorted_positions = positions;
std::sort(sorted_positions.begin(), sorted_positions.end());
bool is_contiguous = true;
for (size_t i = 1; i < sorted_positions.size(); ++i) {
if (sorted_positions[i] != sorted_positions[i-1] + 1) {
is_contiguous = false;
break;
}
}
if (!is_contiguous) {
continue;
}
// Restore original order at the sorted positions
int start_pos = sorted_positions[0];
for (size_t i = 0; i < event.original_order.size(); ++i) {
cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
}
}
}
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (is_concurrent_event_active) {
GGML_ASSERT(concurrent_event);
if (node == concurrent_event->join_node) {
cuda_ctx->curr_stream_no = 0;
for (int i = 1; i <= concurrent_event->n_streams; ++i) {
// Wait on join events of forked streams in the main stream
CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1],
cuda_ctx->stream(cuda_ctx->device, i)));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1]));
}
is_concurrent_event_active = false;
concurrent_event = nullptr;
} else {
GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end());
cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
}
} else if (i - prev_i > 1) {
//the previous node was fused
const ggml_tensor * prev_node = cgraph->nodes[i - 1];
try_launch_concurrent_event(prev_node);
if (is_concurrent_event_active) {
cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
}
}
#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
if (nodes_fused > 0) {
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
}
#endif
prev_i = i;
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
// start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
if (!disable_fusion) {
@ -3505,13 +3628,17 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
#else
GGML_UNUSED(integrated);
#endif // NDEBUG
#endif // NDEBUG
bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
if (!is_concurrent_event_active) {
try_launch_concurrent_event(node);
}
}
}
@ -3651,6 +3778,234 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
}
}
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
static bool enable_graph_optimization = [] {
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
return env != nullptr && atoi(env) == 1;
}();
if (!enable_graph_optimization) {
return;
}
GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
stream_context.reset();
// number of out-degrees for a particular node
std::unordered_map<const ggml_tensor *, int> fan_out;
// reverse mapping of node to index in the cgraph
std::unordered_map<const ggml_tensor *, int> node_indices;
const auto & is_noop = [](const ggml_tensor * node) -> bool {
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE ||
node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
};
const auto & depends_on = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool {
for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
if (dst->src[s] == src) {
return true;
}
}
// implicit dependency if they view the same tensor
const ggml_tensor * dst2 = dst->view_src ? dst->view_src : dst;
const ggml_tensor * src2 = src->view_src ? src->view_src : src;
if (dst2 == src2) {
return true;
}
return false;
};
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
const ggml_tensor * node = cgraph->nodes[node_idx];
node_indices[node] = node_idx;
if (is_noop(node)) {
continue;
}
for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
const ggml_tensor * src = cgraph->nodes[node_idx]->src[src_idx];
//TODO: check why nrows > 1 fails
if (node && !is_noop(node) && ggml_nrows(node) <= 1) {
fan_out[src] += 1;
}
}
}
// Target Q, K, V for concurrency
// this is a more general way to find nodes which can be candidates for concurrency (although it has not been tested for anything else):
// 1. find fan-out (fork) nodes where the same input is used at least N times (in QKV, it would be "attn-norm")
// 2. find the join node, where 2 or more of the outputs are required (in QKV, this would "KQ" or "flash-attn")
// 3. account for all branches from the fork to the join
// 4. To extend lifetimes of the tensors, we interleave the branches (see below for more details)
// 5. save the original cgraph and restore it in graph_compute, to enable fusion within streams
// See discussion: https://github.com/ggml-org/llama.cpp/pull/16991#issuecomment-3522620030
const int min_fan_out = 3;
const int max_fan_out = 3;
// store {fork_idx, join_idx}
std::vector<std::pair<int, int>> concurrent_node_ranges;
for (const auto & [root_node, count] : fan_out) {
if (count >= min_fan_out && count <= max_fan_out) {
const int root_node_idx = node_indices[root_node];
bool is_part_of_event = false;
for (const auto & [start, end] : concurrent_node_ranges) {
if (root_node_idx >= start && root_node_idx <= end) {
is_part_of_event = true;
}
}
if (is_part_of_event) {
continue;
}
std::vector<std::vector<const ggml_tensor *>> nodes_per_branch;
for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) {
const ggml_tensor * node = cgraph->nodes[i];
if (!is_noop(node) && depends_on(node, root_node)) {
nodes_per_branch.push_back({ node });
}
}
GGML_ASSERT(nodes_per_branch.size() == (size_t) count);
//find the join point
const ggml_tensor * join_node = nullptr;
const auto & belongs_to_branch = [&](const ggml_tensor * node,
const std::vector<const ggml_tensor *> & branch) -> bool {
for (const ggml_tensor * n : branch) {
if (depends_on(node, n)) {
return true;
}
}
return false;
};
for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) {
const ggml_tensor * curr_node = cgraph->nodes[i];
int num_joins = 0;
for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) {
num_joins++;
}
}
if (num_joins >= 2) {
join_node = curr_node;
break;
}
bool found_branch = false;
for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
std::vector<const ggml_tensor *> & branch_vec = nodes_per_branch[branch_idx];
if (belongs_to_branch(curr_node, branch_vec)) {
//continue accumulating
if (std::find(branch_vec.begin(), branch_vec.end(), curr_node) == branch_vec.end()) {
branch_vec.push_back(curr_node);
}
found_branch = true;
}
}
if (!found_branch && is_noop(curr_node)) {
// we can put it in any branch because it will be ignored
nodes_per_branch[0].push_back({ curr_node });
}
}
if (join_node) {
//Create ggml_cuda_concurrent_event
ggml_cuda_concurrent_event concurrent_event(nodes_per_branch.size());
concurrent_event.join_node = join_node;
for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
for (const ggml_tensor * n : nodes_per_branch[branch_idx]) {
concurrent_event.stream_mapping[n] = branch_idx + 1;
}
}
int fork_node_idx = node_indices[root_node];
int join_node_idx = node_indices[join_node];
int current_branch_idx = 0;
int current_node_idx = fork_node_idx + 1;
const int n_branches = nodes_per_branch.size();
int total_branch_nodes = 0;
for (std::vector<const ggml_tensor *> branch_nodes : nodes_per_branch) {
total_branch_nodes += branch_nodes.size();
}
// there are other nodes in the middle which are unaccounted for
// usually (cpy) nodes, then ignore this fork
if (join_node_idx - fork_node_idx - 1 != total_branch_nodes) {
GGML_LOG_DEBUG(
"Skipping %s because the number of nodes in the middle is not equal to the total number of "
"branch nodes %d != %d\n",
root_node->name, join_node_idx - fork_node_idx - 1, total_branch_nodes);
continue;
}
// Save the original order of nodes in this region before interleaving
// This is used later to restore grouping for fusion within streams
concurrent_event.original_order.reserve(total_branch_nodes);
for (int i = fork_node_idx + 1; i < join_node_idx; ++i) {
concurrent_event.original_order.push_back(cgraph->nodes[i]);
}
std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> & concurrent_events = cuda_ctx->stream_context().concurrent_events;
GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end());
concurrent_events.emplace(root_node, std::move(concurrent_event));
GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
// interleave tensors to extend lifetimes so that ggml graph doesn't recycle them
// example transformation:
// [attn-norm, QMul, QNorm, QRope, KMul, KNorm, KRope, VMul, attn] ->
// [attn-norm, QMul, KMul, VMul, QNorm, VNorm, QRope, KRope, attn]
while (current_node_idx < join_node_idx) {
std::vector<const ggml_tensor *> & branch_nodes = nodes_per_branch[current_branch_idx];
bool has_node = false;
for (std::vector<const ggml_tensor *> branch_node : nodes_per_branch) {
has_node |= branch_node.size() > 0;
}
GGML_ASSERT(has_node);
if (branch_nodes.empty()) {
current_branch_idx = (current_branch_idx + 1) % n_branches;
continue;
}
cgraph->nodes[current_node_idx] = const_cast<ggml_tensor *>(branch_nodes.front());
current_node_idx++;
branch_nodes.erase(branch_nodes.begin());
// append all empty nodes
while (!branch_nodes.empty() && is_noop(branch_nodes.front())) {
cgraph->nodes[current_node_idx] = const_cast<ggml_tensor *>(branch_nodes.front());
current_node_idx++;
branch_nodes.erase(branch_nodes.begin());
}
current_branch_idx = (current_branch_idx + 1) % n_branches;
}
}
}
}
}
static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_get_name,
/* .free = */ ggml_backend_cuda_free,
@ -3665,7 +4020,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .graph_optimize = */ NULL,
/* .graph_optimize = */ ggml_backend_cuda_graph_optimize,
};
static ggml_guid_t ggml_backend_cuda_guid() {
@ -3837,7 +4192,7 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t *
// Check if UMA is explicitly enabled via environment variable
bool uma_env = getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
bool is_uma = prop.unifiedAddressing > 0 || uma_env;
bool is_uma = prop.integrated > 0 || uma_env;
if (is_uma) {
// For UMA systems (like DGX Spark), use system memory info
@ -4255,6 +4610,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
return true;
case GGML_OP_SOLVE_TRI:
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
default:
return false;
}

View File

@ -68,10 +68,31 @@ static __device__ __forceinline__ half2 ggml_cuda_movmatrix(const half2 x) {
namespace ggml_cuda_mma {
// Some architectures like Volta or CDNA3 perform multiple matrix multiplications per warp in parallel,
// effectively the warp is being split into subgroups of threads that each perform a single mma instruction.
// In those cases the data can be split in different ways across the warp.
enum data_layout {
// By default the data uses the I direction as its major dimension and the J direction as its minor dimension.
// For the A/C matrices this means I major == row major, J major == column major.
// For the B matrix this means I major == column major, J major == row major.
// MIRRORED == Each data value is held exactly once per thread subgroup.
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell.
DATA_LAYOUT_I_MAJOR_MIRRORED = 10,
DATA_LAYOUT_J_MAJOR_MIRRORED = 20,
};
// Implemented mma combinations are:
// - (I_MAJOR, I_MAJOR) -> I_MAJOR
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
struct tile {};
template <int I_, int J_, typename T>
struct tile {
static constexpr int I = I_;
static constexpr int J = J_;
struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
#if defined(AMD_MFMA_AVAILABLE)
static constexpr int ne = I * J / 64;
@ -131,9 +152,9 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 32 && J == 8) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (l & 2) | (threadIdx.x % 2);
return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (l & 2) + (threadIdx.x % 2);
#else
return (l & 2) | (threadIdx.x & ~2);
return (l & 2) + (threadIdx.x & ~2);
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
} else {
NO_DEVICE_CODE;
@ -143,7 +164,7 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 32 && J == 8) {
return (threadIdx.x & 2) | (l & (4 + 1));
return (threadIdx.x & 2) + (l & (4 + 1));
} else {
NO_DEVICE_CODE;
return -1;
@ -196,9 +217,9 @@ namespace ggml_cuda_mma {
} else if constexpr (I == 8 && J == 8) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 8) | (threadIdx.x / 4);
return ((l / 2) * 8) + (threadIdx.x / 4);
} else if constexpr (I == 16 && J == 16) {
return (((l / 2) % 2) * 8) | (threadIdx.x / 4);
return (((l / 2) % 2) * 8) + (threadIdx.x / 4);
} else if constexpr (I == 32 && J == 8) {
return tile<16, 8, T>::get_i(l); // Memory layout simply repeated with same pattern in i direction.
} else {
@ -211,11 +232,11 @@ namespace ggml_cuda_mma {
if constexpr (I == 8 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return (l * 4) + (threadIdx.x % 4);
} else if constexpr (I == 16 && J == 8) {
return ((threadIdx.x % 4) * 2) | (l % 2);
return ((threadIdx.x % 4) * 2) + (l % 2);
} else if constexpr (I == 16 && J == 16) {
return ((l / 4) * 8) | ((threadIdx.x % 4) * 2) | (l % 2);
return ((l / 4) * 8) + ((threadIdx.x % 4) * 2) + (l % 2);
} else if constexpr (I == 32 && J == 8) {
return tile<16, 8, T>::get_j(l); // Memory layout simply repeated with same pattern in i direction.
} else {
@ -227,26 +248,24 @@ namespace ggml_cuda_mma {
};
template <int I_, int J_>
struct tile<I_, J_, half2> {
static constexpr int I = I_;
static constexpr int J = J_;
struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I == 8 && J == 8 ? I * J / (WARP_SIZE/4) : I * J / WARP_SIZE;
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true;
if (I == 32 && J == 8) return true;
if (I == 32 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 8) {
return ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
} else if constexpr (I == 32 && J == 8) {
if constexpr (I == 32 && J == 4) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
#else
return threadIdx.x;
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
@ -257,7 +276,7 @@ namespace ggml_cuda_mma {
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr ((I == 8 || I == 32) && J == 8) {
if constexpr (I == 32 && J == 4) {
return l;
} else {
NO_DEVICE_CODE;
@ -307,11 +326,11 @@ namespace ggml_cuda_mma {
if constexpr (I == 8 && J == 8) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 4) {
return (l * 8) | (threadIdx.x / 4);
return (l * 8) + (threadIdx.x / 4);
} else if constexpr (I == 16 && J == 8) {
return ((l % 2) * 8) | (threadIdx.x / 4);
return ((l % 2) * 8) + (threadIdx.x / 4);
} else if constexpr (I == 32 && J == 8) {
return ((l / 4) * 16) | ((l % 2) * 8) | (threadIdx.x / 4);
return ((l / 4) * 16) + ((l % 2) * 8) + (threadIdx.x / 4);
} else {
NO_DEVICE_CODE;
return -1;
@ -320,13 +339,13 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return (l * 4) + (threadIdx.x % 4);
} else if constexpr (I == 16 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 4) | (threadIdx.x % 4);
return ((l / 2) * 4) + (threadIdx.x % 4);
} else if constexpr (I == 32 && J == 8) {
return ((l & 2) * 2) | (threadIdx.x % 4);
return ((l & 2) * 2) + (threadIdx.x % 4);
} else {
NO_DEVICE_CODE;
return -1;
@ -336,14 +355,15 @@ namespace ggml_cuda_mma {
};
template <int I_, int J_>
struct tile<I_, J_, nv_bfloat162> {
static constexpr int I = I_;
static constexpr int J = J_;
struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
static constexpr int ne = I * J / WARP_SIZE;
#if defined(AMD_WMMA_AVAILABLE)
static constexpr int ne = I * J / 32;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
#if defined(AMD_WMMA_AVAILABLE)
static constexpr __device__ bool supported() {
if (I == 16 && J == 8) return true;
return false;
@ -367,9 +387,6 @@ namespace ggml_cuda_mma {
}
}
#else
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 8) return true;
if (I == 16 && J == 4) return true;
@ -381,9 +398,9 @@ namespace ggml_cuda_mma {
if constexpr (I == 8 && J == 8) {
return threadIdx.x / 4;
} else if constexpr (I == 16 && J == 4) {
return (l * 8) | (threadIdx.x / 4);
return (l * 8) + (threadIdx.x / 4);
} else if constexpr (I == 16 && J == 8) {
return ((l % 2) * 8) | (threadIdx.x / 4);
return ((l % 2) * 8) + (threadIdx.x / 4);
} else {
NO_DEVICE_CODE;
return -1;
@ -392,11 +409,11 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 8) {
return (l * 4) | (threadIdx.x % 4);
return (l * 4) + (threadIdx.x % 4);
} else if constexpr (I == 16 && J == 4) {
return threadIdx.x % 4;
} else if constexpr (I == 16 && J == 8) {
return ((l / 2) * 4) | (threadIdx.x % 4);
return ((l / 2) * 4) + (threadIdx.x % 4);
} else {
NO_DEVICE_CODE;
return -1;
@ -405,6 +422,73 @@ namespace ggml_cuda_mma {
#endif // defined(AMD_WMMA_AVAILABLE)
};
template <int I_, int J_>
struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
static constexpr int ne = I * J / (WARP_SIZE/4);
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int /*l*/) {
if constexpr (I == 8 && J == 4) {
return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 4) {
return l;
} else {
NO_DEVICE_CODE;
return -1;
}
}
};
template <int I_, int J_>
struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
static constexpr int I = I_;
static constexpr int J = J_;
static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
static constexpr int ne = I * J / (WARP_SIZE/4);
half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
if (I == 8 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 8 && J == 4) {
return ((l / 2) * 4) + (threadIdx.x % 4);
} else {
NO_DEVICE_CODE;
return -1;
}
}
static __device__ __forceinline__ int get_j(const int l) {
if constexpr (I == 8 && J == 4) {
return ((threadIdx.x / 16) * 2) + (l % 2);
} else {
NO_DEVICE_CODE;
return -1;
}
}
};
#if defined(TURING_MMA_AVAILABLE)
template <int I, int J>
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
tile<I, J/2, half2> ret;
@ -422,9 +506,26 @@ namespace ggml_cuda_mma {
return ret;
}
#else // Volta
template <int I, int J>
static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
tile<I, J/2, half2> ret;
#pragma unroll
for (int l0 = 0; l0 < tile_float.ne; l0 += 4) {
ret.x[l0/2 + 0] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
ret.x[l0/2 + 1] = make_half2(tile_float.x[l0 + 2], tile_float.x[l0 + 3]);
template <int I, int J, typename T>
static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
// On Volta FP16 and FP32 tiles have a different memory layout,
// for the conversion threads with an offset of 2 need to exchange half their values:
ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)] = __shfl_xor_sync(
0xFFFFFFFF, ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)], 2, WARP_SIZE);
}
return ret;
}
#endif // defined(TURING_MMA_AVAILABLE)
template <int I, int J, typename T, data_layout dl>
static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
#if defined(AMD_MFMA_AVAILABLE)
if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
#pragma unroll
@ -437,18 +538,27 @@ namespace ggml_cuda_mma {
xi[0] = xs[0];
}
#elif defined(AMD_WMMA_AVAILABLE)
if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}else if constexpr (I == 16 && J == 8) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
xi[0] = xs[0];
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
xi[1] = xs1[0];
}else{
} else if constexpr (std::is_same_v<T, int>) {
if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}else if constexpr (I == 16 && J == 8) {
int64_t * xi = (int64_t *) t.x;
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
xi[0] = xs[0];
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
xi[1] = xs1[0];
}else{
NO_DEVICE_CODE;
}
} else {
NO_DEVICE_CODE;
}
#else
@ -502,18 +612,6 @@ namespace ggml_cuda_mma {
: "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
: "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#else
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
}
template <typename T>
static __device__ __forceinline__ void load_ldmatrix(
tile<32, 8, T> & t, const T * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if 1
// TODO: more generic handling
@ -524,9 +622,31 @@ namespace ggml_cuda_mma {
load_generic(t, xs0, stride);
#endif // 1
#else
tile<16, 8, T> * t16 = (tile<16, 8, T> *) &t;
load_ldmatrix(t16[0], xs0 + 0*stride, stride);
load_ldmatrix(t16[1], xs0 + 16*stride, stride);
load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
}
static __device__ __forceinline__ void load_ldmatrix(
tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
}
static __device__ __forceinline__ void load_ldmatrix(
tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
#pragma unroll
for (int l0 = 0; l0 < t.ne; l0 += 2) {
ggml_cuda_memcpy_1<2*sizeof(half2)>(t.x + l0, xs0 + t.get_i(l0)*stride + t.get_j(l0));
}
}
static __device__ __forceinline__ void load_ldmatrix(
tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
#else
GGML_UNUSED_VARS(t, xs0, stride);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
}
@ -851,14 +971,14 @@ namespace ggml_cuda_mma {
template <typename T1, typename T2, int J, int K>
static __device__ __forceinline__ void mma(
tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
tile<16, J, T1> * D16 = (tile<16, J, T1> *) &D;
tile<16, K, T2> * A16 = (tile<16, K, T2> *) &A;
tile <16, J, T1> * D16 = reinterpret_cast< tile<16, J, T1> *>(&D);
const tile<16, K, T2> * A16 = reinterpret_cast<const tile<16, K, T2> *>(&A);
mma(D16[0], A16[0], B);
mma(D16[1], A16[1], B);
}
static __device__ __forceinline__ void mma(
tile<32, 8, float> & D, const tile<32, 8, half2> & A, const tile<8, 8, half2> & B) {
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
@ -871,20 +991,30 @@ namespace ggml_cuda_mma {
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[4]), "r"(Axi[5]), "r"(Bxi[4]), "r"(Bxi[5]));
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[6]), "r"(Axi[7]), "r"(Bxi[6]), "r"(Bxi[7]));
#else
tile<16, 8, float> * D16 = (tile<16, 8, float> *) &D;
tile<16, 8, half2> * A16 = (tile<16, 8, half2> *) &A;
mma(D16[0], A16[0], B);
mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
static __device__ __forceinline__ void mma(
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
const int * Axi = (const int *) A.x;
const int * Bxi = (const int *) B.x;
int * Dxi = (int *) D.x;
asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
"{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
: "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]), "r"(Bxi[1]));
asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
"{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
static __device__ __forceinline__ void mma(

View File

@ -151,7 +151,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
return false;
}
} else {
if (src1_ncols > 16 || GGML_CUDA_CC_IS_RDNA4(cc)) {
if (src1_ncols > 16) {
return false;
}
}

View File

@ -37,23 +37,19 @@ static __global__ void mul_mat_f(
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool a_supported = tile_A::supported();
constexpr bool b_supported = tile_B::supported();
constexpr bool c_supported = tile_C::supported();
constexpr bool supported = a_supported && b_supported && c_supported;
#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
constexpr bool supported = I_16_supported || I_32_supported;
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
#ifdef VOLTA_MMA_AVAILABLE
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
typedef tile<32, 4, T, DATA_LAYOUT_I_MAJOR> tile_A;
typedef tile< 8, 4, T, DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR> tile_C;
#else
typedef tile<16, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
#endif // VOLTA_MMA_AVAILABLE
#endif // defined(AMD_WMMA_AVAILABLE)
if constexpr (!supported) {
if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
NO_DEVICE_CODE;
return;
}
@ -248,6 +244,9 @@ static __global__ void mul_mat_f(
}
}
}
#ifdef VOLTA_MMA_AVAILABLE
}
#endif //VOLTA_MMA_AVAILABLE
#else
GGML_UNUSED_VARS(x, y, ids, dst,
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
@ -278,27 +277,24 @@ static __global__ void mul_mat_f_ids(
typedef tile<16, 8, T> tile_A;
typedef tile<tile_B_I, 8, T> tile_B;
typedef tile<16, tile_C_J, float> tile_C;
constexpr bool a_supported = tile_A::supported();
constexpr bool b_supported = tile_B::supported();
constexpr bool c_supported = tile_C::supported();
constexpr bool supported = a_supported && b_supported && c_supported;
#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
constexpr bool supported = I_16_supported || I_32_supported;
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
#ifdef VOLTA_MMA_AVAILABLE
if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
typedef tile<32, 4, T, DATA_LAYOUT_I_MAJOR> tile_A;
typedef tile< 8, 4, T, DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR> tile_C;
#else
typedef tile<16, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
#endif // VOLTA_MMA_AVAILABLE
#endif // defined(AMD_WMMA_AVAILABLE)
if constexpr (!supported) {
if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
NO_DEVICE_CODE;
return;
}
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
constexpr int ntA = rows_per_block / tile_A::I;
@ -517,6 +513,9 @@ static __global__ void mul_mat_f_ids(
}
}
}
#ifdef VOLTA_MMA_AVAILABLE
}
#endif // VOLTA_MMA_AVAILABLE
#else
GGML_UNUSED_VARS(x, y, ids_src_compact, ids_dst_compact, expert_bounds, dst,
ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,

View File

@ -3701,7 +3701,7 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
const int mmq_tile_x_k = mmq_get_mma_tile_x_k(type);
const size_t nbs_ids = mmq_x*sizeof(int);
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
const size_t nbs_y = mmq_x*sizeof(block_q8_1_mmq);
return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
}

View File

@ -0,0 +1,203 @@
#include "common.cuh"
#include "ggml.h"
#include "solve_tri.cuh"
#define MAX_N_FAST 64
#define MAX_K_FAST 32
// ======================
// Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction
// ======================
// When ncols_template == 0 the bounds for the loops in this function are not
// known and can't be unrolled. As we want to keep pragma unroll for all other
// cases we supress the clang transformation warning here.
#ifdef __clang__
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template <int n_template, int k_template>
static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
const float * __restrict__ B,
float * __restrict__ X,
const uint3 ne02,
const size_t nb02,
const size_t nb03,
const size_t nb12,
const size_t nb13,
const size_t nb2,
const size_t nb3,
const int n_arg,
const int k_arg) {
const int n = n_template == 0 ? n_arg : n_template;
const int k = k_template == 0 ? k_arg : k_template;
const int batch_idx = blockIdx.x;
const int lane = threadIdx.x;
const int col_idx = threadIdx.y;
if (col_idx >= k) {
return;
}
const uint2 i02_i03 = fast_div_modulo(batch_idx, ne02);
const int64_t i02 = i02_i03.y;
const int64_t i03 = i02_i03.x;
const float * const A_batch = (const float *) (A + i02 * nb02 + i03 * nb03);
const float * const B_batch = (const float *) (B + i02 * nb12 + i03 * nb13);
float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3);
__shared__ float sA[MAX_N_FAST * MAX_N_FAST];
__shared__ float sXt[MAX_N_FAST * (MAX_K_FAST + 1)];
const int offset = threadIdx.x + threadIdx.y * blockDim.x;
#pragma unroll
for (int i = 0; i < n * n; i += k * WARP_SIZE) {
int i0 = i + offset;
if (i0 < n * n) {
sA[i0] = A_batch[i0];
}
}
const int rows_per_warp = (n + WARP_SIZE - 1) / WARP_SIZE;
#pragma unroll
for (int i = 0; i < rows_per_warp; i++) {
const int i0 = lane + i * WARP_SIZE;
if (i0 < n) {
sXt[col_idx * n + i0] = B_batch[i0 * k + col_idx];
}
}
__syncthreads();
#pragma unroll
for (int row = 0; row < n; ++row) {
float sum = 0.0f;
{
int j = lane;
if (j < row) {
sum += sA[row * n + j] * sXt[col_idx * n + j];
}
}
if (row >= WARP_SIZE) {
int j = WARP_SIZE + lane;
if (j < row) {
sum += sA[row * n + j] * sXt[col_idx * n + j];
}
}
sum = warp_reduce_sum(sum);
if (lane == 0) {
const float b_val = sXt[col_idx * n + row];
const float a_diag = sA[row * n + row];
// no safeguards for division by zero because that indicates corrupt
// data anyway
sXt[col_idx * n + row] = (b_val - sum) / a_diag;
}
}
__syncthreads();
#pragma unroll
for (int i = 0; i < rows_per_warp; i++) {
const int i0 = lane + i * WARP_SIZE;
if (i0 < n) {
X_batch[i0 * k + col_idx] = sXt[col_idx * n + i0];
}
}
}
#ifdef __clang__
# pragma clang diagnostic pop
#endif // __clang__
static void solve_tri_f32_cuda(const float * A,
const float * B,
float * X,
int n,
int k,
int64_t ne02,
int64_t ne03,
size_t nb02,
size_t nb03,
size_t nb12,
size_t nb13,
size_t nb2,
size_t nb3,
cudaStream_t stream) {
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
dim3 threads(WARP_SIZE, k);
dim3 grid(ne02 * ne03);
if (n == 64) {
switch (k) {
case 32:
solve_tri_f32_fast<64, 32>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 16:
solve_tri_f32_fast<64, 16>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 14:
solve_tri_f32_fast<64, 14>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 12:
solve_tri_f32_fast<64, 12>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 10:
solve_tri_f32_fast<64, 10>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 8:
solve_tri_f32_fast<64, 8>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 6:
solve_tri_f32_fast<64, 6>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 4:
solve_tri_f32_fast<64, 4>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 2:
solve_tri_f32_fast<64, 2>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
case 1:
solve_tri_f32_fast<64, 1>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, 0, 0);
break;
default:
solve_tri_f32_fast<0, 0>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, n, k);
}
} else { // run general case
solve_tri_f32_fast<0, 0>
<<<grid, threads, 0, stream>>>(A, B, X, ne02_fd, nb02, nb03, nb12, nb13, nb2, nb3, n, k);
}
}
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // A (triangular n x x matrix)
const ggml_tensor * src1 = dst->src[1]; // B (right hand side of n x k equation columns)
ggml_is_contiguous(src0);
ggml_is_contiguous(src1);
const int64_t n = src0->ne[0];
const int64_t k = src1->ne[0];
GGML_ASSERT(n <= 64);
GGML_ASSERT(k <= 32);
solve_tri_f32_cuda((const float *) src0->data, (const float *) src1->data, (float *) dst->data, n, k, src0->ne[2],
src0->ne[3], src0->nb[2] / sizeof(float), src0->nb[3] / sizeof(float),
src1->nb[2] / sizeof(float), src1->nb[3] / sizeof(float), dst->nb[2] / sizeof(float),
dst->nb[3] / sizeof(float), ctx.stream());
}

View File

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_solve_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@ -81,6 +81,76 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst,
dst[index] = result;
}
// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
static __global__ void upscale_f32_bilinear_antialias(const float * src0, float * dst,
const int nb00, const int nb01, const int nb02, const int nb03,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset) {
const int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
if (index >= dst_total_elements) {
return;
}
const int i10_dst = index % ne10_dst;
const int i11_dst = (index / ne10_dst) % ne11_dst;
const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst;
const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst);
const int i02_src = (int)(i12_dst / sf2);
const int i03_src = (int)(i13_dst / sf3);
const float y = ((float)i11_dst + pixel_offset) / sf1;
const float x = ((float)i10_dst + pixel_offset) / sf0;
// support and invscale, minimum 1 pixel for bilinear
const float support1 = max(1.0f / sf1, 1.0f);
const float invscale1 = 1.0f / support1;
const float support0 = max(1.0f / sf0, 1.0f);
const float invscale0 = 1.0f / support0;
// the range of source pixels that contribute
const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset));
const int64_t x_max = min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset));
const int64_t y_min = max(int64_t(0), int64_t(y - support1 + pixel_offset));
const int64_t y_max = min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset));
// bilinear filter with antialiasing
float val = 0.0f;
float total_weight = 0.0f;
auto triangle_filter = [](float x) -> float {
return max(1.0f - fabsf(x), 0.0f);
};
for (int64_t sy = y_min; sy < y_max; sy++) {
const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
for (int64_t sx = x_min; sx < x_max; sx++) {
const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
const float weight = weight_x * weight_y;
if (weight <= 0.0f) {
continue;
}
const float pixel = *(const float *)((const char *)src0 + sx*nb00 + sy*nb01 + i02_src*nb02 + i03_src*nb03);
val += pixel * weight;
total_weight += weight;
}
}
if (total_weight > 0.0f) {
val /= total_weight;
}
dst[index] = val;
}
namespace bicubic_interpolation {
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
__device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
@ -161,11 +231,15 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst,
const int ne00_src, const int ne01_src,
const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst,
const float sf0, const float sf1, const float sf2, const float sf3,
const float pixel_offset, cudaStream_t stream) {
const float pixel_offset, bool antialias, cudaStream_t stream) {
const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst;
const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
if (antialias) {
upscale_f32_bilinear_antialias<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
} else {
upscale_f32_bilinear<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset);
}
}
static void upscale_f32_bicubic_cuda(const float * x, float * dst,
@ -207,9 +281,10 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
if (mode == GGML_SCALE_MODE_NEAREST) {
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS);
upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
sf0, sf1, sf2, sf3, pixel_offset, antialias, stream);
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],

View File

@ -105,7 +105,7 @@
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams

File diff suppressed because it is too large Load Diff

View File

@ -35,20 +35,6 @@ typedef struct ggml_metal_pipeline * ggml_metal_pipeline_t;
ggml_metal_pipeline_t ggml_metal_pipeline_init(void);
void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg);
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0);
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1);
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem);
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline);
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline);
// a collection of pipelines
typedef struct ggml_metal_pipelines * ggml_metal_pipelines_t;
@ -58,6 +44,19 @@ void ggml_metal_pipelines_free(ggml_metal_pipelines_t ppls);
void ggml_metal_pipelines_add(ggml_metal_pipelines_t ppls, const char * name, ggml_metal_pipeline_t pipeline);
ggml_metal_pipeline_t ggml_metal_pipelines_get(ggml_metal_pipelines_t ppls, const char * name);
struct ggml_metal_pipeline_with_params {
ggml_metal_pipeline_t pipeline;
int nsg;
int nr0;
int nr1;
size_t smem;
};
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline);
//
// MTLCommandBuffer wrapper
//
@ -76,7 +75,7 @@ void ggml_metal_encoder_free(ggml_metal_encoder_t encoder);
void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name);
void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder);
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline);
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline);
void ggml_metal_encoder_set_bytes (ggml_metal_encoder_t encoder, void * data, size_t size, int idx);
void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx);
@ -100,66 +99,66 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
void ggml_metal_library_free(ggml_metal_library_t lib);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_base (ggml_metal_library_t lib, enum ggml_op op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cpy (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum_rows (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_blk (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base (ggml_metal_library_t lib, enum ggml_op op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cpy (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum_rows (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_blk (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
int32_t ncpsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_blk(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
int32_t nqptg,
int32_t ncpsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
@ -169,7 +168,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
bool has_kvpad,
int32_t nsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
@ -180,7 +179,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
int32_t nsg,
int32_t nwg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
int32_t dv,

View File

@ -75,14 +75,6 @@ void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
struct ggml_metal_pipeline {
id<MTLComputePipelineState> obj;
// suggested dispatch sizes
int nsg;
int nr0;
int nr1;
size_t smem;
};
ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
@ -90,10 +82,6 @@ ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
*res = (struct ggml_metal_pipeline) {
/*.obj =*/ nil,
/*.nsg =*/ 0,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.smem =*/ 0,
};
return res;
@ -105,40 +93,8 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
free(pipeline);
}
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg) {
pipeline->nsg = nsg;
}
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
return pipeline->nsg;
}
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
pipeline->nr0 = nr0;
}
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
return pipeline->nr0;
}
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
pipeline->nr1 = nr1;
}
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
return pipeline->nr1;
}
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
pipeline->smem = smem;
}
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
return pipeline->smem;
}
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
return pipeline->obj.maxTotalThreadsPerThreadgroup;
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
}
struct ggml_metal_library {
@ -146,6 +102,8 @@ struct ggml_metal_library {
id<MTLDevice> device;
ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
NSLock * lock;
};
ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
@ -296,9 +254,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library));
res->obj = library;
res->device = device;
res->obj = library;
res->device = device;
res->pipelines = ggml_metal_pipelines_init();
res->lock = [NSLock new];
return res;
}
@ -365,6 +324,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
res->obj = library;
res->device = device;
res->pipelines = ggml_metal_pipelines_init();
res->lock = [NSLock new];
return res;
}
@ -380,26 +340,47 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
ggml_metal_pipelines_free(lib->pipelines);
[lib->lock release];
free(lib);
}
ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
return ggml_metal_pipelines_get(lib->pipelines, name);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
[lib->lock lock];
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
};
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
[lib->lock unlock];
return res;
}
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
// note: the pipelines are cached in the library per device, so they are shared across all metal contexts
ggml_critical_section_start();
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
};
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
ggml_critical_section_end();
[lib->lock lock];
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
if (res.pipeline) {
[lib->lock unlock];
return res;
}
res = ggml_metal_pipeline_init();
@autoreleasepool {
NSError * error = nil;
@ -414,36 +395,53 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
}
if (!mtl_function) {
ggml_critical_section_end();
[lib->lock unlock];
GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
if (error) {
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
}
return nil;
return res;
}
res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
[mtl_function release];
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
(int) res->obj.maxTotalThreadsPerThreadgroup,
(int) res->obj.threadExecutionWidth);
if (!obj) {
[lib->lock unlock];
if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
ggml_critical_section_end();
GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
if (error) {
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
}
return res;
}
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
(void *) obj,
(int) obj.maxTotalThreadsPerThreadgroup,
(int) obj.threadExecutionWidth);
if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
[obj release];
[lib->lock unlock];
GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
return nil;
return res;
}
ggml_metal_pipelines_add(lib->pipelines, name, res);
res.pipeline = ggml_metal_pipeline_init();
res.pipeline->obj = obj;
ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
}
ggml_critical_section_end();
[lib->lock unlock];
return res;
}
@ -485,8 +483,8 @@ void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
[encoder->obj popDebugGroup];
}
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline) {
[encoder->obj setComputePipelineState:pipeline->obj];
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
[encoder->obj setComputePipelineState:pipeline.pipeline->obj];
}
void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
@ -611,8 +609,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl.pipeline) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
}
@ -661,8 +659,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl.pipeline) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
}
@ -894,7 +892,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_POOL_1D:
return false;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
case GGML_OP_POOL_2D:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_PAD:
@ -912,6 +910,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
// for new head sizes, add checks here
if (op->src[0]->ne[0] != 32 &&
op->src[0]->ne[0] != 40 &&
op->src[0]->ne[0] != 48 &&
op->src[0]->ne[0] != 64 &&
op->src[0]->ne[0] != 72 &&
op->src[0]->ne[0] != 80 &&

View File

@ -524,7 +524,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
/*.dim =*/ dim,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -550,7 +550,7 @@ int ggml_metal_op_repeat(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
auto pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
ggml_metal_kargs_repeat args = {
/*.ne00 =*/ ne00,
@ -616,7 +616,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
// TODO: make a simpler cpy_bytes kernel
//const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
ggml_metal_kargs_cpy args = {
/*.nk0 =*/ ne00,
@ -679,7 +679,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
/*.o1 =*/ { 0 },
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -721,7 +721,7 @@ int ggml_metal_op_scale(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -760,7 +760,7 @@ int ggml_metal_op_clamp(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -789,7 +789,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
@ -817,7 +817,7 @@ int ggml_metal_op_glu(ggml_metal_op_t ctx, int idx) {
GGML_ASSERT(ggml_are_same_shape(op->src[0], op->src[1]));
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
const int32_t swp = ggml_get_op_params_i32(op, 1);
const float alpha = ggml_get_op_params_f32(op, 2);
@ -870,7 +870,7 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
/*.np =*/ n,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
int nth = 32; // SIMD width
@ -925,7 +925,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
/*.nb3 =*/ nb3,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
int nth = 32; // SIMD width
@ -936,7 +936,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, ne00);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -963,7 +963,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
auto pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
int nth = 1;
while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
@ -1060,7 +1060,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
ggml_metal_op_concurrency_reset(ctx);
{
ggml_metal_pipeline_t pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
auto pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
ggml_metal_kargs_cumsum_add args = {
/*.ne00 =*/ ne00,
@ -1106,7 +1106,7 @@ int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
auto pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
ggml_metal_kargs_get_rows args = {
/*.ne00t =*/ ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
@ -1151,7 +1151,7 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
const int32_t nk0 = ne0/ggml_blck_size(op->type);
@ -1252,7 +1252,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
/*.n_head_log2 =*/ n_head_log2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
int nth = 32; // SIMD width
@ -1266,7 +1266,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
}
}
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
@ -1322,7 +1322,7 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
/*.nb2 =*/ nb2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
@ -1409,11 +1409,11 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
/*.nb0 =*/ nb0,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
GGML_ASSERT(d_state <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
const size_t sms = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -1426,7 +1426,7 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), 7);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 8);
ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
@ -1449,7 +1449,7 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
const int64_t C = op->ne[0];
const int64_t H = op->src[0]->ne[1];
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
int ida = 0;
@ -1485,7 +1485,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
GGML_ASSERT(ne00 % ggml_blck_size(op->src[0]->type) == 0);
@ -1592,7 +1592,7 @@ int ggml_metal_op_pool_2d(ggml_metal_op_t ctx, int idx) {
/* .np = */ np
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
auto pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
const int ntg = (np + nth - 1) / nth;
@ -1701,7 +1701,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
GGML_ABORT("unsupported ne11");
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
ggml_metal_kargs_mul_mv_ext args = {
/*.ne00 =*/ ne00,
@ -1748,7 +1748,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
// default: break;
//}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
ggml_metal_kargs_mul_mm args = {
/*.ne00 =*/ ne00,
@ -1773,18 +1773,18 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
} else {
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
const int nr0 = pipeline.nr0;
const int nr1 = pipeline.nr1;
const int nsg = pipeline.nsg;
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_kargs_mul_mv args = {
/*.ne00 =*/ ne00,
@ -1915,9 +1915,9 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
nb21,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@ -1938,7 +1938,7 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
ggml_metal_op_concurrency_reset(ctx);
{
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
ggml_metal_kargs_mul_mm_id args = {
/*.ne00 =*/ ne00,
@ -1967,20 +1967,20 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
}
} else {
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
const int nr0 = pipeline.nr0;
const int nr1 = pipeline.nr1;
const int nsg = pipeline.nsg;
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_kargs_mul_mv_id args = {
/*.nei0 =*/ ne20,
@ -2064,7 +2064,7 @@ int ggml_metal_op_add_id(ggml_metal_op_t ctx, int idx) {
/*.nb21 =*/ nb21,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -2308,7 +2308,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@ -2339,7 +2339,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/ nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@ -2424,7 +2424,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.logit_softcap =*/ logit_softcap,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -2476,7 +2476,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@ -2578,7 +2578,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.logit_softcap =*/ logit_softcap,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@ -2630,7 +2630,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
nrows,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@ -2762,7 +2762,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
// the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
bid_src1.offs = 0;
ggml_metal_pipeline_t pipeline = nullptr;
struct ggml_metal_pipeline_with_params pipeline;
if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(op->src[0]));
@ -2835,7 +2835,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
/*.eps =*/ eps,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
nth *= 2;
@ -2844,7 +2844,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, ne00/4);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
const int64_t nrows = ggml_nrows(op->src[0]);
@ -2887,7 +2887,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
/*.eps =*/ eps,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
int nth = 32; // SIMD width
//while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
@ -2897,7 +2897,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
//nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
//nth = std::min(nth, ne00/4);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3022,7 +3022,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
}
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
auto pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
int nth = 32; // SIMD width
@ -3033,7 +3033,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, args.ne00_t);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3127,7 +3127,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
/* src2 =*/ op->src[2] != nullptr,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3199,7 +3199,7 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
/*.KHW =*/ KH * KW,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@ -3270,7 +3270,7 @@ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
/*.d1 =*/ d1,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
nth = std::min(nth, 256);
@ -3325,7 +3325,7 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
/*.nb1 =*/ nb1,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3377,7 +3377,7 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
/*.nb2 =*/ nb2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3433,7 +3433,7 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
/*.sf3 =*/ sf3
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
@ -3477,7 +3477,7 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
/*.nb3 =*/ nb3
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
const int nth = std::min(1024, ne0);
@ -3523,7 +3523,7 @@ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
/*.p1 =*/ ((const int32_t *)(op->op_params))[1]
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
const int nth = std::min(1024, ne0);
@ -3560,7 +3560,7 @@ int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {
const int nth = std::min(1024, ne0);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3591,7 +3591,7 @@ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) {
/*.max_period =*/ max_period,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
const int nth = std::max(1, std::min(1024, dim/2));
@ -3621,7 +3621,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
/*.nb01 = */ nb01,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
const int64_t nrows = ggml_nrows(op->src[0]);
@ -3630,7 +3630,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
nth *= 2;
}
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@ -3657,7 +3657,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
// bitonic sort requires the number of elements to be power of 2
int nth = 1;
@ -3706,7 +3706,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
auto pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
int len = nth;
@ -3764,7 +3764,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);
// bitonic sort requires the number of elements to be power of 2
int nth = 1;
@ -3818,7 +3818,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);
auto pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);
int len = args.top_k;
@ -3881,7 +3881,7 @@ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
/*.slope =*/ slope
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
int64_t n = ggml_nelements(op);
@ -3910,7 +3910,7 @@ int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
const int64_t np = ggml_nelements(op->src[0]);
ggml_metal_kargs_opt_step_adamw args = {
@ -3946,7 +3946,7 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
const int64_t np = ggml_nelements(op->src[0]);
ggml_metal_kargs_opt_step_sgd args = {

View File

@ -5757,6 +5757,7 @@ typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, hal
template [[host_name("kernel_flash_attn_ext_f32_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f32_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f32_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 48, 48>;
template [[host_name("kernel_flash_attn_ext_f32_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f32_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f32_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 80, 80>;
@ -5770,6 +5771,7 @@ template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 48, 48>;
template [[host_name("kernel_flash_attn_ext_f16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 80, 80>;
@ -5784,6 +5786,7 @@ template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_at
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_flash_attn_ext_bf16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_bf16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_bf16_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 48, 48>;
template [[host_name("kernel_flash_attn_ext_bf16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_bf16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_bf16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80, 80>;
@ -5798,6 +5801,7 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80, 80>;
@ -5811,6 +5815,7 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
@ -5824,6 +5829,7 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
@ -5837,6 +5843,7 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
@ -5850,6 +5857,7 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk48_dv48" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 48, 48>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;

View File

@ -70,6 +70,7 @@ set(GGML_OPENCL_KERNELS
group_norm
im2col_f32
im2col_f16
mean
mul_mat_Ab_Bi_8x4
mul_mv_f16_f16
mul_mv_f16_f32_1row
@ -109,6 +110,9 @@ set(GGML_OPENCL_KERNELS
softmax_4_f16
softmax_f32
softmax_f16
sqr
sqrt
ssm_conv
sub
sum_rows
transpose

View File

@ -449,6 +449,9 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
cl_kernel kernel_add_id;
cl_kernel kernel_scale;
cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
cl_kernel kernel_mean_f32;
cl_kernel kernel_silu, kernel_silu_4;
cl_kernel kernel_gelu, kernel_gelu_4;
cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
@ -509,6 +512,7 @@ struct ggml_backend_opencl_context {
cl_kernel kernel_conv_2d_f16;
cl_kernel kernel_conv_2d_f32;
cl_kernel kernel_conv_2d_f16_f32;
cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
cl_kernel kernel_timestep_embedding;
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
@ -1552,6 +1556,66 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
GGML_LOG_CONT(".");
}
// sqr
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sqr.cl.h"
};
#else
const std::string kernel_src = read_file("sqr.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sqr_cont_f32 = clCreateKernel(prog, "kernel_sqr_cont_f32", &err), err));
CL_CHECK((backend_ctx->kernel_sqr_cont_f32_4 = clCreateKernel(prog, "kernel_sqr_cont_f32_4", &err), err));
CL_CHECK((backend_ctx->kernel_sqr_cont_f16 = clCreateKernel(prog, "kernel_sqr_cont_f16", &err), err));
CL_CHECK((backend_ctx->kernel_sqr_cont_f16_4 = clCreateKernel(prog, "kernel_sqr_cont_f16_4", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// sqrt
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "sqrt.cl.h"
};
#else
const std::string kernel_src = read_file("sqrt.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_sqrt_cont_f32 = clCreateKernel(prog, "kernel_sqrt_cont_f32", &err), err));
CL_CHECK((backend_ctx->kernel_sqrt_cont_f32_4 = clCreateKernel(prog, "kernel_sqrt_cont_f32_4", &err), err));
CL_CHECK((backend_ctx->kernel_sqrt_cont_f16 = clCreateKernel(prog, "kernel_sqrt_cont_f16", &err), err));
CL_CHECK((backend_ctx->kernel_sqrt_cont_f16_4 = clCreateKernel(prog, "kernel_sqrt_cont_f16_4", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mean
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "mean.cl.h"
};
#else
const std::string kernel_src = read_file("mean.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// sub
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@ -1825,6 +1889,24 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
}
}
// ssm_conv
{
#ifdef GGML_OPENCL_EMBED_KERNELS
const std::string kernel_src {
#include "ssm_conv.cl.h"
};
#else
const std::string kernel_src = read_file("ssm_conv.cl");
#endif
cl_program prog =
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32", &err), err));
CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32_4 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32_4", &err), err));
CL_CHECK(clReleaseProgram(prog));
GGML_LOG_CONT(".");
}
// mul_mv_id_q4_0_f32_8x_flat
{
#ifdef GGML_OPENCL_EMBED_KERNELS
@ -2959,6 +3041,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
(op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
case GGML_OP_ADD_ID:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SQR:
case GGML_OP_SQRT:
return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
ggml_is_contiguous(op->src[0]);
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_GELU:
@ -3000,13 +3086,16 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE: {
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
const bool antialias = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS);
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR);
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias;
}
case GGML_OP_CONV_2D:
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
(op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
(op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
case GGML_OP_SSM_CONV:
return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
case GGML_OP_CONCAT:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
case GGML_OP_TIMESTEP_EMBEDDING:
@ -3075,6 +3164,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
}
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
case GGML_OP_FLASH_ATTN_EXT:
{
@ -5193,6 +5283,224 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
}
}
static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
UNUSED(src1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
// Currently assumes src0 is contiguous
int n = ggml_nelements(dst);
if (n % 4 == 0) {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sqr_cont_f32_4;
} else {
kernel = backend_ctx->kernel_sqr_cont_f16_4;
}
n /= 4;
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sqr_cont_f32;
} else {
kernel = backend_ctx->kernel_sqr_cont_f16;
}
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
UNUSED(src1);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_kernel kernel;
// Currently assumes src0 is contiguous
int n = ggml_nelements(dst);
if (n % 4 == 0) {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sqrt_cont_f32_4;
} else {
kernel = backend_ctx->kernel_sqrt_cont_f16_4;
}
n /= 4;
} else {
if (src0->type == GGML_TYPE_F32) {
kernel = backend_ctx->kernel_sqrt_cont_f32;
} else {
kernel = backend_ctx->kernel_sqrt_cont_f16;
}
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
size_t global_work_size[] = {(size_t)n, 1, 1};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
GGML_UNUSED(src1);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(ggml_is_contiguous(src0));
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
const int ne03 = src0->ne[3];
const cl_ulong nb01 = src0->nb[1];
const cl_ulong nb02 = src0->nb[2];
const cl_ulong nb03 = src0->nb[3];
const cl_ulong nb1 = dst->nb[1];
const cl_ulong nb2 = dst->nb[2];
const cl_ulong nb3 = dst->nb[3];
cl_kernel kernel = backend_ctx->kernel_mean_f32;
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)64, 1, 1};
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}
static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
GGML_ASSERT(src1);
GGML_ASSERT(src1->extra);
GGML_ASSERT(dst);
GGML_ASSERT(dst->extra);
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offset1 = extra1->offset + src1->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
int ne01 = src0->ne[1];
cl_ulong nb00 = src0->nb[0];
cl_ulong nb01 = src0->nb[1];
cl_ulong nb02 = src0->nb[2];
int ne10 = src1->ne[0];
cl_ulong nb11 = src1->nb[1];
int ne1 = dst->ne[1];
int ne2 = dst->ne[2];
cl_ulong nb0 = dst->nb[0];
cl_ulong nb1 = dst->nb[1];
cl_ulong nb2 = dst->nb[2];
cl_kernel kernel = backend_ctx->kernel_ssm_conv_f32_f32;
if (ne10 % 4 == 0) {
kernel = backend_ctx->kernel_ssm_conv_f32_f32_4;
}
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb0));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
size_t global_work_size[] = {(size_t)ne01, (size_t)ne1, (size_t)ne2};
size_t local_work_size[] = {64, 1, 1};
size_t * local_work_size_ptr = local_work_size;
if (ne01 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
local_work_size_ptr = nullptr;
}
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0);
GGML_ASSERT(src0->extra);
@ -9091,6 +9399,24 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_sub;
break;
case GGML_OP_SQR:
if (!any_on_device) {
return false;
}
func = ggml_cl_sqr;
break;
case GGML_OP_SQRT:
if (!any_on_device) {
return false;
}
func = ggml_cl_sqrt;
break;
case GGML_OP_MEAN:
if (!any_on_device) {
return false;
}
func = ggml_cl_mean;
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_GELU:
@ -9192,6 +9518,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
}
func = ggml_cl_conv_2d;
break;
case GGML_OP_SSM_CONV:
if (!any_on_device) {
return false;
}
func = ggml_cl_ssm_conv;
break;
case GGML_OP_CONCAT:
if (!any_on_device) {
return false;

View File

@ -0,0 +1,39 @@
kernel void kernel_mean_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float *)((global char *)src0 + offset0);
dst = (global float *)((global char *)dst + offsetd);
int i3 = get_global_id(2);
int i2 = get_global_id(1);
int i1 = get_global_id(0);
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
global float * dst_row = (global float *) ((global char *) dst + i1*nb1 + i2*nb2 + i3*nb3);
float row_sum = 0;
for (int i0 = 0; i0 < ne00; i0++) {
row_sum += src_row[i0];
}
dst_row[0] = row_sum / ne00;
}

View File

@ -0,0 +1,53 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_sqr_cont_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = src0[gid] * src0[gid];
}
kernel void kernel_sqr_cont_f32_4(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = src0[gid] * src0[gid];
}
kernel void kernel_sqr_cont_f16(
global half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = src0[gid] * src0[gid];
}
kernel void kernel_sqr_cont_f16_4(
global half4 * src0,
ulong offset0,
global half4 * dst,
ulong offsetd
) {
src0 = (global half4*)((global char*)src0 + offset0);
dst = (global half4*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = src0[gid] * src0[gid];
}

View File

@ -0,0 +1,53 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_sqrt_cont_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = sqrt(src0[gid]);
}
kernel void kernel_sqrt_cont_f32_4(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = sqrt(src0[gid]);
}
kernel void kernel_sqrt_cont_f16(
global half * src0,
ulong offset0,
global half * dst,
ulong offsetd
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = convert_half(sqrt(convert_float(src0[gid])));
}
kernel void kernel_sqrt_cont_f16_4(
global half4 * src0,
ulong offset0,
global half4 * dst,
ulong offsetd
) {
src0 = (global half4*)((global char*)src0 + offset0);
dst = (global half4*)((global char*)dst + offsetd);
uint gid = get_global_id(0);
dst[gid] = convert_half4(sqrt(convert_float4(src0[gid])));
}

View File

@ -0,0 +1,77 @@
kernel void kernel_ssm_conv_f32_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb11,
ulong nb0,
ulong nb1,
ulong nb2
){
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int ir = get_global_id(0);
int i2 = get_global_id(1);
int i3 = get_global_id(2);
int nc = ne10;
global float * s = (global float *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
global float * c = (global float *) (src1 + ir*nb11);
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int i0 = 0; i0 < nc; ++i0) {
sumf += s[i0] * c[i0];
}
d[0] = sumf;
}
kernel void kernel_ssm_conv_f32_f32_4(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
ulong nb00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb11,
ulong nb0,
ulong nb1,
ulong nb2
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int ir = get_global_id(0);
int i2 = get_global_id(1);
int i3 = get_global_id(2);
int nc = ne10;
global float4 * s = (global float4 *) (src0 + ir*nb01 + i2*nb00 + i3*nb02);
global float4 * c = (global float4 *) (src1 + ir*nb11);
global float * d = (global float *) (dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int i0 = 0; i0 < nc/4; ++i0) {
sumf += dot(s[i0], c[i0]);
}
d[0] = sumf;
}

View File

@ -106,6 +106,7 @@ enum rpc_cmd {
RPC_CMD_GET_ALLOC_SIZE,
RPC_CMD_HELLO,
RPC_CMD_DEVICE_COUNT,
RPC_CMD_GRAPH_RECOMPUTE,
RPC_CMD_COUNT,
};
@ -205,10 +206,6 @@ struct rpc_msg_copy_tensor_rsp {
uint8_t result;
};
struct rpc_msg_graph_compute_rsp {
uint8_t result;
};
struct rpc_msg_get_device_memory_req {
uint32_t device;
};
@ -217,6 +214,11 @@ struct rpc_msg_get_device_memory_rsp {
uint64_t free_mem;
uint64_t total_mem;
};
struct rpc_msg_graph_recompute_req {
uint32_t device;
};
#pragma pack(pop)
// RPC data structures
@ -234,10 +236,35 @@ struct ggml_backend_rpc_buffer_type_context {
size_t max_size;
};
struct graph_cache {
bool is_cached(const ggml_cgraph * cgraph) {
if ((int)last_graph.size() != cgraph->n_nodes) {
return false;
}
for (int i = 0; i < cgraph->n_nodes; i++) {
if (memcmp(&last_graph[i], cgraph->nodes[i], sizeof(ggml_tensor)) != 0) {
return false;
}
}
return true;
}
void add(const ggml_cgraph * cgraph) {
last_graph.resize(cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
memcpy(&last_graph[i], cgraph->nodes[i], sizeof(ggml_tensor));
}
}
std::vector<ggml_tensor> last_graph;
};
struct ggml_backend_rpc_context {
std::string endpoint;
uint32_t device;
std::string name;
graph_cache gc;
};
struct ggml_backend_rpc_buffer_context {
@ -815,13 +842,24 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
std::vector<uint8_t> input;
serialize_graph(rpc_ctx->device, cgraph, input);
rpc_msg_graph_compute_rsp response;
auto sock = get_socket(rpc_ctx->endpoint);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size(), &response, sizeof(response));
RPC_STATUS_ASSERT(status);
return (enum ggml_status)response.result;
GGML_ASSERT(cgraph->n_nodes > 0);
bool reuse = rpc_ctx->gc.is_cached(cgraph);
if (reuse) {
rpc_msg_graph_recompute_req request;
request.device = rpc_ctx->device;
auto sock = get_socket(rpc_ctx->endpoint);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request));
RPC_STATUS_ASSERT(status);
} else {
rpc_ctx->gc.add(cgraph);
std::vector<uint8_t> input;
serialize_graph(rpc_ctx->device, cgraph, input);
auto sock = get_socket(rpc_ctx->endpoint);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input.data(), input.size());
RPC_STATUS_ASSERT(status);
}
return GGML_STATUS_SUCCESS;
}
static ggml_backend_i ggml_backend_rpc_interface = {
@ -880,7 +918,8 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) {
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
/* .endpoint = */ endpoint,
/* .device = */ device,
/* .name = */ dev_name
/* .name = */ dev_name,
/* .gc = */ {},
};
auto reg = ggml_backend_rpc_add_server(endpoint);
ggml_backend_t backend = new ggml_backend {
@ -920,8 +959,9 @@ void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device,
class rpc_server {
public:
rpc_server(std::vector<ggml_backend_t> backends, const char * cache_dir)
: backends(std::move(backends)), cache_dir(cache_dir) {
rpc_server(std::vector<ggml_backend_t> all_backends, const char * cache_dir)
: backends(std::move(all_backends)), cache_dir(cache_dir) {
stored_graphs.resize(backends.size());
}
~rpc_server();
@ -936,11 +976,17 @@ public:
bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
bool graph_compute(const std::vector<uint8_t> & input);
bool graph_recompute(const rpc_msg_graph_recompute_req & request);
bool init_tensor(const rpc_msg_init_tensor_req & request);
bool get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response);
bool get_device_memory(const rpc_msg_get_device_memory_req & request, rpc_msg_get_device_memory_rsp & response);
struct stored_graph {
ggml_context_ptr ctx_ptr;
ggml_cgraph * graph;
};
private:
bool get_cached_file(uint64_t hash, std::vector<uint8_t> & data);
ggml_tensor * deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor);
@ -953,6 +999,8 @@ private:
std::vector<ggml_backend_t> backends;
const char * cache_dir;
std::unordered_set<ggml_backend_buffer_t> buffers;
// store the last computed graph for each backend
std::vector<stored_graph> stored_graphs;
};
void rpc_server::hello(rpc_msg_hello_rsp & response) {
@ -1394,7 +1442,7 @@ ggml_tensor * rpc_server::create_node(uint64_t id,
return result;
}
bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response) {
bool rpc_server::graph_compute(const std::vector<uint8_t> & input) {
// serialization format:
// | device (4 bytes) | n_nodes (4 bytes) | nodes (n_nodes * sizeof(uint64_t) | n_tensors (4 bytes) | tensors (n_tensors * sizeof(rpc_tensor)) |
if (input.size() < 2*sizeof(uint32_t)) {
@ -1455,7 +1503,24 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
}
}
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
response.result = status;
GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
stored_graphs[device].ctx_ptr.swap(ctx_ptr);
stored_graphs[device].graph = graph;
return true;
}
bool rpc_server::graph_recompute(const rpc_msg_graph_recompute_req & request) {
uint32_t device = request.device;
if (device >= backends.size()) {
return false;
}
if (stored_graphs[device].graph == nullptr) {
return false;
}
ggml_cgraph * graph = stored_graphs[device].graph;
LOG_DBG("[%s] device: %u\n", __func__, device);
ggml_status status = ggml_backend_graph_compute(backends[device], graph);
GGML_ASSERT(status == GGML_STATUS_SUCCESS && "Unsuccessful graph computations are not supported with RPC");
return true;
}
@ -1690,11 +1755,17 @@ static void rpc_serve_client(const std::vector<ggml_backend_t> & backends, const
if (!recv_msg(sockfd, input)) {
return;
}
rpc_msg_graph_compute_rsp response;
if (!server.graph_compute(input, response)) {
if (!server.graph_compute(input)) {
return;
}
if (!send_msg(sockfd, &response, sizeof(response))) {
break;
}
case RPC_CMD_GRAPH_RECOMPUTE: {
rpc_msg_graph_recompute_req request;
if (!recv_msg(sockfd, &request, sizeof(request))) {
return;
}
if (!server.graph_recompute(request)) {
return;
}
break;

View File

@ -91,7 +91,10 @@ if (GGML_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# INFO: Allowed Sub_group_sizes are not consistent through all
@ -100,7 +103,8 @@ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
# default for other target
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
endif()
if (GGML_SYCL_GRAPH)

View File

@ -617,4 +617,30 @@ static __dpct_inline__ float get_alibi_slope(const float max_bias,
return dpct::pow(base, exph);
}
static const sycl::uint3 init_fastdiv_values(uint32_t d) {
GGML_ASSERT(d != 0);
uint32_t L = 0;
while (L < 32 && (uint32_t{ 1 } << L) < d) {
L++;
}
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
return sycl::uint3(mp, L, d);
}
static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_values) {
const uint32_t hi = sycl::mul_hi<unsigned>(n, fastdiv_values.x());
return (hi + n) >> fastdiv_values.y();
}
static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3 fastdiv_values) {
const uint32_t div_val = fastdiv(n, fastdiv_values);
const uint32_t mod_val = n - div_val * fastdiv_values.z();
return sycl::uint2(div_val, mod_val);
}
#endif // GGML_SYCL_COMMON_HPP

View File

@ -515,9 +515,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
const int64_t ne = ggml_nelements(src0);
GGML_ASSERT(ne == ggml_nelements(src1));
GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
GGML_TENSOR_BINARY_OP_LOCALS01;
SYCL_CHECK(ggml_sycl_set_device(ctx.device));

View File

@ -1787,6 +1787,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
const sycl::range<3> block_dims(1, 1, nth);
const sycl::range<3> block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);
GGML_ASSERT(shared_mem<=ggml_sycl_info().devices[device].smpbo);
if (order == GGML_SORT_ORDER_ASC) {
stream->submit([&](sycl::handler &cgh) {
@ -4348,6 +4349,9 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
}
static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_sycl_device_context *sycl_ctx =
(ggml_backend_sycl_device_context *)dev->context;
int device = sycl_ctx->device;
switch (op->op) {
case GGML_OP_CONV_TRANSPOSE_1D:
{
@ -4597,12 +4601,14 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_IM2COL:
return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS);
case GGML_OP_SUM:
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_ARGSORT:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_ARGSORT:
return op->src[0]->ne[0] * sizeof(int) <=
ggml_sycl_info().devices[device].smpbo;
case GGML_OP_POOL_2D:
case GGML_OP_ACC:
return true;

View File

@ -1,72 +1,100 @@
#include "pad_reflect_1d.hpp"
void pad_reflect_1d_f32(const float* src,float* dst,
const int64_t ne0, const int64_t ne02, const int p0, const int p1,
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
const sycl::nd_item<3> &item_ct1){
static void pad_reflect_1d_kernel_f32(
const void *__restrict__ src0, void *__restrict__ dst, const int64_t ne0,
const int64_t ne00, const sycl::uint3 ne01, const int64_t ne02,
const int64_t ne03, const int64_t nb00, const int64_t nb01,
const int64_t nb02, const int64_t nb03, const int64_t nb0,
const int64_t nb1, const int64_t nb2, const int64_t nb3, const int p0,
const int p1, sycl::nd_item<3> item_ct1) {
const int i0 = item_ct1.get_group(0) * SYCL_CONCAT_BLOCK_SIZE + item_ct1.get_local_id(0);
const int i1 = item_ct1.get_group(1);
const int g2 = item_ct1.get_group(2);
const int i2 = g2 % ne02;
const int i3 = g2 / ne02;
const int64_t i3 = item_ct1.get_group(0);
const int64_t i2 = item_ct1.get_group(1);
if (i0 >= p0 + ne0 + p1) return;
const sycl::uint2 div_mod_packed =
fast_div_modulo(item_ct1.get_group(2), ne01);
const int64_t tile1 = div_mod_packed.y();
const int64_t tile0 = div_mod_packed.x();
const int64_t i1 = tile1;
const int64_t i0 =
item_ct1.get_local_id(2) + tile0 * item_ct1.get_local_range(2);
int t = i0 - p0;
int period = 2 * ne0 -2;
int m = t % period;
m += (m < 0) * period;
int center = ne0 -1;
int srci0 = center - abs(center - m);
if (i0 >= ne0 || i1 >= ne01.z() || i2 >= ne02 || i3 >= ne03) {
return;
}
int offest_src = i3*nb3 + i2*nb2 + i1*nb1 + srci0*nb0;
int offest_dst = i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00;
dst[offest_dst] = src[offest_src];
const char *src0_ptr =
(const char *)src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
char *dst_ptr = (char *)dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
const int64_t rel_i0 = i0 - p0; // relative i0 in src0
int64_t src_idx;
if (rel_i0 < 0) {
// Left padding - reflect
src_idx = -rel_i0;
} else if (rel_i0 < ne00) {
// Middle - copy
src_idx = rel_i0;
} else {
// Right padding - reflect
src_idx = 2 * ne00 - 2 - rel_i0;
}
const float value = *(const float *)(src0_ptr + src_idx * nb00);
*(float *)(dst_ptr + i0 * nb0) = value;
GGML_UNUSED(p1);
}
void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context& ctx, ggml_tensor* dst){
void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context &ctx,
ggml_tensor *dst) {
const ggml_tensor * src0 = dst->src[0];
queue_ptr stream = ctx.stream();
const ggml_tensor *src0 = dst->src[0];
dpct::queue_ptr stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int32_t * opts = (const int32_t *) dst->op_params;
const int32_t *opts = (const int32_t *)dst->op_params;
const int p0 = opts[0];
const int p1 = opts[1];
const int64_t ne0 = src0->ne[0];
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const sycl::uint3 ne01_packed = init_fastdiv_values(ne01);
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne00 = dst->ne[0];
const int64_t ne01 = dst->ne[1];
const int64_t ne02 = dst->ne[2];
const int64_t ne03 = dst->ne[3];
const int64_t ne0 = dst->ne[0];
const int64_t nb00 = dst->nb[0];
const int64_t nb01 = dst->nb[1];
const int64_t nb02 = dst->nb[2];
const int64_t nb03 = dst->nb[3];
const int64_t nb0 = src0->nb[0];
const int64_t nb1 = src0->nb[1];
const int64_t nb2 = src0->nb[2];
const int64_t nb3 = src0->nb[3];
GGML_ASSERT(ne0 == ne00 + p0 + p1);
int num_blocks = (ne00 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
sycl::range<3> global(num_blocks * SYCL_CONCAT_BLOCK_SIZE, ne01, ne02*ne03);
sycl::range<3> local(SYCL_CONCAT_BLOCK_SIZE, 1, 1);
constexpr int64_t bx = SYCL_PAD_REFLECT_1D_BLOCK_SIZE;
const int64_t tiles0 = (ne0 + bx - 1) / bx;
const dpct::dim3 grid_dims((unsigned)(ne01 * tiles0), (unsigned)ne02,
(unsigned)ne03);
const dpct::dim3 block_dims((unsigned)bx, 1, 1);
stream->parallel_for(
sycl::nd_range<3>(global,
local),
[=](sycl::nd_item<3> item_ct1) { pad_reflect_1d_f32(
(const float *) src0->data, (float *) dst->data,
ne0, ne02, p0, p1,
nb0, nb1, nb2, nb3,
nb00, nb01, nb02, nb03
, item_ct1);
});
stream->submit([&](sycl::handler &cgh) {
auto src0_data_ct0 = src0->data;
auto dst_data_ct1 = dst->data;
auto src0_nb_ct7 = src0->nb[0];
auto src0_nb_ct8 = src0->nb[1];
auto src0_nb_ct9 = src0->nb[2];
auto src0_nb_ct10 = src0->nb[3];
auto dst_nb_ct11 = dst->nb[0];
auto dst_nb_ct12 = dst->nb[1];
auto dst_nb_ct13 = dst->nb[2];
auto dst_nb_ct14 = dst->nb[3];
cgh.parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
pad_reflect_1d_kernel_f32(
src0_data_ct0, dst_data_ct1, ne0, ne00,
ne01_packed, ne02, ne03, src0_nb_ct7,
src0_nb_ct8, src0_nb_ct9, src0_nb_ct10,
dst_nb_ct11, dst_nb_ct12, dst_nb_ct13,
dst_nb_ct14, p0, p1, item_ct1);
});
});
}

View File

@ -3,6 +3,8 @@
#include "common.hpp"
#define SYCL_PAD_REFLECT_1D_BLOCK_SIZE 256
void ggml_sycl_op_pad_reflect_1d(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
#endif // GGML_SYCL_PAD_REFLECT_1D_HPP

Some files were not shown because too many files have changed in this diff Show More