Merge branch 'master' into hksdpc255-patch-2

commit 819c76aeab
@@ -65,3 +65,34 @@ runs:
         echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
         echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
         echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 13.1
+      if: ${{ inputs.cuda_version == '13.1' }}
+      shell: pwsh
+      run: |
+        mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+        choco install unzip -y
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.2.0.9-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.1.68-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.1.80-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.1.68-archive.zip"
+        curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-13.1.78-archive.zip"
+        unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_crt-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cudart-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvcc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvrtc-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libcublas-windows-x86_64-13.2.0.9-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\libnvvm-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_nvtx-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_profiler_api-windows-x86_64-13.1.80-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\visual_studio_integration-windows-x86_64-13.1.68-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\cuda_cccl-windows-x86_64-13.1.78-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" /E /I /H /Y
+        echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+        echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+        echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
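Since the 13.1 toolchain above is assembled by hand from NVIDIA redist archives rather than by the official installer, it is worth sanity-checking the flattened layout before relying on it. A minimal sketch, assuming a POSIX-style shell such as Git Bash on the Windows runner (the paths simply mirror the step above):

```sh
# hedged sketch: confirm the hand-assembled CUDA 13.1 tree looks sane
CUDA_HOME="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.1"
"$CUDA_HOME/bin/nvcc" --version        # expect "release 13.1"
ls "$CUDA_HOME/bin" | grep -i cublas   # cuBLAS DLLs flattened in by the xcopy steps
ls "$CUDA_HOME/include" | head         # headers from the crt/cudart/cccl archives
```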
@@ -291,6 +291,7 @@ jobs:
             -DGGML_RVV=ON \
             -DGGML_RV_ZFH=ON \
             -DGGML_RV_ZICBOP=ON \
+            -DGGML_RV_ZIHINTPAUSE=ON \
             -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
             -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -1,120 +0,0 @@
-name: Build on RISCV Linux Machine by Cloud-V
-on:
-  pull_request:
-  workflow_dispatch:
-  workflow_call:
-
-jobs:
-  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: [self-hosted, RISCV64]
-
-    steps:
-      - name: Install prerequisites
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y libatomic1
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo apt-get update || true
-          sudo apt-get install -y --no-install-recommends \
-            build-essential \
-            gcc-14-riscv64-linux-gnu \
-            g++-14-riscv64-linux-gnu \
-            ccache \
-            cmake
-
-      - name: Setup ccache
-        run: |
-          mkdir -p $HOME/.ccache
-          ccache -M 5G -d $HOME/.ccache
-          export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-          export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-          echo "$GITHUB_WORKSPACE"
-          echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-          echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-          echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-          echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-      - name: Build
-        run: |
-          cmake -B build \
-            -DLLAMA_CURL=OFF \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DCMAKE_SYSTEM_NAME=Linux \
-            -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-            -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-            -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-            -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
-  #   runs-on: [self-hosted, RISCV64]
-
-  #   steps:
-  #     - name: Install prerequisites
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y libatomic1
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y --no-install-recommends \
-  #           build-essential \
-  #           gcc-14-riscv64-linux-gnu \
-  #           g++-14-riscv64-linux-gnu \
-  #           ccache \
-  #           cmake
-  #         sudo apt-get upgrade binutils -y
-
-  #     - name: Setup ccache
-  #       run: |
-  #         mkdir -p $HOME/.ccache
-  #         ccache -M 5G -d $HOME/.ccache
-  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-  #         echo "$GITHUB_WORKSPACE"
-  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build \
-  #           -DLLAMA_CURL=OFF \
-  #           -DCMAKE_BUILD_TYPE=Release \
-  #           -DGGML_OPENMP=OFF \
-  #           -DLLAMA_BUILD_EXAMPLES=ON \
-  #           -DLLAMA_BUILD_TOOLS=ON \
-  #           -DLLAMA_BUILD_TESTS=OFF \
-  #           -DCMAKE_SYSTEM_NAME=Linux \
-  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-  #           -DGGML_RVV=ON \
-  #           -DGGML_RV_ZFH=ON \
-  #           -DGGML_RV_ZICBOP=ON \
-  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
-  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
-  #         cmake --build build --config Release -j $(nproc)
@@ -547,6 +547,46 @@ jobs:
           # This is using llvmpipe and runs slower than other backends
           ctest -L main --verbose --timeout 3600

+  ubuntu-24-wasm-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-wasm-webgpu
+          evict-old-files: 1d
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_CURL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
     container: rocm/dev-ubuntu-22.04:6.1.2
@@ -1562,33 +1602,33 @@ jobs:
        run: |
          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD]
+  # ggml-ci-x64-amd-vulkan:
+  #   runs-on: [self-hosted, Linux, X64, AMD]

-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4

-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         vulkaninfo --summary
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

-  ggml-ci-x64-amd-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD]
+  # ggml-ci-x64-amd-rocm:
+  #   runs-on: [self-hosted, Linux, X64, AMD]

-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
+  #   steps:
+  #     - name: Clone
+  #       id: checkout
+  #       uses: actions/checkout@v4

-      - name: Test
-        id: ggml-ci
-        run: |
-          amd-smi static
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #     - name: Test
+  #       id: ggml-ci
+  #       run: |
+  #         amd-smi static
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

   ggml-ci-mac-metal:
     runs-on: [self-hosted, macOS, ARM64]
@@ -1642,6 +1682,337 @@ jobs:
        run: |
          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

+  ubuntu-cpu-cmake-riscv64-native:
+    runs-on: RISCV64
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+
+      - name: Setup ccache
+        run: |
+          # Set unique cache directory for this job
+          export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache for optimal performance
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+
+          # Enable more aggressive caching
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L 'main|curl' --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-cmake-sanitizer-riscv64-native:
+    runs-on: RISCV64
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug]
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          # Unique cache directory per matrix combination
+          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.sanitizer != 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=ON \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Build (no OpenMP)
+        id: cmake_build_no_openmp
+        if: ${{ matrix.sanitizer == 'THREAD' }}
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-llguidance-riscv64-native:
+    runs-on: RISCV64
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/llguidance-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DLLAMA_LLGUIDANCE=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-cmake-rpc-riscv64-native:
+    runs-on: RISCV64
+
+    continue-on-error: true
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+      - name: GCC version check
+        run: |
+          gcc --version
+          g++ --version
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup ccache
+        run: |
+          export CCACHE_DIR="$HOME/.ccache/rpc-riscv64"
+          mkdir -p "$CCACHE_DIR"
+
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_CURL=OFF \
+            -DLLAMA_OPENSSL=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+            -DGGML_RPC=ON
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
   ggml-ci-arm64-graviton4-kleidiai:
     runs-on: ah-ubuntu_22_04-c8g_8x
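Each native RISCV64 job above points ccache at its own directory under `$HOME/.ccache/` so matrix entries and sibling jobs never evict one another's objects. A small sketch for inspecting one of those caches on the runner, using only standard ccache commands and the directory name configured above:

```sh
export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
ccache --show-config | grep -E 'max_size|compression|sloppiness'
ccache --show-stats   # hit/miss counters accumulated across CI runs
ccache --zero-stats   # optional: reset before timing a single build
```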
@@ -66,14 +66,21 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

-      - name: Upload artifacts
+      - name: Upload artifacts (zip)
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
          name: llama-bin-macos-arm64.zip

+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
+          name: llama-bin-macos-arm64.tar.gz
+
   macOS-x64:
     runs-on: macos-15-intel
@@ -120,14 +127,21 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

-      - name: Upload artifacts
+      - name: Upload artifacts (zip)
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
          name: llama-bin-macos-x64.zip

+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
+          name: llama-bin-macos-x64.tar.gz
+
   ubuntu-22-cpu:
     strategy:
       matrix:
@@ -182,14 +196,21 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

-      - name: Upload artifacts
+      - name: Upload artifacts (zip)
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
          name: llama-bin-ubuntu-${{ matrix.build }}.zip

+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
+          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz
+
   ubuntu-22-vulkan:
     runs-on: ubuntu-22.04
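Note the two tar spellings: the macOS runners ship BSD tar, whose `-s ",./,prefix/,"` flag is an ed-style rename substitution, while the Ubuntu runners use GNU tar's sed-style `--transform`. Both rewrite member paths so the archive unpacks under a `llama-<tag>/` directory. A sketch with a made-up tag name:

```sh
# BSD tar (macOS): ed-style substitution
tar -czvf out.tar.gz -s ",./,llama-b9999/," -C ./build/bin .
# GNU tar (Linux): sed-style substitution
tar -czvf out.tar.gz --transform "s,./,llama-b9999/," -C ./build/bin .
tar -tzf out.tar.gz   # members should be listed under llama-b9999/
```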
@@ -235,14 +256,21 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+          zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

-      - name: Upload artifacts
+      - name: Upload artifacts (zip)
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
          name: llama-bin-ubuntu-vulkan-x64.zip

+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
+          name: llama-bin-ubuntu-vulkan-x64.tar.gz
+
   windows-cpu:
     runs-on: windows-2025
@@ -298,7 +326,7 @@ jobs:
        run: |
          Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\
          Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\
-          7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*
+          7z a -snl llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
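The `-snl` switch that now accompanies each `7z a` invocation stores symbolic links as links instead of dereferencing them into duplicate file copies — the 7-Zip counterpart of the `zip -y` change on the macOS/Linux runners. A minimal illustration with made-up file names:

```sh
ln -s libllama.so libllama.so.1
7z a plain.zip libllama.so.1        # default: the link target's bytes are archived
7z a -snl links.zip libllama.so.1   # -snl: the symlink itself is preserved
```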
@@ -380,7 +408,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll
+          7z a -snl llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -393,7 +421,7 @@ jobs:

    strategy:
      matrix:
-        cuda: ['12.4']
+        cuda: ['12.4', '13.1']

    steps:
      - name: Clone
@@ -434,7 +462,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll
+          7z a -snl llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -448,6 +476,7 @@ jobs:
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
+          robocopy "${{env.CUDA_PATH}}\bin\x64" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin
|
||||||
|
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin
|
||||||
|
|
||||||
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
|
||||||
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
|
||||||
|
|
@@ -526,7 +557,7 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin

          echo "cp oneAPI running time dll files to ./build/bin done"
-          7z a llama-bin-win-sycl-x64.zip ./build/bin/*
+          7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload the release package
        uses: actions/upload-artifact@v4
@@ -632,7 +663,7 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          7z a llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*
+          7z a -snl llama-bin-win-hip-${{ matrix.name }}-x64.zip .\build\bin\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@@ -685,58 +716,20 @@ jobs:
      - name: Pack artifacts
        id: pack_artifacts
        run: |
-          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip -y -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          tar -czvf llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz -C build-apple llama.xcframework

-      - name: Upload artifacts
+      - name: Upload artifacts (zip)
        uses: actions/upload-artifact@v4
        with:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-          name: llama-${{ steps.tag.outputs.name }}-xcframework
+          name: llama-${{ steps.tag.outputs.name }}-xcframework.zip

-  openEuler-cann:
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    container: ascendai/cann:${{ matrix.chip_type == '910b' && '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Dependencies
-        run: |
-          yum update -y
-          yum install -y git gcc gcc-c++ make cmake libcurl-devel
-          git config --global --add safe.directory "$GITHUB_WORKSPACE"
-
-      - name: Build
-        run: |
-          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-
-          cmake -S . -B build \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-            -DGGML_CANN=on \
-            -DSOC_TYPE=ascend${{ matrix.chip_type }}
-          cmake --build build -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
-          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+      - name: Upload artifacts (tar)
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz
+          name: llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz

   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -759,7 +752,6 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
-      - openEuler-cann

    steps:
      - name: Clone
@@ -814,6 +806,7 @@ jobs:

          echo "Moving other artifacts..."
          mv -v artifact/*.zip release
+          mv -v artifact/*.tar.gz release

      - name: Create release
        id: create_release
@@ -822,6 +815,34 @@ jobs:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ steps.tag.outputs.name }}
+          body: |
+            > [!WARNING]
+            > **Release Format Update**: Linux releases will soon use .tar.gz archives instead of .zip. Please make the necessary changes to your deployment scripts.
+
+            <details open>
+
+            ${{ github.event.head_commit.message }}
+
+            </details>
+
+            **macOS/iOS:**
+            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
+            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
+            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.tar.gz)
+
+            **Linux:**
+            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
+            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
+            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
+
+            **Windows:**
+            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
+            - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip)
+            - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip)
+            - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.1-x64.zip)
+            - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip)
+            - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip)
+            - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip)

      - name: Upload release
        id: upload_release
@@ -833,7 +854,7 @@ jobs:
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
            for (let file of await fs.readdirSync('./release')) {
-              if (path.extname(file) === '.zip') {
+              if (path.extname(file) === '.zip' || file.endsWith('.tar.gz')) {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
@@ -9,6 +9,7 @@ jobs:
  update:
    name: Update Winget Package
    runs-on: ubuntu-latest
+    if: github.repository_owner == 'ggml-org'

    steps:
      - name: Install cargo binstall
@@ -134,3 +134,5 @@ poetry.toml
 # IDE
 /*.code-workspace
 /.windsurf/
+# emscripten
+a.out.*
@@ -33,10 +33,24 @@ endif()

 option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)

+option(LLAMA_WASM_MEM64 "llama: use 64-bit memory in WASM builds" ON)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)

-    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+    # Use 64-bit memory to support backend_get_memory queries
+    # TODO: analyze performance impact, see https://spidermonkey.dev/blog/2025/01/15/is-memory64-actually-worth-using
+    if (LLAMA_WASM_MEM64)
+        add_compile_options("-sMEMORY64=1")
+        add_link_options("-sMEMORY64=1")
+    endif()
+    add_link_options("-sALLOW_MEMORY_GROWTH=1")
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" OFF)
+    option(LLAMA_BUILD_HTML "llama: build HTML file" ON)
+    if (LLAMA_BUILD_HTML)
+        set(CMAKE_EXECUTABLE_SUFFIX ".html")
+    endif()
 else()
     if (MINGW)
         set(BUILD_SHARED_LIBS_DEFAULT OFF)
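With these options in place, a WASM configure can toggle the new knobs explicitly. A hedged sketch (values are illustrative; the option names are the ones defined above):

```sh
source emsdk/emsdk_env.sh
# opt out of 64-bit memory and of the .html executable suffix
emcmake cmake -B build-wasm \
  -DLLAMA_WASM_MEM64=OFF \
  -DLLAMA_BUILD_HTML=OFF \
  -DLLAMA_CURL=OFF
cmake --build build-wasm -j
```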
@@ -58,6 +72,12 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

+if (LLAMA_STANDALONE)
+    # enable parallel builds for msbuild
+    list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
+    list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
+endif()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
     set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
 else()
@@ -179,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # build the library
 #
CODEOWNERS
@@ -7,16 +7,20 @@
 /ci/ @ggerganov
 /cmake/ @ggerganov
 /common/CMakeLists.txt @ggerganov
-/common/arg.* @ggerganov @ericcurtin
+/common/arg.* @ggerganov
 /common/base64.hpp.* @ggerganov
 /common/build-info.* @ggerganov
+/common/chat.* @pwilkin
+/common/chat-peg-parser.* @aldehir
 /common/common.* @ggerganov
 /common/console.* @ggerganov
 /common/http.* @angt
 /common/llguidance.* @ggerganov
 /common/log.* @ggerganov
+/common/peg-parser.* @aldehir
 /common/sampling.* @ggerganov
 /common/speculative.* @ggerganov
+/common/unicode.* @aldehir
 /convert_*.py @CISC
 /examples/batched.swift/ @ggerganov
 /examples/batched/ @ggerganov
@@ -81,14 +85,14 @@
 /src/llama-vocab.* @CISC
 /src/models/ @CISC
 /tests/ @ggerganov
+/tests/test-chat-.* @pwilkin
 /tools/batched-bench/ @ggerganov
 /tools/main/ @ggerganov
 /tools/mtmd/ @ngxson
 /tools/perplexity/ @ggerganov
 /tools/quantize/ @ggerganov
 /tools/rpc/ @rgerganov
-/tools/run/ @ericcurtin
-/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/* @ngxson @ggerganov # no subdir
 /tools/server/webui/ @allozaur
 /tools/tokenize/ @ggerganov
 /tools/tts/ @ggerganov
@@ -16,7 +16,7 @@ The project differentiates between 3 levels of contributors:
 - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- If your PR becomes stale, rebase it on top of latest `master` to get maintainers attention
 - Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
 - Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.
@@ -61,7 +61,7 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
-- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
+- RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -276,6 +276,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
 | [HIP](docs/build.md#hip) | AMD GPU |
+| [ZenDNN](docs/build.md#zendnn) | AMD CPU |
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
@@ -45,7 +45,7 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
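The `${LLAMA_FATAL_WARNINGS:-ON}` expansion keeps `ON` as the default while letting callers override it from the environment:

```sh
# default: warnings are fatal
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# override for an exploratory run
LLAMA_FATAL_WARNINGS=OFF bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
```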
@@ -39,26 +39,10 @@ if(Git_FOUND)
     endif()
 endif()

-if(MSVC)
     set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")

     if(CMAKE_VS_PLATFORM_NAME)
         set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
     else()
         set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
     endif()
-else()
-    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} --version
-        OUTPUT_VARIABLE OUT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
-    set(BUILD_COMPILER ${OUT})
-
-    execute_process(
-        COMMAND ${CMAKE_C_COMPILER} -dumpmachine
-        OUTPUT_VARIABLE OUT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-    )
-    set(BUILD_TARGET ${OUT})
-endif()
@@ -52,6 +52,8 @@ add_library(${TARGET} STATIC
     chat-parser.h
     chat-parser-xml-toolcall.h
     chat-parser-xml-toolcall.cpp
+    chat-peg-parser.cpp
+    chat-peg-parser.h
     chat.cpp
     chat.h
     common.cpp
@@ -69,12 +71,16 @@ add_library(${TARGET} STATIC
     log.h
     ngram-cache.cpp
     ngram-cache.h
+    peg-parser.cpp
+    peg-parser.h
     regex-partial.cpp
     regex-partial.h
     sampling.cpp
     sampling.h
     speculative.cpp
     speculative.h
+    unicode.cpp
+    unicode.h
 )

 if (BUILD_SHARED_LIBS)
@@ -30,6 +30,7 @@
 #include <thread> // for hardware_concurrency
 #include <vector>

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -41,6 +42,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 using json = nlohmann::ordered_json;
@@ -424,7 +427,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
         throw std::invalid_argument("error: --model is required\n");
     }
@@ -705,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.use_jinja = true;
     }

+    params.use_color = tty_can_use_colors();
+
     // load dynamic backends
     ggml_backend_load_all();
@@ -787,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
-        {"-co", "--color"},
-        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
-        [](common_params & params) {
-            params.use_color = true;
+        {"-co", "--color"}, "[on|off|auto]",
+        "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.use_color = true;
+            } else if (is_falsey(value)) {
+                params.use_color = false;
+            } else if (is_autoy(value)) {
+                params.use_color = tty_can_use_colors();
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
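
Note: the is_truthy/is_falsey/is_autoy helpers referenced above are not shown in this excerpt. As a rough, standalone sketch of the tri-state resolution the handler implements (the accepted spellings "on"/"off"/"auto" are an assumption; the real helpers may accept more synonyms):

    #include <stdexcept>
    #include <string>

    // Sketch only: mirrors the --color handler above.
    static bool resolve_color_flag(const std::string & value, bool tty_has_colors) {
        if (value == "on")   { return true; }
        if (value == "off")  { return false; }
        if (value == "auto") { return tty_has_colors; }
        throw std::invalid_argument("unknown value for --color: " + value);
    }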
@@ -1019,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
             } else {
                 throw std::runtime_error(
-                    string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
             }
         }).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -1226,7 +1241,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -2488,12 +2503,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "path to save slot kv cache (default: disabled)",
         [](common_params & params, const std::string & value) {
             params.slot_save_path = value;
+            if (!fs_is_directory(params.slot_save_path)) {
+                throw std::invalid_argument("not a directory: " + value);
+            }
             // if doesn't end with DIRECTORY_SEPARATOR, add it
             if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
                 params.slot_save_path += DIRECTORY_SEPARATOR;
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--media-path"}, "PATH",
+        "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
+        [](common_params & params, const std::string & value) {
+            params.media_path = value;
+            if (!fs_is_directory(params.media_path)) {
+                throw std::invalid_argument("not a directory: " + value);
+            }
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                params.media_path += DIRECTORY_SEPARATOR;
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--models-dir"}, "PATH",
         "directory containing models for the router server (default: disabled)",
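
The --slot-save-path and --media-path handlers above validate a directory and normalize the trailing separator the same way; a hypothetical shared helper (not part of this change) could deduplicate them, using the fs_is_directory() and DIRECTORY_SEPARATOR introduced elsewhere in this diff:

    #include "common.h"
    #include <stdexcept>
    #include <string>

    // Hypothetical refactor sketch, not from the diff.
    static std::string require_dir_with_separator(const std::string & value) {
        if (!fs_is_directory(value)) {
            throw std::invalid_argument("not a directory: " + value);
        }
        std::string path = value;
        if (!path.empty() && path.back() != DIRECTORY_SEPARATOR) {
            path += DIRECTORY_SEPARATOR;
        }
        return path;
    }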
@@ -2676,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
             } else {
                 throw std::invalid_argument(
-                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
             }
         }
     ).set_env("LLAMA_LOG_COLORS"));
@@ -1,6 +1,8 @@
 #include "chat-parser.h"
+#include "chat-peg-parser.h"
 #include "common.h"
 #include "log.h"
+#include "peg-parser.h"
 #include "regex-partial.h"

 #include <algorithm>
@@ -1505,6 +1507,11 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
 }

 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
+        syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        return common_chat_peg_parse(syntax.parser, input, is_partial, syntax);
+    }
     common_chat_msg_parser builder(input, is_partial, syntax);
     try {
         common_chat_parse(builder);
@@ -1522,3 +1529,36 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     }
     return msg;
 }
+
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+    if (parser.empty()) {
+        throw std::runtime_error("Failed to parse due to missing parser definition.");
+    }
+
+    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(syntax.format), input.c_str());
+
+    common_peg_parse_context ctx(input, is_partial);
+    auto result = parser.parse(ctx);
+    if (result.fail()) {
+        throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end));
+    }
+
+    common_chat_msg msg;
+    msg.role = "assistant";
+
+    if (syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE) {
+        auto mapper = common_chat_peg_native_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else if (syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
+        auto mapper = common_chat_peg_constructed_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    } else {
+        // Generic mapper
+        auto mapper = common_chat_peg_mapper(msg);
+        mapper.from_ast(ctx.ast, result);
+    }
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
+    return msg;
+}
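
As a usage sketch (not from the diff), the dispatch added in common_chat_parse() means callers keep a single entry point and only need to populate the syntax with a PEG format and a prebuilt arena:

    #include "chat.h"

    // Sketch, assuming a common_peg_arena built elsewhere (e.g. via the
    // build_chat_peg_*_parser helpers from chat-peg-parser.h).
    static common_chat_msg parse_streamed_chunk(const common_peg_arena & arena, const std::string & text) {
        common_chat_syntax syntax;
        syntax.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
        syntax.parser = arena;
        // is_partial = true is threaded through to common_peg_parse_context,
        // which is how incomplete streamed output is handled.
        return common_chat_parse(text, /* is_partial */ true, syntax);
    }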
@@ -0,0 +1,114 @@
+#include "chat-peg-parser.h"
+
+#include <nlohmann/json.hpp>
+
+using json = nlohmann::json;
+
+static std::string_view trim_trailing_space(std::string_view sv) {
+    while (!sv.empty() && std::isspace(static_cast<unsigned char>(sv.back()))) {
+        sv.remove_suffix(1);
+    }
+    return sv;
+}
+
+void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
+    arena.visit(result, [this](const common_peg_ast_node & node) {
+        map(node);
+    });
+}
+
+void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
+    bool is_reasoning = node.tag == common_chat_peg_builder::REASONING;
+    bool is_content = node.tag == common_chat_peg_builder::CONTENT;
+
+    if (is_reasoning) {
+        result.reasoning_content = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_content) {
+        result.content = std::string(trim_trailing_space(node.text));
+    }
+}
+
+void common_chat_peg_native_mapper::map(const common_peg_ast_node & node) {
+    common_chat_peg_mapper::map(node);
+
+    bool is_tool_open = node.tag == common_chat_peg_native_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_native_builder::TOOL_NAME;
+    bool is_tool_id = node.tag == common_chat_peg_native_builder::TOOL_ID;
+    bool is_tool_args = node.tag == common_chat_peg_native_builder::TOOL_ARGS;
+
+    if (is_tool_open) {
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back();
+    }
+
+    if (is_tool_id && current_tool) {
+        current_tool->id = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_name && current_tool) {
+        current_tool->name = std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_args && current_tool) {
+        current_tool->arguments = std::string(trim_trailing_space(node.text));
+    }
+}
+
+void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) {
+    common_chat_peg_mapper::map(node);
+
+    bool is_tool_open = node.tag == common_chat_peg_constructed_builder::TOOL_OPEN;
+    bool is_tool_name = node.tag == common_chat_peg_constructed_builder::TOOL_NAME;
+    bool is_tool_close = node.tag == common_chat_peg_constructed_builder::TOOL_CLOSE;
+    bool is_arg_open = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_OPEN;
+    bool is_arg_close = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_CLOSE;
+    bool is_arg_name = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_NAME;
+    bool is_arg_string = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_STRING_VALUE;
+    bool is_arg_json = node.tag == common_chat_peg_constructed_builder::TOOL_ARG_JSON_VALUE;
+
+    if (is_tool_open) {
+        result.tool_calls.emplace_back();
+        current_tool = &result.tool_calls.back();
+        arg_count = 0;
+    }
+
+    if (is_tool_name) {
+        current_tool->name = std::string(node.text);
+        current_tool->arguments = "{";
+    }
+
+    if (is_arg_open) {
+        needs_closing_quote = false;
+    }
+
+    if (is_arg_name && current_tool) {
+        if (arg_count > 0) {
+            current_tool->arguments += ",";
+        }
+        current_tool->arguments += json(trim_trailing_space(node.text)).dump() + ":";
+        ++arg_count;
+    }
+
+    if (is_arg_string && current_tool) {
+        // Serialize to JSON, but exclude the end quote
+        std::string dumped = json(node.text).dump();
+        current_tool->arguments += dumped.substr(0, dumped.size() - 1);
+        needs_closing_quote = true;
+    }
+
+    if (is_arg_close && current_tool) {
+        if (needs_closing_quote) {
+            current_tool->arguments += "\"";
+        }
+    }
+
+    if (is_arg_json && current_tool) {
+        current_tool->arguments += std::string(trim_trailing_space(node.text));
+    }
+
+    if (is_tool_close && current_tool) {
+        current_tool->arguments += "}";
+    }
+}
@@ -0,0 +1,105 @@
+#pragma once
+
+#include "chat.h"
+#include "peg-parser.h"
+
+class common_chat_peg_builder : public common_peg_parser_builder {
+public:
+    static constexpr const char * REASONING_BLOCK = "reasoning-block";
+    static constexpr const char * REASONING = "reasoning";
+    static constexpr const char * CONTENT = "content";
+
+    common_peg_parser reasoning_block(const common_peg_parser & p) { return tag(REASONING_BLOCK, p); }
+    common_peg_parser reasoning(const common_peg_parser & p) { return tag(REASONING, p); }
+    common_peg_parser content(const common_peg_parser & p) { return tag(CONTENT, p); }
+};
+
+inline common_peg_arena build_chat_peg_parser(const std::function<common_peg_parser(common_chat_peg_builder & builder)> & fn) {
+    common_chat_peg_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
+
+class common_chat_peg_mapper {
+public:
+    common_chat_msg & result;
+
+    common_chat_peg_mapper(common_chat_msg & msg) : result(msg) {}
+
+    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
+    virtual void map(const common_peg_ast_node & node);
+};
+
+class common_chat_peg_native_builder : public common_chat_peg_builder {
+public:
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_ID = "tool-id";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARGS = "tool-args";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_id(const common_peg_parser & p) { return atomic(tag(TOOL_ID, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_args(const common_peg_parser & p) { return tag(TOOL_ARGS, p); }
+};
+
+class common_chat_peg_native_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool;
+
+public:
+    common_chat_peg_native_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_native_parser(const std::function<common_peg_parser(common_chat_peg_native_builder & builder)> & fn) {
+    common_chat_peg_native_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
+
+class common_chat_peg_constructed_builder : public common_chat_peg_builder {
+public:
+    static constexpr const char * TOOL = "tool";
+    static constexpr const char * TOOL_OPEN = "tool-open";
+    static constexpr const char * TOOL_CLOSE = "tool-close";
+    static constexpr const char * TOOL_NAME = "tool-name";
+    static constexpr const char * TOOL_ARG = "tool-arg";
+    static constexpr const char * TOOL_ARG_OPEN = "tool-arg-open";
+    static constexpr const char * TOOL_ARG_CLOSE = "tool-arg-close";
+    static constexpr const char * TOOL_ARG_NAME = "tool-arg-name";
+    static constexpr const char * TOOL_ARG_STRING_VALUE = "tool-arg-string-value";
+    static constexpr const char * TOOL_ARG_JSON_VALUE = "tool-arg-json-value";
+
+    common_peg_parser tool(const common_peg_parser & p) { return tag(TOOL, p); }
+    common_peg_parser tool_open(const common_peg_parser & p) { return atomic(tag(TOOL_OPEN, p)); }
+    common_peg_parser tool_close(const common_peg_parser & p) { return atomic(tag(TOOL_CLOSE, p)); }
+    common_peg_parser tool_name(const common_peg_parser & p) { return atomic(tag(TOOL_NAME, p)); }
+    common_peg_parser tool_arg(const common_peg_parser & p) { return tag(TOOL_ARG, p); }
+    common_peg_parser tool_arg_open(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_OPEN, p)); }
+    common_peg_parser tool_arg_close(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_CLOSE, p)); }
+    common_peg_parser tool_arg_name(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_NAME, p)); }
+    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_JSON_VALUE, p); }
+};
+
+class common_chat_peg_constructed_mapper : public common_chat_peg_mapper {
+    common_chat_tool_call * current_tool;
+    int arg_count = 0;
+    bool needs_closing_quote = false;
+
+public:
+    common_chat_peg_constructed_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
+
+    void map(const common_peg_ast_node & node) override;
+};
+
+inline common_peg_arena build_chat_peg_constructed_parser(const std::function<common_peg_parser(common_chat_peg_constructed_builder & builder)> & fn) {
+    common_chat_peg_constructed_builder builder;
+    builder.set_root(fn(builder));
+    return builder.build();
+}
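
A rough sketch of how these builders compose (hypothetical, not from the diff): the tag()/atomic()/set_root()/build() members are the only ones visible in this excerpt, so the literal() and until() combinator names below are assumptions about the common_peg_parser_builder API in peg-parser.h (which does define until/literal parser variants):

    #include "chat-peg-parser.h"

    // Hypothetical sketch: tag <think>...</think> as reasoning and the
    // remainder up to an assumed terminator as content. Combinator names
    // literal()/until() are assumed, not confirmed by this diff.
    static common_peg_arena make_think_parser() {
        return build_chat_peg_parser([](common_chat_peg_builder & b) {
            auto think = b.reasoning_block(
                b.literal("<think>") + b.reasoning(b.until("</think>")) + b.literal("</think>"));
            return think + b.content(b.until("<|end|>"));
        });
    }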
@@ -85,29 +85,36 @@ json common_chat_msg::to_json_oaicompat() const
     return message;
 }

-std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
+std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) {
     std::vector<common_chat_msg_diff> diffs;
-    if (previous_msg.reasoning_content != new_msg.reasoning_content) {
-        auto & diff = diffs.emplace_back();
-        diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
-    }
-    if (previous_msg.content != new_msg.content) {
-        auto & diff = diffs.emplace_back();
-        diff.content_delta = string_diff(previous_msg.content, new_msg.content);
+    if (msg_new.tool_calls.size() > msg_prv.tool_calls.size()) {
+        diffs.reserve(msg_new.tool_calls.size() - msg_prv.tool_calls.size() + 3);
+    } else {
+        diffs.reserve(3);
     }

-    if (new_msg.tool_calls.size() < previous_msg.tool_calls.size()) {
+    // TODO: these can become expensive for long messages - how to optimize?
+    if (msg_prv.reasoning_content != msg_new.reasoning_content) {
+        auto & diff = diffs.emplace_back();
+        diff.reasoning_content_delta = string_diff(msg_prv.reasoning_content, msg_new.reasoning_content);
+    }
+    if (msg_prv.content != msg_new.content) {
+        auto & diff = diffs.emplace_back();
+        diff.content_delta = string_diff(msg_prv.content, msg_new.content);
+    }
+
+    if (msg_new.tool_calls.size() < msg_prv.tool_calls.size()) {
         throw std::runtime_error("Invalid diff: now finding less tool calls!");
     }

-    if (!previous_msg.tool_calls.empty()) {
-        auto idx = previous_msg.tool_calls.size() - 1;
-        const auto & pref = previous_msg.tool_calls[idx];
-        const auto & newf = new_msg.tool_calls[idx];
+    if (!msg_prv.tool_calls.empty()) {
+        const auto idx = msg_prv.tool_calls.size() - 1;
+        const auto & pref = msg_prv.tool_calls[idx];
+        const auto & newf = msg_new.tool_calls[idx];
         if (pref.name != newf.name) {
             throw std::runtime_error("Invalid diff: tool call mismatch!");
         }
-        auto args_diff = string_diff(pref.arguments, newf.arguments);
+        const auto args_diff = string_diff(pref.arguments, newf.arguments);
         if (!args_diff.empty() || pref.id != newf.id) {
             auto & diff = diffs.emplace_back();
             diff.tool_call_index = idx;
@@ -118,11 +125,12 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
             diff.tool_call_delta.arguments = args_diff;
         }
     }
-    for (size_t idx = previous_msg.tool_calls.size(); idx < new_msg.tool_calls.size(); ++idx) {
+    for (size_t idx = msg_prv.tool_calls.size(); idx < msg_new.tool_calls.size(); ++idx) {
         auto & diff = diffs.emplace_back();
         diff.tool_call_index = idx;
-        diff.tool_call_delta = new_msg.tool_calls[idx];
+        diff.tool_call_delta = msg_new.tool_calls[idx];
    }

     return diffs;
 }
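
For context, a minimal sketch of how these diffs are typically consumed during streaming, using only fields visible in this diff (the printf destination is illustrative):

    #include "chat.h"
    #include <cstdio>

    // Sketch: emit only what changed between two successive parses of the
    // accumulating assistant message.
    static void emit_deltas(const common_chat_msg & prev, const common_chat_msg & next) {
        for (const auto & d : common_chat_msg_diff::compute_diffs(prev, next)) {
            if (!d.reasoning_content_delta.empty()) {
                std::printf("[reasoning] %s", d.reasoning_content_delta.c_str());
            }
            if (!d.content_delta.empty()) {
                std::printf("%s", d.content_delta.c_str());
            }
        }
    }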
@@ -163,7 +171,7 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     if (tool_choice == "required") {
         return COMMON_CHAT_TOOL_CHOICE_REQUIRED;
     }
-    throw std::runtime_error("Invalid tool_choice: " + tool_choice);
+    throw std::invalid_argument("Invalid tool_choice: " + tool_choice);
 }

 bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
@@ -186,17 +194,17 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
     try {

         if (!messages.is_array()) {
-            throw std::runtime_error("Expected 'messages' to be an array, got " + messages.dump());
+            throw std::invalid_argument("Expected 'messages' to be an array, got " + messages.dump());
         }

         for (const auto & message : messages) {
             if (!message.is_object()) {
-                throw std::runtime_error("Expected 'message' to be an object, got " + message.dump());
+                throw std::invalid_argument("Expected 'message' to be an object, got " + message.dump());
             }

             common_chat_msg msg;
             if (!message.contains("role")) {
-                throw std::runtime_error("Missing 'role' in message: " + message.dump());
+                throw std::invalid_argument("Missing 'role' in message: " + message.dump());
             }
             msg.role = message.at("role");
@@ -209,11 +217,11 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
             } else if (content.is_array()) {
                 for (const auto & part : content) {
                     if (!part.contains("type")) {
-                        throw std::runtime_error("Missing content part type: " + part.dump());
+                        throw std::invalid_argument("Missing content part type: " + part.dump());
                     }
                     const auto & type = part.at("type");
                     if (type != "text") {
-                        throw std::runtime_error("Unsupported content part type: " + type.dump());
+                        throw std::invalid_argument("Unsupported content part type: " + type.dump());
                     }
                     common_chat_msg_content_part msg_part;
                     msg_part.type = type;
@@ -221,25 +229,25 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
                     msg.content_parts.push_back(msg_part);
                 }
             } else if (!content.is_null()) {
-                throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
+                throw std::invalid_argument("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
             }
         }
         if (has_tool_calls) {
             for (const auto & tool_call : message.at("tool_calls")) {
                 common_chat_tool_call tc;
                 if (!tool_call.contains("type")) {
-                    throw std::runtime_error("Missing tool call type: " + tool_call.dump());
+                    throw std::invalid_argument("Missing tool call type: " + tool_call.dump());
                 }
                 const auto & type = tool_call.at("type");
                 if (type != "function") {
-                    throw std::runtime_error("Unsupported tool call type: " + tool_call.dump());
+                    throw std::invalid_argument("Unsupported tool call type: " + tool_call.dump());
                 }
                 if (!tool_call.contains("function")) {
-                    throw std::runtime_error("Missing tool call function: " + tool_call.dump());
+                    throw std::invalid_argument("Missing tool call function: " + tool_call.dump());
                 }
                 const auto & fc = tool_call.at("function");
                 if (!fc.contains("name")) {
-                    throw std::runtime_error("Missing tool call name: " + tool_call.dump());
+                    throw std::invalid_argument("Missing tool call name: " + tool_call.dump());
                 }
                 tc.name = fc.at("name");
                 tc.arguments = fc.at("arguments");
@@ -250,7 +258,7 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messa
             }
         }
         if (!has_content && !has_tool_calls) {
-            throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
+            throw std::invalid_argument("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)");
         }
         if (message.contains("reasoning_content")) {
             msg.reasoning_content = message.at("reasoning_content");
@@ -353,18 +361,18 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
     try {
         if (!tools.is_null()) {
             if (!tools.is_array()) {
-                throw std::runtime_error("Expected 'tools' to be an array, got " + tools.dump());
+                throw std::invalid_argument("Expected 'tools' to be an array, got " + tools.dump());
             }
             for (const auto & tool : tools) {
                 if (!tool.contains("type")) {
-                    throw std::runtime_error("Missing tool type: " + tool.dump());
+                    throw std::invalid_argument("Missing tool type: " + tool.dump());
                 }
                 const auto & type = tool.at("type");
                 if (!type.is_string() || type != "function") {
-                    throw std::runtime_error("Unsupported tool type: " + tool.dump());
+                    throw std::invalid_argument("Unsupported tool type: " + tool.dump());
                 }
                 if (!tool.contains("function")) {
-                    throw std::runtime_error("Missing tool function: " + tool.dump());
+                    throw std::invalid_argument("Missing tool function: " + tool.dump());
                 }

                 const auto & function = tool.at("function");
@@ -649,6 +657,9 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+        case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
+        case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
+        case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
         case COMMON_CHAT_FORMAT_DEEPSEEK_V3_2: return "DeepSeek V3.2";
         default:
             throw std::runtime_error("Unknown chat format");
@@ -3,6 +3,7 @@
 #pragma once

 #include "common.h"
+#include "peg-parser.h"
 #include <functional>
 #include <chrono>
 #include <string>
@@ -76,7 +77,7 @@ struct common_chat_msg_diff {
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;

-    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new);

     bool operator==(const common_chat_msg_diff & other) const {
         return content_delta == other.content_delta
@@ -125,6 +126,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
     COMMON_CHAT_FORMAT_DEEPSEEK_V3_2,

+    // These are intended to be parsed by the PEG parser
+    COMMON_CHAT_FORMAT_PEG_SIMPLE,
+    COMMON_CHAT_FORMAT_PEG_NATIVE,
+    COMMON_CHAT_FORMAT_PEG_CONSTRUCTED,
+
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -155,6 +161,7 @@ struct common_chat_params {
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
+    std::string parser;
 };

 struct common_chat_syntax {
@@ -164,6 +171,7 @@ struct common_chat_syntax {
     bool reasoning_in_content = false;
     bool thinking_forced_open = false;
     bool parse_tool_calls = true;
+    common_peg_arena parser = {};
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -207,6 +215,7 @@ const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
 common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -694,7 +694,7 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over

 // Validate if a filename is safe to use
 // To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool fs_validate_filename(const std::string & filename) {
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
     if (!filename.length()) {
         // Empty filename invalid
         return false;
@@ -754,10 +754,14 @@ bool fs_validate_filename(const std::string & filename) {
             || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
             || c == 0xFFFD // Replacement Character (UTF-8)
             || c == 0xFEFF // Byte Order Mark (BOM)
-            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == ':' || c == '*' // Illegal characters
            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
            return false;
        }
+        if (!allow_subdirs && (c == '/' || c == '\\')) {
+            // Subdirectories not allowed, reject path separators
+            return false;
+        }
     }

     // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
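
The intended behavior of the new flag, expressed as a small illustrative sketch (these specific inputs are not tests from the diff, and other validation rules not shown here may also apply):

    #include "common.h"
    #include <cassert>

    int main() {
        assert( fs_validate_filename("model.gguf"));
        assert(!fs_validate_filename("org/model.gguf"));                            // separators rejected by default
        assert( fs_validate_filename("org/model.gguf", /* allow_subdirs */ true));  // separators now permitted
        return 0;
    }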
@@ -782,11 +786,29 @@
 #include <iostream>

+#ifdef _WIN32
+static std::wstring utf8_to_wstring(const std::string & str) {
+    if (str.empty()) {
+        return std::wstring();
+    }
+
+    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
+
+    if (size <= 0) {
+        return std::wstring();
+    }
+
+    std::wstring wstr(size, 0);
+    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
+
+    return wstr;
+}
+#endif
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
+    std::wstring wpath = utf8_to_wstring(path);

     // if the path already exists, check whether it's a directory
     const DWORD attributes = GetFileAttributesW(wpath.c_str());
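
This swaps out std::wstring_convert/std::codecvt_utf8, which were deprecated in C++17, for a direct MultiByteToWideChar conversion. A Windows-only usage sketch from within the same translation unit (utf8_to_wstring is static; the path below is illustrative):

    #ifdef _WIN32
    #include <windows.h>
    #include <string>

    // Sketch: convert a UTF-8 path before handing it to a wide-character API.
    static bool make_dir_utf8(const std::string & utf8_path) {
        const std::wstring wpath = utf8_to_wstring(utf8_path);
        return CreateDirectoryW(wpath.c_str(), NULL) != 0;
    }
    #endif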
@@ -859,6 +881,11 @@ bool fs_create_directory_with_parents(const std::string & path) {
 #endif // _WIN32
 }

+bool fs_is_directory(const std::string & path) {
+    std::filesystem::path dir(path);
+    return std::filesystem::exists(dir) && std::filesystem::is_directory(dir);
+}
+
 std::string fs_get_cache_directory() {
     std::string cache_directory = "";
     auto ensure_trailing_slash = [](std::string p) {
@@ -893,6 +920,8 @@ std::string fs_get_cache_directory() {
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
         cache_directory = std::getenv("LOCALAPPDATA");
+#elif defined(__EMSCRIPTEN__)
+        GGML_ABORT("not implemented on this platform");
 #else
 #  error Unknown architecture
 #endif
@@ -953,6 +982,32 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
     return files;
 }

+//
+// TTY utils
+//
+
+bool tty_can_use_colors() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 //
 // Model utils
 //
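
This helper is the single source of truth for color auto-detection, shared by the logger (see log.cpp below) and the new --color auto mode. A trivial usage sketch:

    #include "common.h"
    #include <cstdio>

    int main() {
        // Honors NO_COLOR, TERM=dumb, and whether stdout/stderr are TTYs.
        std::printf("colors: %s\n", tty_can_use_colors() ? "enabled" : "disabled");
        return 0;
    }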
@@ -12,6 +12,10 @@
 #include <vector>
 #include <map>

+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+#define _WIN32_WINNT 0x0A00
+#endif
+
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
@@ -485,6 +489,7 @@ struct common_params {
     bool log_json = false;

     std::string slot_save_path;
+    std::string media_path; // path to directory for loading media files

     float slot_prompt_similarity = 0.1f;
@@ -635,8 +640,9 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 // Filesystem utils
 //

-bool fs_validate_filename(const std::string & filename);
+bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
 bool fs_create_directory_with_parents(const std::string & path);
+bool fs_is_directory(const std::string & path);

 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
@@ -649,6 +655,13 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);

+//
+// TTY utils
+//
+
+// Auto-detect if colors can be enabled based on terminal and environment
+bool tty_can_use_colors();
+
 //
 // Model utils
 //
@@ -24,6 +24,7 @@
 #include "http.h"
 #endif

+#ifndef __EMSCRIPTEN__
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
@@ -35,6 +36,8 @@
 #else
 #include <sys/syslimits.h>
 #endif
+#endif

 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

 // isatty
@@ -974,7 +974,7 @@ public:

     void check_errors() {
         if (!_errors.empty()) {
-            throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
+            throw std::invalid_argument("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
         }
         if (!_warnings.empty()) {
             fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
@@ -1,3 +1,4 @@
+#include "common.h"
 #include "log.h"

 #include <chrono>
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }

-// Auto-detect if colors should be enabled based on terminal and environment
-static bool common_log_should_use_colors_auto() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}
-
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
         // Set default to auto-detect colors
-        log.set_colors(common_log_should_use_colors_auto());
+        log.set_colors(tty_can_use_colors());
     });

     return &log;
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {

 void common_log_set_colors(struct common_log * log, log_colors colors) {
     if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(common_log_should_use_colors_auto());
+        log->set_colors(tty_can_use_colors());
         return;
     }
 }
(File diff suppressed because it is too large)
@@ -0,0 +1,459 @@
+#pragma once
+
+#include <nlohmann/json_fwd.hpp>
+
+#include <memory>
+#include <unordered_map>
+#include <string>
+#include <string_view>
+#include <functional>
+#include <vector>
+#include <variant>
+
+struct common_grammar_builder;
+
+class common_peg_parser_builder;
+
+using common_peg_parser_id = size_t;
+constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
+
+using common_peg_ast_id = size_t;
+constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
+
+// Lightweight wrapper around common_peg_parser_id for convenience
+class common_peg_parser {
+    common_peg_parser_id id_;
+    common_peg_parser_builder & builder_;
+
+public:
+    common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
+    common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
+
+    common_peg_parser & operator=(const common_peg_parser & other);
+    common_peg_parser & operator+=(const common_peg_parser & other);
+    common_peg_parser & operator|=(const common_peg_parser & other);
+
+    operator common_peg_parser_id() const { return id_; }
+    common_peg_parser_id id() const { return id_; }
+
+    common_peg_parser_builder & builder() const { return builder_; }
+
+    // Creates a sequence
+    common_peg_parser operator+(const common_peg_parser & other) const;
+
+    // Creates a sequence separated by spaces.
+    common_peg_parser operator<<(const common_peg_parser & other) const;
+
+    // Creates a choice
+    common_peg_parser operator|(const common_peg_parser & other) const;
+
+    common_peg_parser operator+(const char * str) const;
+    common_peg_parser operator+(const std::string & str) const;
+    common_peg_parser operator<<(const char * str) const;
+    common_peg_parser operator<<(const std::string & str) const;
+    common_peg_parser operator|(const char * str) const;
+    common_peg_parser operator|(const std::string & str) const;
+};
+
+common_peg_parser operator+(const char * str, const common_peg_parser & p);
+common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator<<(const char * str, const common_peg_parser & p);
+common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
+common_peg_parser operator|(const char * str, const common_peg_parser & p);
+common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
+
+enum common_peg_parse_result_type {
+    COMMON_PEG_PARSE_RESULT_FAIL = 0,
+    COMMON_PEG_PARSE_RESULT_SUCCESS = 1,
+    COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
+};
+
+const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
+
+struct common_peg_ast_node {
+    common_peg_ast_id id;
+    std::string rule;
+    std::string tag;
+    size_t start;
+    size_t end;
+    std::string_view text;
+    std::vector<common_peg_ast_id> children;
+
+    bool is_partial = false;
+};
+
+struct common_peg_parse_result;
+
+using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
+
+class common_peg_ast_arena {
+    std::vector<common_peg_ast_node> nodes_;
+public:
+    common_peg_ast_id add_node(
+        const std::string & rule,
+        const std::string & tag,
+        size_t start,
+        size_t end,
+        std::string_view text,
+        std::vector<common_peg_ast_id> children,
+        bool is_partial = false
+    ) {
+        common_peg_ast_id id = nodes_.size();
+        nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
+        return id;
+    }
+
+    const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
+
+    size_t size() const { return nodes_.size(); }
+
+    void clear() { nodes_.clear(); }
+
+    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
+    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
+};
+
+struct common_peg_parse_result {
+    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
+    size_t start = 0;
+    size_t end = 0;
+
+    std::vector<common_peg_ast_id> nodes;
+
+    common_peg_parse_result() = default;
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start)
+        : type(type), start(start), end(start) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
+        : type(type), start(start), end(end) {}
+
+    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
+        : type(type), start(start), end(end), nodes(std::move(nodes)) {}
+
+    bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
+    bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
+    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
+};
+
+struct common_peg_parse_context {
+    std::string input;
+    bool is_partial;
+    common_peg_ast_arena ast;
+
+    int parse_depth;
+
+    common_peg_parse_context()
+        : is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input)
+        : input(input), is_partial(false), parse_depth(0) {}
+
+    common_peg_parse_context(const std::string & input, bool is_partial)
+        : input(input), is_partial(is_partial), parse_depth(0) {}
+};
+
+class common_peg_arena;
+
+// Parser variants
+struct common_peg_epsilon_parser {};
+
+struct common_peg_start_parser {};
+
+struct common_peg_end_parser {};
+
+struct common_peg_literal_parser {
+    std::string literal;
+};
+
+struct common_peg_sequence_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_choice_parser {
+    std::vector<common_peg_parser_id> children;
+};
+
+struct common_peg_repetition_parser {
+    common_peg_parser_id child;
+    int min_count;
+    int max_count; // -1 for unbounded
+};
+
+struct common_peg_and_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_not_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_any_parser {};
+
+struct common_peg_space_parser {};
+
+struct common_peg_chars_parser {
+    struct char_range {
+        uint32_t start;
+        uint32_t end;
+        bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
+    };
+
+    std::string pattern;
+    std::vector<char_range> ranges;
+    bool negated;
+    int min_count;
+    int max_count; // -1 for unbounded
+};
+
+struct common_peg_json_string_parser {};
+
+struct common_peg_until_parser {
+    std::vector<std::string> delimiters;
+};
+
+struct common_peg_schema_parser {
+    common_peg_parser_id child;
+    std::string name;
+    std::shared_ptr<nlohmann::ordered_json> schema;
+
+    // Indicates if the GBNF should accept a raw string that matches the schema.
+    bool raw;
+};
+
+struct common_peg_rule_parser {
+    std::string name;
+    common_peg_parser_id child;
+    bool trigger;
+};
+
+struct common_peg_ref_parser {
+    std::string name;
+};
+
+struct common_peg_atomic_parser {
+    common_peg_parser_id child;
+};
+
+struct common_peg_tag_parser {
+    common_peg_parser_id child;
+    std::string tag;
+};
+
+// Variant holding all parser types
+using common_peg_parser_variant = std::variant<
+    common_peg_epsilon_parser,
+    common_peg_start_parser,
+    common_peg_end_parser,
+    common_peg_literal_parser,
+    common_peg_sequence_parser,
+    common_peg_choice_parser,
+    common_peg_repetition_parser,
+    common_peg_and_parser,
+    common_peg_not_parser,
+    common_peg_any_parser,
+    common_peg_space_parser,
+    common_peg_chars_parser,
+    common_peg_json_string_parser,
+    common_peg_until_parser,
+    common_peg_schema_parser,
+    common_peg_rule_parser,
+    common_peg_ref_parser,
+    common_peg_atomic_parser,
+    common_peg_tag_parser
+>;
+
+class common_peg_arena {
+    std::vector<common_peg_parser_variant> parsers_;
+    std::unordered_map<std::string, common_peg_parser_id> rules_;
+    common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
+
+public:
+    const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
+    common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
+
+    size_t size() const { return parsers_.size(); }
+    bool empty() const { return parsers_.empty(); }
+
+    common_peg_parser_id get_rule(const std::string & name) const;
+    bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
+
+    common_peg_parser_id root() const { return root_; }
+    void set_root(common_peg_parser_id id) { root_ = id; }
+
+    common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
+    common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
+
+    void resolve_refs();
+
+    void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
+
+    std::string dump(common_peg_parser_id id) const;
+
+    nlohmann::json to_json() const;
+    static common_peg_arena from_json(const nlohmann::json & j);
+
+    std::string save() const;
|
||||||
|
void load(const std::string & data);
|
||||||
|
|
||||||
|
friend class common_peg_parser_builder;
|
||||||
|
|
||||||
|
private:
|
||||||
|
common_peg_parser_id add_parser(common_peg_parser_variant parser);
|
||||||
|
void add_rule(const std::string & name, common_peg_parser_id id);
|
||||||
|
|
||||||
|
common_peg_parser_id resolve_ref(common_peg_parser_id id);
|
||||||
|
};
|
||||||
|
|
||||||
|
class common_peg_parser_builder {
|
||||||
|
common_peg_arena arena_;
|
||||||
|
|
||||||
|
common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
|
||||||
|
common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
|
||||||
|
|
||||||
|
public:
|
||||||
|
common_peg_parser_builder();
|
||||||
|
|
||||||
|
// Match nothing, always succeed.
|
||||||
|
// S -> ε
|
||||||
|
common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
|
||||||
|
|
||||||
|
// Matches the start of the input.
|
||||||
|
// S -> ^
|
||||||
|
common_peg_parser start() { return add(common_peg_start_parser{}); }
|
||||||
|
|
||||||
|
// Matches the end of the input.
|
||||||
|
// S -> $
|
||||||
|
common_peg_parser end() { return add(common_peg_end_parser{}); }
|
||||||
|
|
||||||
|
// Matches an exact literal string.
|
||||||
|
// S -> "hello"
|
||||||
|
common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
|
||||||
|
|
||||||
|
// Matches a sequence of parsers in order, all must succeed.
|
||||||
|
// S -> A B C
|
||||||
|
common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
|
||||||
|
common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
|
||||||
|
common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
|
||||||
|
common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
|
||||||
|
|
||||||
|
// Matches the first parser that succeeds from a list of alternatives.
|
||||||
|
// S -> A | B | C
|
||||||
|
common_peg_parser choice() { return add(common_peg_choice_parser{}); }
|
||||||
|
common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
|
||||||
|
common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
|
||||||
|
common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
|
||||||
|
|
||||||
|
// Matches one or more repetitions of a parser.
|
||||||
|
// S -> A+
|
||||||
|
common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
|
||||||
|
|
||||||
|
// Matches zero or more repetitions of a parser, always succeeds.
|
||||||
|
// S -> A*
|
||||||
|
common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
|
||||||
|
|
||||||
|
// Matches zero or one occurrence of a parser, always succeeds.
|
||||||
|
// S -> A?
|
||||||
|
common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
|
||||||
|
|
||||||
|
// Positive lookahead: succeeds if child parser succeeds, consumes no input.
|
||||||
|
// S -> &A
|
||||||
|
common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
|
||||||
|
|
||||||
|
// Negative lookahead: succeeds if child parser fails, consumes no input.
|
||||||
|
// S -> !A
|
||||||
|
common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
|
||||||
|
|
||||||
|
// Matches any single character.
|
||||||
|
// S -> .
|
||||||
|
common_peg_parser any() { return add(common_peg_any_parser{}); }
|
||||||
|
|
||||||
|
// Matches between min and max repetitions of characters from a character class.
|
||||||
|
// S -> [a-z]{m,n}
|
||||||
|
//
|
||||||
|
// Use -1 for max to represent unbounded repetition (equivalent to {m,})
|
||||||
|
common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
|
||||||
|
|
||||||
|
// Creates a lightweight reference to a named rule (resolved during build()).
|
||||||
|
// Use this for forward references in recursive grammars.
|
||||||
|
// expr_ref -> expr
|
||||||
|
common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
|
||||||
|
|
||||||
|
// Matches zero or more whitespace characters (space, tab, newline).
|
||||||
|
// S -> [ \t\n]*
|
||||||
|
common_peg_parser space() { return add(common_peg_space_parser{}); }
|
||||||
|
|
||||||
|
// Matches all characters until a delimiter is found (delimiter not consumed).
|
||||||
|
// S -> (!delim .)*
|
||||||
|
common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
|
||||||
|
|
||||||
|
// Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
|
||||||
|
// S -> (!delim .)*
|
||||||
|
common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
|
||||||
|
|
||||||
|
// Matches everything
|
||||||
|
// S -> .*
|
||||||
|
common_peg_parser rest() { return until_one_of({}); }
|
||||||
|
|
||||||
|
// Matches between min and max repetitions of a parser (inclusive).
|
||||||
|
// S -> A{m,n}
|
||||||
|
// Use -1 for max to represent unbounded repetition (equivalent to {m,})
|
||||||
|
common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
|
||||||
|
|
||||||
|
// Matches exactly n repetitions of a parser.
|
||||||
|
// S -> A{n}
|
||||||
|
common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
|
||||||
|
|
||||||
|
// Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
|
||||||
|
// value -> object | array | string | number | true | false | null
|
||||||
|
common_peg_parser json();
|
||||||
|
common_peg_parser json_object();
|
||||||
|
common_peg_parser json_string();
|
||||||
|
common_peg_parser json_array();
|
||||||
|
common_peg_parser json_number();
|
||||||
|
common_peg_parser json_bool();
|
||||||
|
common_peg_parser json_null();
|
||||||
|
|
||||||
|
// Matches JSON string content without the surrounding quotes.
|
||||||
|
// Useful for extracting content within a JSON string.
|
||||||
|
common_peg_parser json_string_content();
|
||||||
|
|
||||||
|
// Matches a JSON object member with a key and associated parser as the
|
||||||
|
// value.
|
||||||
|
common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
|
||||||
|
|
||||||
|
// Wraps a parser with JSON schema metadata for grammar generation.
|
||||||
|
// Used internally to convert JSON schemas to GBNF grammar rules.
|
||||||
|
common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
|
||||||
|
|
||||||
|
// Creates a named rule, stores it in the grammar, and returns a ref.
|
||||||
|
// If trigger=true, marks this rule as an entry point for lazy grammar generation.
|
||||||
|
// auto json = p.rule("json", json_obj | json_arr | ...)
|
||||||
|
common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
|
||||||
|
|
||||||
|
// Creates a named rule using a builder function, and returns a ref.
|
||||||
|
// If trigger=true, marks this rule as an entry point for lazy grammar generation.
|
||||||
|
// auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
|
||||||
|
common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
|
||||||
|
|
||||||
|
// Creates a trigger rule. When generating a lazy grammar from the parser,
|
||||||
|
// only trigger rules and descendents are emitted.
|
||||||
|
common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
|
||||||
|
common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
|
||||||
|
|
||||||
|
// Creates an atomic parser. Atomic parsers do not create an AST node if
|
||||||
|
// the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
|
||||||
|
// intended for situations where partial output is undesirable.
|
||||||
|
common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
|
||||||
|
|
||||||
|
// Tags create nodes in the generated AST for semantic purposes.
|
||||||
|
// Unlike rules, you can tag multiple nodes with the same tag.
|
||||||
|
common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
|
||||||
|
|
||||||
|
void set_root(const common_peg_parser & p);
|
||||||
|
|
||||||
|
common_peg_arena build();
|
||||||
|
};
|
||||||
|
|
||||||
|
// Helper function for building parsers
|
||||||
|
common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);
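
// A minimal usage sketch (illustrative, not part of the header): build an
// arena with the builder DSL above, then run it over a (possibly partial)
// input. The grammar shown is a hypothetical example, not library API.
//
//   common_peg_arena arena = build_peg_parser([](common_peg_parser_builder & p) {
//       auto word = p.chars("[a-z]", 1, -1);
//       return p.sequence({ p.start(), word, p.end() });
//   });
//
//   common_peg_parse_context ctx("hello", /* is_partial */ false);
//   common_peg_parse_result res = arena.parse(ctx);
//   if (res.need_more_input()) { /* wait for more streamed output */ }
//   if (res.success())         { /* walk ctx.ast with a visitor     */ }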
@ -0,0 +1,64 @@
#include "unicode.h"

// implementation adopted from src/unicode.cpp

size_t utf8_sequence_length(unsigned char first_byte) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4;
    return lookup[highbits];
}

utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
    if (offset >= input.size()) {
        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
    }

    // ASCII fast path
    if (!(input[offset] & 0x80)) {
        return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
    }

    // Invalid: continuation byte as first byte
    if (!(input[offset] & 0x40)) {
        return utf8_parse_result(utf8_parse_result::INVALID);
    }

    // 2-byte sequence
    if (!(input[offset] & 0x20)) {
        if (offset + 1 >= input.size()) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if ((input[offset + 1] & 0xc0) != 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
    }

    // 3-byte sequence
    if (!(input[offset] & 0x10)) {
        if (offset + 2 >= input.size()) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
    }

    // 4-byte sequence
    if (!(input[offset] & 0x08)) {
        if (offset + 3 >= input.size()) {
            return utf8_parse_result(utf8_parse_result::INCOMPLETE);
        }
        if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
            return utf8_parse_result(utf8_parse_result::INVALID);
        }
        auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
        return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
    }

    // Invalid first byte
    return utf8_parse_result(utf8_parse_result::INVALID);
}
@ -0,0 +1,22 @@
#pragma once

#include <cstdint>
#include <string_view>

// UTF-8 parsing utilities for streaming-aware unicode support

struct utf8_parse_result {
    uint32_t codepoint;     // Decoded codepoint (only valid if status == SUCCESS)
    size_t bytes_consumed;  // How many bytes this codepoint uses (1-4)
    enum status { SUCCESS, INCOMPLETE, INVALID } status;

    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
        : codepoint(cp), bytes_consumed(bytes), status(s) {}
};

// Determine the expected length of a UTF-8 sequence from its first byte.
// Note: continuation bytes (which are invalid as a first byte) map to 1;
// use parse_utf8_codepoint for actual validation.
size_t utf8_sequence_length(unsigned char first_byte);

// Parse a single UTF-8 codepoint from input
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);
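
// Example (illustrative): decode a buffer codepoint by codepoint, stopping on
// a truncated trailing sequence so a streaming caller can wait for more bytes.
//
//   size_t pos = 0;
//   while (pos < input.size()) {
//       utf8_parse_result r = parse_utf8_codepoint(input, pos);
//       if (r.status == utf8_parse_result::INCOMPLETE) break; // need more input
//       if (r.status == utf8_parse_result::INVALID)    break; // malformed byte
//       pos += r.bytes_consumed;                              // r.codepoint is valid here
//   }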
@ -1524,6 +1524,79 @@ class TextModel(ModelBase):
        special_vocab._set_special_token("bos", 151643)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_mistral(self):
        if not _mistral_common_installed:
            raise ImportError(_mistral_import_error_msg)

        vocab = MistralVocab(self.dir_model)
        logger.info(
            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
        )

        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)

        tokens = []
        scores = []
        toktypes = []

        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size, (
            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
        )

        if vocab.tokenizer_type == MistralTokenizerType.tekken:
            self.gguf_writer.add_tokenizer_pre("tekken")
            self.gguf_writer.add_token_merges(
                vocab.extract_vocab_merges_from_model()
            )

        logger.info(
            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
        )

        self.gguf_writer.add_bos_token_id(vocab.bos_id)
        self.gguf_writer.add_eos_token_id(vocab.eos_id)
        self.gguf_writer.add_unk_token_id(vocab.unk_id)
        self.gguf_writer.add_pad_token_id(vocab.pad_id)

        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_vocab_size(vocab.vocab_size)

        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(False)

        local_template_file_path = self.dir_model / "chat_template.jinja"

        if self.is_mistral_format and local_template_file_path.is_file():
            # Ministral-3 and other new Mistral models come with chat templates.
            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
            logger.info("Using an existing Mistral local chat template.")

            with open(local_template_file_path, "r", encoding="utf-8") as f:
                template = f.read()
        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
            template_dir = Path(__file__).parent / "models/templates/"

            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
            if self.is_mistral_format:
                logger.info(
                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
                )
            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
        else:
            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
            template = None

        if template is not None:
            self.gguf_writer.add_chat_template(template)


class MmprojModel(ModelBase):
    model_type = ModelType.MMPROJ
@ -2294,67 +2367,6 @@ class LlamaModel(TextModel):
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

    def _set_vocab_mistral(self):
        if not _mistral_common_installed:
            raise ImportError(_mistral_import_error_msg)

        vocab = MistralVocab(self.dir_model)
        logger.info(
            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
        )

        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)

        tokens = []
        scores = []
        toktypes = []

        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

        assert len(tokens) == vocab.vocab_size, (
            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
        )

        if vocab.tokenizer_type == MistralTokenizerType.tekken:
            self.gguf_writer.add_tokenizer_pre("tekken")
            self.gguf_writer.add_token_merges(
                vocab.extract_vocab_merges_from_model()
            )

        logger.info(
            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
        )

        self.gguf_writer.add_bos_token_id(vocab.bos_id)
        self.gguf_writer.add_eos_token_id(vocab.eos_id)
        self.gguf_writer.add_unk_token_id(vocab.unk_id)
        self.gguf_writer.add_pad_token_id(vocab.pad_id)

        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_vocab_size(vocab.vocab_size)

        self.gguf_writer.add_add_bos_token(True)
        self.gguf_writer.add_add_eos_token(False)

        template_dir = Path(__file__).parent / "models/templates/"

        if not self.is_mistral_format or not self.disable_mistral_community_chat_template:
            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
            if self.is_mistral_format:
                logger.info(
                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
                )
            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
            self.gguf_writer.add_chat_template(template)
        else:
            logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")

    def set_vocab(self):
        if self.is_mistral_format:
            return self._set_vocab_mistral()
@ -2842,6 +2854,10 @@ class Mistral3Model(LlamaModel):
            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
        if name.endswith("weight_scale_inv"):
            raise ValueError("This is a quantized weight, please use BF16 weight instead")

        name = name.replace("language_model.", "")
        if "multi_modal_projector" in name or "vision_tower" in name:
            return []
@ -9908,17 +9924,109 @@ class MistralModel(LlamaModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)

    @staticmethod
    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
        if "yarn" in hparams:
            yarn_params = hparams["yarn"]
            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
            gguf_writer.add_rope_scaling_yarn_log_mul(1.0)  # mscale_all_dim
            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])

        if "llama_4_scaling" in hparams:
            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])


class MistralMoeModel(DeepseekV2Model):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
    model_name = "Mistral"
    hf_arch = ""
    is_mistral_format = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        logger.info("Using MistralMoeModel")
        # remap hparams from Mistral MoE format to DeepseekV2 format
        # we do it this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
        config = self.hparams
        # Mistral key -> HF key
        config_mapping = {
            "dim": "hidden_size",
            "norm_eps": "rms_norm_eps",
            "n_kv_heads": "num_key_value_heads",
            "n_layers": "num_hidden_layers",
            "n_heads": "num_attention_heads",
            "hidden_dim": "intermediate_size",
        }
        # HF key -> (Mistral key, default value)
        top_level_mapping_with_default = {
            "model_type": ("model_type", "transformer"),
            "hidden_act": ("activation", "silu"),
            "tie_word_embeddings": ("tied_embeddings", False),
            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
            "max_position_embeddings": ("max_position_embeddings", 128_000),
        }
        # mapping top-level keys
        for key, new_key in config_mapping.items():
            if key in config:
                config[new_key] = config[key]
        for new_key, (key, default_value) in top_level_mapping_with_default.items():
            config[new_key] = config.get(key, default_value)
        # mapping MoE-specific keys
        moe_config_map = {
            "route_every_n": "moe_layer_freq",
            "first_k_dense_replace": "first_k_dense_replace",
            "num_experts_per_tok": "num_experts_per_tok",
            "num_experts": "n_routed_experts",
            "expert_hidden_dim": "moe_intermediate_size",
            "routed_scale": "routed_scaling_factor",
            "num_shared_experts": "n_shared_experts",
            "num_expert_groups": "n_group",
            "num_expert_groups_per_tok": "topk_group",
        }
        moe = config["moe"]
        for key, new_key in moe_config_map.items():
            if key in moe:
                config[new_key] = moe[key]
        # provide missing values
        config["topk_method"] = None
        config["norm_topk_prob"] = True
        config["scoring_func"] = "softmax"

    def set_vocab(self):
        self._set_vocab_mistral()

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
        yarn_params = self.hparams["yarn"]
        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1)  # mscale_all_dim * 0.1

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
            return []

        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
        if name.endswith(".qscale_act"):
            name = name.replace(".qscale_act", ".input_scale")
        if name.endswith(".qscale_weight"):
            name = name.replace(".qscale_weight", ".weight_scale")
        if ".wkv_b." in name:
            name = name.replace(".wkv_b.", ".kv_b_proj.")
        if ".experts." in name:
            name = name.replace(".experts.", ".mlp.experts.")
            name = name.replace(".w1.", ".gate_proj.")
            name = name.replace(".w2.", ".down_proj.")
            name = name.replace(".w3.", ".up_proj.")
        name = "model." + name

        return super().modify_tensors(data_torch, name, bid)


class PixtralModel(LlavaVisionModel):
@ -10474,6 +10582,8 @@ def main() -> None:
    elif args.mmproj:
        assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
        model_class = PixtralModel
    elif "moe" in hparams:
        model_class = MistralMoeModel
    else:
        model_class = MistralModel
@ -0,0 +1,258 @@
# llama.cpp for AMD ZenDNN

> [!WARNING]
> **Note:** ZenDNN is **not** the same as zDNN.
> - **ZenDNN** (this page): AMD's deep learning library for AMD EPYC CPUs
> - **zDNN**: IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes ([see zDNN documentation](zDNN.md))

- [Background](#background)
- [OS](#os)
- [Hardware](#hardware)
- [Supported Operations](#supported-operations)
- [DataType Supports](#datatype-supports)
- [Linux](#linux)
- [Environment Variable](#environment-variable)
- [Performance Optimization](#performance-optimization)
- [Known Issues](#known-issues)
- [Q&A](#qa)
- [TODO](#todo)

## Background

**ZenDNN** (Zen Deep Neural Network Library) is AMD's high-performance deep learning inference library optimized for AMD EPYC™ CPUs. It provides optimized implementations of key deep learning primitives and operations, delivering significant performance improvements for neural network workloads on AMD Zen-based processor architectures.

**Llama.cpp + ZenDNN**

The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN).

For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html

## OS

| OS    | Status  | Verified                   |
|:-----:|:-------:|:--------------------------:|
| Linux | Support | Ubuntu 20.04, 22.04, 24.04 |

For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os).

## Hardware

### AMD CPUs

**Recommended Processors**

ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based on the "Zen" microarchitecture and newer.

| CPU Family                     | Status  | Notes                              |
|:------------------------------:|:-------:|:----------------------------------:|
| AMD EPYC™ 9005 Series (Turin)  | Support | 5th Gen - Zen 5 architecture       |
| AMD EPYC™ 9004 Series (Genoa)  | Support | 4th Gen - Zen 4 architecture       |
| AMD EPYC™ 7003 Series (Milan)  | Support | 3rd Gen - Zen 3 architecture       |
| AMD Ryzen™ AI MAX (Strix Halo) | Support | High-performance mobile processors |

*Notes:*

- Best performance is achieved on AMD EPYC™ processors with high core counts (e.g., EPYC 9005 series).
- ZenDNN leverages AMD's advanced CPU features, including the AVX2 and AVX-512 instruction sets.
- For optimal performance, ensure your system has sufficient memory bandwidth.

## Supported Operations

The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** operations only. Other operations are handled by the standard CPU backend.

| Operation | Status | Notes                                |
|:----------|:------:|:------------------------------------:|
| MUL_MAT   | ✓      | Accelerated via ZenDNN LowOHA MatMul |

*Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs).

## DataType Supports

| DataType | Status  | Notes                                      |
|:--------:|:-------:|:------------------------------------------:|
| FP32     | Support | Full precision floating point              |
| BF16     | Support | BFloat16 (best performance on Zen 4/Zen 5) |

*Notes:*

- **BF16** provides the best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).

## Linux

### I. Setup Environment

You have two options to set up ZenDNN:

#### Option 1: Automatic Download and Build (Recommended)

CMake will automatically download and build ZenDNN for you:

```sh
# Build llama.cpp - ZenDNN will be automatically downloaded and built
cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j $(nproc)
```

No manual ZenDNN installation is required. CMake will handle everything automatically.

#### Option 2: Use Custom ZenDNN Installation

If you want to build ZenDNN yourself or use a specific version:

**Step 1: Build ZenDNN from source**

```sh
# Clone ZenDNN repository
git clone https://github.com/amd/ZenDNN.git
cd ZenDNN
git checkout zendnnl

# Build and install (requires CMake >= 3.25)
mkdir build && cd build
cmake ..
cmake --build . --target all
```

Default installation path: `ZenDNN/build/install`

**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md).

**Step 2: Build llama.cpp with custom ZenDNN path**

```sh
# Using environment variable
export ZENDNN_ROOT=/path/to/ZenDNN/build/install
cmake -B build -DGGML_ZENDNN=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j $(nproc)

# OR specify path directly in CMake
cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/ZenDNN/build/install -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j $(nproc)
```

### II. Run the Server

#### 1. Download Model

Download the LLaMA 3.1 8B Instruct BF16 model:

```sh
# Download from Hugging Face
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
```

#### 2. Start Server

Run the llama.cpp server with ZenDNN acceleration:

```sh
# Set optimal configuration
export OMP_NUM_THREADS=64     # Adjust to your CPU core count
export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS for best performance

# Start server
./build/bin/llama-server \
    -m models/Llama-3.1-8B-Instruct.BF16.gguf \
    --host 0.0.0.0 \
    --port 8080 \
    -t 64
```

Access the server at `http://localhost:8080`.

**Performance tips**:
- Set `OMP_NUM_THREADS` to match your physical core count
- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance
- For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...`

## Environment Variable

### Build Time

| Name        | Value                       | Function                          |
|-------------|-----------------------------|-----------------------------------|
| GGML_ZENDNN | ON/OFF                      | Enable ZenDNN backend support     |
| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory |
| GGML_OPENMP | ON/OFF (recommended: ON)    | Enable OpenMP for multi-threading |

### Runtime

| Name                      | Value             | Function                                                        |
|---------------------------|-------------------|-----------------------------------------------------------------|
| OMP_NUM_THREADS           | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) |
| ZENDNNL_MATMUL_ALGO       | 0-5               | Select MatMul backend algorithm (see Performance Optimization)  |
| ZENDNNL_PROFILE_LOG_LEVEL | 0-4               | Profiling log level (0=disabled, 4=verbose)                     |
| ZENDNNL_ENABLE_PROFILER   | 0 or 1            | Enable detailed profiling (1=enabled)                           |
| ZENDNNL_API_LOG_LEVEL     | 0-4               | API log level (0=disabled, 4=verbose)                           |

**Example**:

```sh
export OMP_NUM_THREADS=64
export ZENDNNL_MATMUL_ALGO=2  # Use Blocked AOCL BLIS for best performance
./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100
```

## Performance Optimization

### MatMul Algorithm Selection

ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm:

```sh
export ZENDNNL_MATMUL_ALGO=2  # Blocked AOCL BLIS (recommended)
```

**Available algorithms**:

| Value | Algorithm         | Description                           |
|:-----:|:------------------|:--------------------------------------|
| 0     | Dynamic Dispatch  | Automatic backend selection (default) |
| 1     | AOCL BLIS         | AOCL BLIS backend                     |
| 2     | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)**   |
| 3     | OneDNN            | OneDNN backend                        |
| 4     | OneDNN Blocked    | Blocked OneDNN                        |
| 5     | LibXSMM           | LibXSMM backend                       |

### Profiling and Debugging

For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md).

## Known Issues

- **Limited operation support**: Currently only matrix multiplication (MUL_MAT) is accelerated via ZenDNN. Other operations fall back to the standard CPU backend.
- **BF16 support**: BF16 operations require the AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.

## Q&A

**Q: How do I verify that the ZenDNN backend is being used?**

A: Check the log output when running llama.cpp. You should see messages indicating the ZenDNN backend is initialized. You can also check the backend name in the output.

**Q: What performance improvement can I expect?**

A: Performance gains vary depending on the model size, batch size, and CPU architecture. On AMD EPYC processors, you can typically expect a 1.1x-2x speedup compared to standard CPU inference for matrix multiplication operations.

**Q: Can I use ZenDNN on non-AMD processors?**

A: ZenDNN is optimized specifically for AMD processors. While it may work on other x86-64 CPUs, performance benefits are only guaranteed on AMD Zen-based architectures.

**Q: Does ZenDNN support quantized models?**

A: Currently, ZenDNN primarily supports the FP32 and BF16 data types. Quantized model support is not available at this time.

**Q: Why is my inference not faster with ZenDNN?**

A: Ensure that:
1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer)
2. `OMP_NUM_THREADS` is set appropriately (physical core count)
3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS)
4. You're using a sufficiently large model (small models may not benefit as much)
5. Profiling is enabled so you can verify the ZenDNN MatMul is being called

### GitHub Contribution

Please add the **[ZenDNN]** prefix/tag to issue and PR titles to help the ZenDNN team check and address them without delay.

## TODO

- Expand operation support beyond MUL_MAT (attention operations, activations, etc.)
@ -1,5 +1,10 @@
# llama.cpp for IBM zDNN Accelerator

> [!WARNING]
> **Note:** zDNN is **not** the same as ZenDNN.
> - **zDNN** (this page): IBM's Deep Neural Network acceleration library for IBM Z & LinuxONE Mainframes
> - **ZenDNN**: AMD's deep learning library for AMD EPYC CPUs ([see ZenDNN documentation](ZenDNN.md))

## Background

IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
@ -19,6 +19,7 @@ cmake -B build \
    -DGGML_RVV=ON \
    -DGGML_RV_ZFH=ON \
    -DGGML_RV_ZICBOP=ON \
    -DGGML_RV_ZIHINTPAUSE=ON \
    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
    -DCMAKE_INSTALL_PREFIX=build/installed
@ -431,11 +431,22 @@ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/ren
### For Linux users:

#### Using the LunarG Vulkan SDK

First, follow the official LunarG instructions for the installation and setup of the Vulkan SDK in the [Getting Started with the Linux Tarball Vulkan SDK](https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html) guide.

> [!IMPORTANT]
> After completing the first step, ensure that you have used the `source` command on the `setup_env.sh` file inside of the Vulkan SDK in your current terminal session. Otherwise, the build won't work. Additionally, if you close out of your terminal, you must perform this step again if you intend to perform a build. However, there are ways to make this persistent. Refer to the Vulkan SDK guide linked in the first step for more information about any of this.

#### Using system packages

On Debian / Ubuntu, you can install the required dependencies using:

```sh
sudo apt-get install libvulkan-dev glslc
```

#### Common steps

Second, after verifying that you have followed all of the SDK installation/setup steps, use this command to make sure before proceeding:

```bash
vulkaninfo
```
@ -484,6 +495,38 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

## ZenDNN

ZenDNN provides optimized deep learning primitives for AMD EPYC™ CPUs. It accelerates matrix multiplication operations for inference workloads.

### Compilation

- Using `CMake` on Linux (automatic build):

  ```bash
  cmake -B build -DGGML_ZENDNN=ON
  cmake --build build --config Release
  ```

  The first build will automatically download and build ZenDNN, which may take 5-10 minutes. Subsequent builds will be much faster.

- Using `CMake` with a custom ZenDNN installation:

  ```bash
  cmake -B build -DGGML_ZENDNN=ON -DZENDNN_ROOT=/path/to/zendnn/install
  cmake --build build --config Release
  ```

### Testing

You can test with:

```bash
./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -n 50
```

For detailed information about hardware support, setup instructions, and performance optimization, refer to [llama.cpp for ZenDNN](./backend/ZenDNN.md).

## Arm® KleidiAI™

KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
@ -0,0 +1,288 @@
# Parsing Model Output

The `common` library contains a PEG parser implementation suitable for parsing
model output.

Types with the prefix `common_peg_*` are intended for general use and may have
applications beyond parsing model output, such as parsing user-provided regex
patterns.

Types with the prefix `common_chat_peg_*` are specialized helpers for model
output.

The parser features:

- Partial parsing of streaming input
- Built-in JSON parsers
- AST generation with semantics via "tagged" nodes

## Example

Below is a contrived example demonstrating how to use the PEG parser to parse
output from a model that emits arguments as JSON.

```cpp
auto parser = build_chat_peg_native_parser([&](common_chat_peg_native_builder & p) {
    // Build a choice of all available tools
    auto tool_choice = p.choice();
    for (const auto & tool : tools) {
        const auto & function = tool.at("function");
        std::string name = function.at("name");
        const auto & schema = function.at("parameters");

        auto tool_name = p.json_member("name", "\"" + p.literal(name) + "\"");
        auto tool_args = p.json_member("arguments", p.schema(p.json(), "tool-" + name + "-schema", schema));

        tool_choice |= p.rule("tool-" + name, "{" << tool_name << "," << tool_args << "}");
    }

    // Define the tool call structure: <tool_call>[{tool}]</tool_call>
    auto tool_call = p.trigger_rule("tool-call",
        p.sequence({
            p.literal("<tool_call>["),
            tool_choice,
            p.literal("]</tool_call>")
        })
    );

    // Parser accepts content, optionally followed by a tool call
    return p.sequence({
        p.content(p.until("<tool_call>")),
        p.optional(tool_call),
        p.end()
    });
});
```

For a more complete example, see `test_example_native()` in
[tests/test-chat-peg-parser.cpp](tests/test-chat-peg-parser.cpp).

## Parsers/Combinators

### Basic Matchers

- **`eps()`** - Matches nothing and always succeeds (epsilon/empty match)
- **`start()`** - Matches the start of input (anchor `^`)
- **`end()`** - Matches the end of input (anchor `$`)
- **`literal(string)`** - Matches an exact literal string
- **`any()`** - Matches any single character (`.`)

### Combinators

- **`sequence(...)`** - Matches parsers in order; all must succeed
- **`choice(...)`** - Matches the first parser that succeeds from alternatives (ordered choice)
- **`one_or_more(p)`** - Matches one or more repetitions (`+`)
- **`zero_or_more(p)`** - Matches zero or more repetitions (`*`)
- **`optional(p)`** - Matches zero or one occurrence (`?`)
- **`repeat(p, min, max)`** - Matches between min and max repetitions (use `-1` for unbounded)
- **`repeat(p, n)`** - Matches exactly n repetitions

### Lookahead

- **`peek(p)`** - Positive lookahead: succeeds if parser succeeds without consuming input (`&`)
- **`negate(p)`** - Negative lookahead: succeeds if parser fails without consuming input (`!`)

### Character Classes & Utilities

- **`chars(classes, min, max)`** - Matches repetitions of characters from a character class
- **`space()`** - Matches zero or more whitespace characters (space, tab, newline)
- **`until(delimiter)`** - Matches characters until the delimiter is found (delimiter not consumed)
- **`until_one_of(delimiters)`** - Matches characters until any delimiter in the list is found
- **`rest()`** - Matches everything remaining (`.*`)
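
As a brief illustration, the matchers and combinators above compose into larger
parsers. A minimal sketch, assuming a builder `p` as in the earlier example
(the rule itself is hypothetical, not part of the library):

```cpp
// Matches an identifier wrapped in angle brackets, e.g. "<name>"
auto ident     = p.chars("[a-zA-Z_]", 1, -1);
auto bracketed = p.sequence({
    p.literal("<"),
    ident,
    p.literal(">")
});
```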
### JSON Parsers

- **`json()`** - Complete JSON parser (objects, arrays, strings, numbers, booleans, null)
- **`json_object()`** - JSON object parser
- **`json_array()`** - JSON array parser
- **`json_string()`** - JSON string parser
- **`json_number()`** - JSON number parser
- **`json_bool()`** - JSON boolean parser
- **`json_null()`** - JSON null parser
- **`json_string_content()`** - JSON string content without surrounding quotes
- **`json_member(key, p)`** - JSON object member with specific key and value parser

### Grammar Building

- **`ref(name)`** - Creates a lightweight reference to a named rule (for recursive grammars)
- **`rule(name, p, trigger)`** - Creates a named rule and returns a reference
- **`trigger_rule(name, p)`** - Creates a trigger rule (entry point for lazy grammar generation)
- **`schema(p, name, schema, raw)`** - Wraps parser with JSON schema metadata for grammar generation
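
For example, `ref()` lets a rule refer to itself before it is fully defined. A
minimal sketch of a recursive grammar (the rule name and shape are illustrative):

```cpp
// expr -> "(" expr ")" | "x"
auto expr = p.rule("expr", p.choice({
    p.sequence({ p.literal("("), p.ref("expr"), p.literal(")") }),
    p.literal("x")
}));
```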
### AST Control

- **`atomic(p)`** - Prevents AST node creation for partial parses
- **`tag(tag, p)`** - Creates AST nodes with semantic tags (multiple nodes can share tags)

## GBNF Grammar Generation

The PEG parser also acts as a convenient DSL for generating GBNF grammars, with
some exceptions.

```cpp
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
    foreach_function(params.tools, [&](const json & fn) {
        builder.resolve_refs(fn.at("parameters"));
    });
    parser.build_grammar(builder, data.grammar_lazy);
});
```

The notable exception is the `negate(p)` lookahead parser, which cannot be
defined as a CFG grammar and therefore does not produce a rule. Its usage
should be limited and preferably hidden behind a `schema()` parser. In many
cases, `until(delimiter)` or `until_one_of(delimiters)` is a better choice.

Another limitation is that the PEG parser requires an unambiguous grammar. In
contrast, the `llama-grammar` implementation can support ambiguous grammars,
though they are difficult to parse.

### Lazy Grammars

During lazy grammar generation, only rules reachable from a `trigger_rule(p)`
are emitted in the grammar. All trigger rules are added as alternations in the
root rule. It is still necessary to define trigger patterns, as the parser has
no interaction with the grammar sampling.
|
||||||
|
|
||||||
|
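A hypothetical sketch: with only the rule below marked as a trigger, lazy
generation emits just the rules reachable from it, and the `<tool_call>` prefix
doubles as the trigger pattern the sampler waits for:

```cpp
// Illustrative sketch only: entry point for lazy grammar generation.
// "tool-body" is a hypothetical rule defined elsewhere via rule().
auto tool_call = p.trigger_rule("tool-call",
    "<tool_call>" + p.ref("tool-body") + "</tool_call>");
```
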
### JSON Schema

The `schema(p, name, schema, raw)` parser will use the `json-schema-to-grammar`
implementation to generate the grammar instead of the underlying parser.

The `raw` option emits a grammar suitable for a raw string instead of a JSON
string. In other words, it won't be wrapped in quotes or require escaping
quotes. It should only be used when `type == "string"`.

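For instance, a hypothetical argument value constrained by a string schema
might look as follows; with `raw` set, the emitted grammar matches the bare
text between the XML tags rather than a quoted JSON string:

```cpp
// Illustrative sketch only: schema-driven grammar for a raw (unquoted) string.
auto location_value = p.schema(p.until("</parameter>"), "location",
                               json{{"type", "string"}}, /*raw=*/true);
```
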
The downside is that it can potentially lead to ambiguous grammars. For
example, if a user provides the pattern `^.*$`, the following grammar may be
generated:

```
root ::= "<arg>" .* "</arg>"
```

This creates an ambiguous grammar that cannot be parsed by the PEG parser. To
help mitigate this, if `.*` is found in the pattern, the grammar from the
underlying parser will be emitted instead.

## Common AST Shapes for Chat Parsing

Most model output can be placed in one of the following categories:

- Content only
- Tool calling with arguments emitted as a single JSON object
- Tool calling with arguments emitted as separate entities, either XML
  (Qwen3-Coder, MiniMax M2) or pseudo-function calls (LFM2)

To provide broad coverage,
[`common/chat-peg-parser.h`](common/chat-peg-parser.h) contains builders and
mappers that help create parsers and visitors/extractors for these types. They
require parsers to tag nodes to conform to an AST "shape". This normalization
makes it easy to extract information and generalize parsing.

### Simple

The `common_chat_peg_builder` builds a `simple` parser that supports
content-only models with optional reasoning.

- **`reasoning(p)`** - Tag node for extracting `reasoning_content`
- **`content(p)`** - Tag node for extracting `content`

```cpp
build_chat_peg_parser([&](common_chat_peg_parser & p) {
    return p.sequence({
        p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>"),
        p.content(p.until("<tool_call>")),
        p.end()
    });
});
```

Use `common_chat_peg_mapper` to extract the content. Note that this is already
done for you in `common_chat_peg_parser` when
`chat_format == COMMON_CHAT_FORMAT_PEG_SIMPLE`.

```cpp
auto result = parser.parse(ctx);

common_chat_msg msg;
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
```

### Native

The `common_chat_peg_native_builder` builds a `native` parser suitable for
models that emit tool arguments as a direct JSON object.

- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_id(p)`** - Tag the tool call ID (optional)
- **`tool_name(p)`** - Tag the tool name
- **`tool_args(p)`** - Tag the tool arguments

```cpp
build_chat_peg_native_parser([&](common_chat_peg_native_parser & p) {
    auto get_weather_tool = p.tool(p.sequence({
        p.tool_open(p.literal("{")),
        p.json_member("name", "\"" + p.tool_name(p.literal("get_weather")) + "\""),
        p.literal(","),
        p.json_member("arguments", p.tool_args(p.json())),
        p.tool_close(p.literal("}"))
    }));

    return p.sequence({
        p.content(p.until("<tool_call>")),
        p.literal("<tool_call>"),
        get_weather_tool,
        p.literal("</tool_call>"),
        p.end()
    });
});
```

### Constructed

The `common_chat_peg_constructed_builder` builds a `constructed` parser
suitable for models that emit tool arguments as separate entities, such as XML
tags.

- **`reasoning(p)`** - Tag node for `reasoning_content`
- **`content(p)`** - Tag node for `content`
- **`tool(p)`** - Tag entirety of a single tool call
- **`tool_open(p)`** - Tag start of a tool call
- **`tool_close(p)`** - Tag end of a tool call
- **`tool_name(p)`** - Tag the tool name
- **`tool_arg(p)`** - Tag a complete tool argument (name + value)
- **`tool_arg_open(p)`** - Tag start of a tool argument
- **`tool_arg_close(p)`** - Tag end of a tool argument
- **`tool_arg_name(p)`** - Tag the argument name
- **`tool_arg_string_value(p)`** - Tag string value for the argument
- **`tool_arg_json_value(p)`** - Tag JSON value for the argument

```cpp
build_chat_peg_constructed_parser([&](common_chat_peg_constructed_builder & p) {
    auto location_arg = p.tool_arg(
        p.tool_arg_open("<parameter name=\"" + p.tool_arg_name(p.literal("location")) + "\">"),
        p.tool_arg_string_value(p.until("</parameter>")),
        p.tool_arg_close(p.literal("</parameter>"))
    );

    auto get_weather_tool = p.tool(p.sequence({
        p.tool_open("<function name=\"" + p.tool_name(p.literal("get_weather")) + "\">"),
        location_arg,
        p.tool_close(p.literal("</function>"))
    }));

    return p.sequence({
        p.content(p.until("<tool_call>")),
        p.literal("<tool_call>"),
        get_weather_tool,
        p.literal("</tool_call>"),
        p.end()
    });
});
```

docs/ops.md (216 changed lines)

@@ -12,111 +12,111 @@ Legend:
 - 🟡 Partially supported by this backend
 - ❌ Not supported by this backend

-| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
-|-----------|------|------|------|------|------|------|------|------|------|
+| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN |
+|-----------|------|------|------|------|------|------|------|------|------|------|------|
-| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
+| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
-| CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
+| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| CUMSUM | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
-| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| FILL | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
+| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
+| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
+| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ❌ |
+| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
+| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
+| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
-| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
+| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
-| PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
+| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
-| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
+| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ | ❌ |
-| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
+| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-| SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ❌ | ❌ |
+| SET | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
-| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
-| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
-| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
+| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
-| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
+| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
+| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
+| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
-| SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ |
-| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
-| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
+| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
-| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
-| TOP_K | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+| TOP_K | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
-| TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+| TRI | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
-| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
+| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
-| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |

docs/ops/BLAS.csv (19297 changed lines): file diff suppressed because it is too large
docs/ops/Metal.csv (22617 changed lines): file diff suppressed because it is too large
(two further file diffs suppressed because they are too large)

@@ -20,6 +20,7 @@ else()
 add_subdirectory(gguf-hash)
 add_subdirectory(gguf)
+add_subdirectory(idle)
 add_subdirectory(lookahead)
 add_subdirectory(lookup)
 add_subdirectory(parallel)

@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+https://github.com/ggml-org/llama.cpp/pull/17766

@@ -0,0 +1,110 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 3;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_memory_clear(llama_get_memory(ctx), true);
+    llama_synchronize(ctx);
+
+    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 1
+            // print individual decode times
+            printf("  - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_memory_clear(llama_get_memory(ctx), true);
+            llama_synchronize(ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}

@@ -144,7 +144,7 @@ int main(int argc, char ** argv) {
             return 1;
         }
         std::string s(buf, n);
-        printf("%s", s.c_str());
+        printf("%s (%d)", s.c_str(), id);
     }
     printf("\n");

@@ -241,6 +241,12 @@ int main(int argc, char ** argv) {

     llama_batch_free(batch);

+    // this one is managed by common_init_result
+    //llama_free(ctx);
+
+    llama_free(ctx2);
+    llama_free(ctx3);
+
     if (result0 != result2) {
         fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
         return 1;

@@ -18,6 +18,7 @@ cd llama.cpp
 cmake -S . -B build
 cmake --build build
 cmake --install build --prefix inst
+```

 ### Build simple-cmake-pkg

@@ -168,6 +168,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
 option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
 option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
+option(GGML_RV_ZIHINTPAUSE "ggml: enable riscv zihintpause " ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE})

@@ -175,11 +176,6 @@ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requi
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")

-if (MINGW)
-    set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
-endif()
-
 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
 option(GGML_CPU "ggml: enable CPU backend" ON)

@@ -226,7 +222,7 @@ option(GGML_WEBGPU "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
 option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
+option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON)
 option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)

@@ -258,6 +254,9 @@ option(GGML_HEXAGON "ggml: enable Hexagon backend"
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

+option(GGML_ZENDNN "ggml: use ZenDNN" OFF)
+option(ZENDNN_ROOT "ggml: path to ZenDNN installation" "")
+
 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

@@ -319,6 +318,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-zendnn.h
     include/gguf.h)

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

@@ -408,62 +408,67 @@ if (MSVC)
         /wd4996 # Disable POSIX deprecation warnings
         /wd4702 # Unreachable code warnings
     )
-    function(disable_msvc_warnings target_name)
+    set(MSVC_COMPILE_OPTIONS
+        "$<$<COMPILE_LANGUAGE:C>:/utf-8>"
+        "$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
+    )
+    function(configure_msvc_target target_name)
         if(TARGET ${target_name})
             target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+            target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
         endif()
     endfunction()

-    disable_msvc_warnings(ggml-base)
-    disable_msvc_warnings(ggml)
-    disable_msvc_warnings(ggml-cpu)
-    disable_msvc_warnings(ggml-cpu-x64)
-    disable_msvc_warnings(ggml-cpu-sse42)
-    disable_msvc_warnings(ggml-cpu-sandybridge)
-    disable_msvc_warnings(ggml-cpu-haswell)
-    disable_msvc_warnings(ggml-cpu-skylakex)
-    disable_msvc_warnings(ggml-cpu-icelake)
-    disable_msvc_warnings(ggml-cpu-alderlake)
+    configure_msvc_target(ggml-base)
+    configure_msvc_target(ggml)
+    configure_msvc_target(ggml-cpu)
+    configure_msvc_target(ggml-cpu-x64)
+    configure_msvc_target(ggml-cpu-sse42)
+    configure_msvc_target(ggml-cpu-sandybridge)
+    configure_msvc_target(ggml-cpu-haswell)
+    configure_msvc_target(ggml-cpu-skylakex)
+    configure_msvc_target(ggml-cpu-icelake)
+    configure_msvc_target(ggml-cpu-alderlake)

     if (GGML_BUILD_EXAMPLES)
-        disable_msvc_warnings(common-ggml)
-        disable_msvc_warnings(common)
+        configure_msvc_target(common-ggml)
+        configure_msvc_target(common)

-        disable_msvc_warnings(mnist-common)
-        disable_msvc_warnings(mnist-eval)
-        disable_msvc_warnings(mnist-train)
+        configure_msvc_target(mnist-common)
+        configure_msvc_target(mnist-eval)
+        configure_msvc_target(mnist-train)

-        disable_msvc_warnings(gpt-2-ctx)
-        disable_msvc_warnings(gpt-2-alloc)
-        disable_msvc_warnings(gpt-2-backend)
-        disable_msvc_warnings(gpt-2-sched)
-        disable_msvc_warnings(gpt-2-quantize)
-        disable_msvc_warnings(gpt-2-batched)
+        configure_msvc_target(gpt-2-ctx)
+        configure_msvc_target(gpt-2-alloc)
+        configure_msvc_target(gpt-2-backend)
+        configure_msvc_target(gpt-2-sched)
+        configure_msvc_target(gpt-2-quantize)
+        configure_msvc_target(gpt-2-batched)

-        disable_msvc_warnings(gpt-j)
-        disable_msvc_warnings(gpt-j-quantize)
+        configure_msvc_target(gpt-j)
+        configure_msvc_target(gpt-j-quantize)

-        disable_msvc_warnings(magika)
-        disable_msvc_warnings(yolov3-tiny)
-        disable_msvc_warnings(sam)
+        configure_msvc_target(magika)
+        configure_msvc_target(yolov3-tiny)
+        configure_msvc_target(sam)

-        disable_msvc_warnings(simple-ctx)
-        disable_msvc_warnings(simple-backend)
+        configure_msvc_target(simple-ctx)
+        configure_msvc_target(simple-backend)
     endif()

     if (GGML_BUILD_TESTS)
-        disable_msvc_warnings(test-mul-mat)
-        disable_msvc_warnings(test-arange)
-        disable_msvc_warnings(test-backend-ops)
-        disable_msvc_warnings(test-cont)
-        disable_msvc_warnings(test-conv-transpose)
-        disable_msvc_warnings(test-conv-transpose-1d)
-        disable_msvc_warnings(test-conv1d)
-        disable_msvc_warnings(test-conv2d)
-        disable_msvc_warnings(test-conv2d-dw)
-        disable_msvc_warnings(test-customop)
-        disable_msvc_warnings(test-dup)
-        disable_msvc_warnings(test-opt)
-        disable_msvc_warnings(test-pool)
+        configure_msvc_target(test-mul-mat)
+        configure_msvc_target(test-arange)
+        configure_msvc_target(test-backend-ops)
+        configure_msvc_target(test-cont)
+        configure_msvc_target(test-conv-transpose)
+        configure_msvc_target(test-conv-transpose-1d)
+        configure_msvc_target(test-conv1d)
+        configure_msvc_target(test-conv2d)
+        configure_msvc_target(test-conv2d-dw)
+        configure_msvc_target(test-customop)
+        configure_msvc_target(test-dup)
+        configure_msvc_target(test-opt)
+        configure_msvc_target(test-pool)
     endif ()
 endif()

@@ -1,6 +1,5 @@
 #pragma once

-#include "ggml.h"
 #include "ggml-backend.h"

 #ifdef __cplusplus
@@ -8,7 +7,7 @@ extern "C" {
 #endif

 #define RPC_PROTO_MAJOR_VERSION 3
-#define RPC_PROTO_MINOR_VERSION 5
+#define RPC_PROTO_MINOR_VERSION 6
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16

@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
+
+// number of threads used for zendnn operations
+GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif

@@ -204,6 +204,10 @@
 # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+# define _WIN32_WINNT 0x0A00
+#endif
+
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>

@@ -2192,6 +2196,15 @@ extern "C" {
             int p2,
             int p3);

+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int p0,
+            int p1,
+            int p2,
+            int p3);
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -2205,6 +2218,19 @@ extern "C" {
             int rp3
             );

+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int lp0,
+            int rp0,
+            int lp1,
+            int rp1,
+            int lp2,
+            int rp2,
+            int lp3,
+            int rp3);
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,

@@ -2279,7 +2305,7 @@ extern "C" {
             float stop,
             float step);

-#define GGML_KQ_MASK_PAD 64
+#define GGML_KQ_MASK_PAD 1

     // q: [n_embd_k, n_batch, n_head, ne3 ]
     // k: [n_embd_k, n_kv, n_head_kv, ne3 ]

@@ -127,10 +127,6 @@ if (NOT MSVC)
     endif()
 endif()

-if (MINGW)
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # POSIX conformance
 #

@@ -444,6 +440,7 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
+ggml_add_backend(ZenDNN)

 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)

@@ -73,6 +73,10 @@
 #include "ggml-cann.h"
 #endif

+#ifdef GGML_USE_ZENDNN
+#include "ggml-zendnn.h"
+#endif
+
 // disable C++17 deprecation warning for std::codecvt_utf8
 #if defined(__clang__)
 #    pragma clang diagnostic push
@@ -203,6 +207,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_OPENCL
         register_backend(ggml_backend_opencl_reg());
 #endif
+#ifdef GGML_USE_ZENDNN
+        register_backend(ggml_backend_zendnn_reg());
+#endif
 #ifdef GGML_USE_HEXAGON
         register_backend(ggml_backend_hexagon_reg());
 #endif

@@ -534,8 +541,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     fs::path best_path;

     for (const auto & search_path : search_paths) {
-        if (!fs::exists(search_path)) {
-            GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
+        if (std::error_code ec; !fs::exists(search_path, ec)) {
+            if (ec) {
+                GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(search_path).c_str(), ec.message().c_str());
+            } else {
+                GGML_LOG_DEBUG("%s: search path %s does not exist\n", __func__, path_str(search_path).c_str());
+            }
             continue;
         }
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
@@ -575,8 +586,12 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     for (const auto & search_path : search_paths) {
         fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
         fs::path path = search_path / filename;
-        if (fs::exists(path)) {
+        if (std::error_code ec; fs::exists(path, ec)) {
             return get_reg().load_backend(path, silent);
+        } else {
+            if (ec) {
+                GGML_LOG_DEBUG("%s: posix_stat(%s) failure, error-message: %s\n", __func__, path_str(path).c_str(), ec.message().c_str());
+            }
         }
     }
     return nullptr;

@@ -597,6 +612,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
 #endif

     ggml_backend_load_best("blas", silent, dir_path);
+    ggml_backend_load_best("zendnn", silent, dir_path);
     ggml_backend_load_best("cann", silent, dir_path);
     ggml_backend_load_best("cuda", silent, dir_path);
     ggml_backend_load_best("hip", silent, dir_path);

@@ -1240,10 +1240,8 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
                     tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
                     ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
                 }
-                if (sched->n_copies > 1) {
                 ggml_set_input(tensor_copy);
                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
-                }
                 tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                 SET_CAUSE(tensor_copy, "4.cpy");
             }

@@ -2551,6 +2551,8 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_PAD:
+            // TODO: add circular padding support for cann, see https://github.com/ggml-org/llama.cpp/pull/16985
+            return ggml_get_op_params_i32(op, 8) == 0;
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
@@ -2564,6 +2566,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
             return true;
         case GGML_OP_OUT_PROD:
             {
+#ifdef ASCEND_310P
+                // Ger is not supported on 310p device
+                return false;
+#endif
                 switch (op->src[0]->type) {
                     case GGML_TYPE_F16:
                     case GGML_TYPE_F32:

@@ -469,6 +469,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             if (GGML_RV_ZICBOP)
                 string(APPEND MARCH_STR "_zicbop")
             endif()
+            if (GGML_RV_ZIHINTPAUSE)
+                string(APPEND MARCH_STR "_zihintpause")
+            endif()
             list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
         else()
             # Begin with the lowest baseline

@@ -8,6 +8,10 @@
 #include <sys/sysctl.h>
 #endif

+#if !defined(HWCAP2_SVE2)
+#define HWCAP2_SVE2 (1 << 1)
+#endif
+
 #if !defined(HWCAP2_I8MM)
 #define HWCAP2_I8MM (1 << 13)
 #endif

@@ -505,7 +505,6 @@ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     constexpr int blocklen = 8;

     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);

     UNUSED(nb);
@@ -645,7 +644,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     constexpr int blocklen = 8;

     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);

     UNUSED(nb);

@@ -490,6 +490,15 @@ static inline void ggml_thread_cpu_relax(void) {
 static inline void ggml_thread_cpu_relax(void) {
     _mm_pause();
 }
+#elif defined(__riscv)
+static inline void ggml_thread_cpu_relax(void) {
+#ifdef __riscv_zihintpause
+    __asm__ __volatile__ ("pause");
+#else
+    /* Encoding of the pause instruction */
+    __asm__ __volatile__ (".4byte 0x100000F");
+#endif
+}
 #else
 static inline void ggml_thread_cpu_relax(void) {;}
 #endif

@@ -683,22 +692,14 @@ bool ggml_is_numa(void) {
 }

 #if defined(__ARM_ARCH)
-
-#if defined(__linux__) && defined(__aarch64__)
-#include <sys/auxv.h>
-#endif
-
-static void ggml_init_arm_arch_features(void) {
 #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
-#if defined(__linux__)
-    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-#else
-    // TODO: add support of SVE for non-linux systems
-#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
-#endif
-#endif
-}
+#include <arm_sve.h>
+static void ggml_init_arm_arch_features(void) {
+    ggml_arm_arch_features.sve_cnt = svcntb();
+}
+#else
+static void ggml_init_arm_arch_features(void) {}
+#endif
 #endif // __ARM_ARCH

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {

@@ -2706,6 +2707,11 @@ struct ggml_cplan ggml_graph_plan(
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }

+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
     size_t work_size = 0;

     struct ggml_cplan cplan;

@ -0,0 +1,333 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
typedef vector unsigned char vec_t;
|
||||||
|
typedef __vector_quad acc_t;
|
||||||
|
|
||||||
|
template <typename TA>
|
||||||
|
class tinyBLAS_Q0_PPC {
|
||||||
|
public:
|
||||||
|
tinyBLAS_Q0_PPC(int64_t k,
|
||||||
|
const TA *A, int64_t lda,
|
||||||
|
const block_q8_0 *B, int64_t ldb,
|
||||||
|
float *C, int64_t ldc,
|
||||||
|
int ith, int nth);
|
||||||
|
|
||||||
|
void matmul(int64_t m, int64_t n);
|
||||||
|
void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
|
||||||
|
vec_t A_pack[mc*kc*2];
|
||||||
|
vec_t B_pack[nc*kc*2];
|
||||||
|
int comparray[mc*kc];
|
||||||
|
constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||||
|
int64_t ytiles = m / mc;
|
||||||
|
int64_t xtiles = n / nc;
|
||||||
|
int64_t tiles = xtiles * ytiles;
|
||||||
|
int64_t duty = (tiles + nth - 1) / nth;
|
||||||
|
int64_t start = duty * ith;
|
||||||
|
int64_t end = start + duty;
|
||||||
|
if (end > tiles) {
|
||||||
|
end = tiles;
|
||||||
|
}
|
||||||
|
for (int64_t job = start; job < end; ++job) {
|
||||||
|
int64_t ii = (job / xtiles) * mc;
|
||||||
|
int64_t jj = (job % xtiles) * nc;
|
||||||
|
for (int64_t kk = 0; kk < k; kk += kc) {
|
||||||
|
if constexpr(is_Ablock_q4) {
|
||||||
|
packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
|
||||||
|
} else {
|
||||||
|
packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
|
||||||
|
}
|
||||||
|
packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
|
||||||
|
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||||
|
for (int I = 0; I < RM; I++) {
|
||||||
|
for (int J = 0; J < RN; J++) {
|
||||||
|
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
|
||||||
|
for (int I = 0; I < RM; I++) {
|
||||||
|
for (int J = 0; J < RN; J++) {
|
||||||
|
float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
|
||||||
|
*c_ptr += *((float*)&fin_res[idx+I]+J);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename ArrayType>
|
||||||
|
inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
|
||||||
|
vector signed int vec_C[4];
|
||||||
|
vector float CA[4] = {0};
|
||||||
|
vector float res[4] = {0};
|
||||||
|
__builtin_mma_disassemble_acc(vec_C, ACC);
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
|
||||||
|
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||||
|
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
|
||||||
|
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||||
|
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||||
|
const vector signed char v8 = vec_splats((signed char)0x8);
|
||||||
|
vector signed int vsum = {0};
|
||||||
|
vector signed int vsum2 = {0};
|
||||||
|
c[0] = vec_and(c[1], lowMask);
|
||||||
|
c[1] = vec_sr(c[1], v4);
|
||||||
|
c[0] = vec_sub(c[0], v8);
|
||||||
|
c[1] = vec_sub(c[1], v8);
|
||||||
|
vsum = vec_sum4s(c[0], vsum);
|
||||||
|
vsum2 = vec_sum4s(c[1], vsum2);
|
||||||
|
vsum = vec_add(vsum, vsum2);
|
||||||
|
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename V1, typename V2>
|
||||||
|
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
|
||||||
|
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||||
|
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||||
|
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||||
|
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||||
|
V2 t1, t2, t3, t4, t5, t6, t7, t8;
|
||||||
|
vector unsigned char xor_vector;
|
||||||
|
uint8_t flip_vec = 0x80;
|
||||||
|
xor_vector = vec_splats(flip_vec);
|
||||||
|
t1 = vec_perm(s1, s2, swiz1);
|
||||||
|
t2 = vec_perm(s1, s2, swiz2);
|
||||||
|
t3 = vec_perm(s3, s4, swiz1);
|
||||||
|
t4 = vec_perm(s3, s4, swiz2);
|
||||||
|
t5 = vec_perm(t1, t3, swiz3);
|
||||||
|
t6 = vec_perm(t1, t3, swiz4);
|
||||||
|
t7 = vec_perm(t2, t4, swiz3);
|
||||||
|
t8 = vec_perm(t2, t4, swiz4);
|
||||||
|
if (flip == true) {
|
||||||
|
t5 = vec_xor(t5, xor_vector);
|
||||||
|
t6 = vec_xor(t6, xor_vector);
|
||||||
|
t7 = vec_xor(t7, xor_vector);
|
||||||
|
t8 = vec_xor(t8, xor_vector);
|
||||||
|
}
|
||||||
|
vec_xst(t5, 0, vecOffset);
|
||||||
|
vec_xst(t6, 0, vecOffset+16);
|
||||||
|
vec_xst(t7, 0, vecOffset+32);
|
||||||
|
vec_xst(t8, 0, vecOffset+48);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<int RM, int RN>
|
||||||
|
inline void kernel(int64_t ii, int64_t jj) {
|
||||||
|
if constexpr(RM == 4 && RN == 8) {
|
||||||
|
KERNEL_4x8(ii,jj);
|
||||||
|
} else if constexpr(RM == 8 && RN == 4) {
|
||||||
|
KERNEL_8x4(ii,jj);
|
||||||
|
} else if constexpr(RM == 8 && RN == 8) {
|
||||||
|
KERNEL_8x8(ii,jj);
|
||||||
|
} else {
|
||||||
|
assert(false && "RN/RM values not supported");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template<int size>
|
||||||
|
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
|
||||||
|
template<typename VA, typename VB>
|
||||||
|
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
|
||||||
|
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||||
|
void KERNEL_4x8(int64_t ii, int64_t jj);
|
||||||
|
void KERNEL_8x4(int64_t ii, int64_t jj);
|
||||||
|
void KERNEL_8x8(int64_t ii, int64_t jj);
|
||||||
|
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
|
||||||
|
template <int RM, int RN>
|
||||||
|
void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
|
||||||
|
|
||||||
|
void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
|
||||||
|
for (int I = 0; I<8; I++) {
|
||||||
|
float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);
|
||||||
|
for (int J = 0; J<4; J++) {
|
||||||
|
*((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
|
||||||
|
*((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void process_q8_elements(const int8_t *qs, int *ca) {
|
||||||
|
vector signed char c1 = vec_xl(0, qs);
|
||||||
|
vector signed char c2 = vec_xl(16, qs);
|
||||||
|
vector signed int vsum1 = {0};
|
||||||
|
vector signed int vsum2 = {0};
|
||||||
|
vsum1 = vec_sum4s(c1, vsum1);
|
||||||
|
vsum2 = vec_sum4s(c2, vsum2);
|
||||||
|
vector signed int vsum = vec_add(vsum1, vsum2);
|
||||||
|
*ca = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||||
|
}

template<typename VA, typename VB>
void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
    int64_t i, j;
    block_q8_0 *aoffset = NULL;
    VA *vecOffset = NULL;
    block_q8_0* aoffsets[8];
    __vector_pair arr[8];
    VB c[8][2] = {0};
    VB c1[8] = {0}; VB c2[8] = {0};
    aoffset = const_cast<block_q8_0*>(a);
    vecOffset = vec;
    j = (rows >> 3);
    int index = 0;
    if (j > 0) {
        do {
            for (int it = 0; it < 8; it++)
                aoffsets[it] = aoffset + it*lda;
            aoffset += 8 * lda;
            for (int blk = 0; blk < kc; blk++) {
                for (int it = 0; it < 8; it++) {
                    arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
                    __builtin_vsx_disassemble_pair(c[it], &arr[it]);
                    c1[it] = c[it][0];
                    c2[it] = c[it][1];
                    if (comparray) {
                        process_q8_elements((aoffsets[it]+blk)->qs, &comparray[index + 8*blk + it]);
                    }
                }
                vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
                vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
                vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
                vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
                vecOffset += 256;
            }
            j--;
            index += 8*kc;
        } while(j > 0);
    }
}

void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int* comparray) {
    int64_t i, j;
    TA *aoffset = NULL;
    int8_t *vecOffset = NULL;
    TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
    TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
    vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
    vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
    aoffset = const_cast<TA*>(a);
    vecOffset = vec;
    int index = 0;
    j = (rows >> 3);
    if (j > 0) {
        do {
            aoffset1 = aoffset;
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            aoffset4 = aoffset3 + lda;
            aoffset5 = aoffset4 + lda;
            aoffset6 = aoffset5 + lda;
            aoffset7 = aoffset6 + lda;
            aoffset8 = aoffset7 + lda;
            aoffset += 8 * lda;
            for (int blk = 0; blk < kc; blk++) {
                c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
                c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
                c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
                c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
                c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
                c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
                c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
                c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));

                process_q4_elements(c1, &comparray[index + 8*blk+0]);
                process_q4_elements(c2, &comparray[index + 8*blk+1]);
                process_q4_elements(c3, &comparray[index + 8*blk+2]);
                process_q4_elements(c4, &comparray[index + 8*blk+3]);
                process_q4_elements(c5, &comparray[index + 8*blk+4]);
                process_q4_elements(c6, &comparray[index + 8*blk+5]);
                process_q4_elements(c7, &comparray[index + 8*blk+6]);
                process_q4_elements(c8, &comparray[index + 8*blk+7]);
                vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
                vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
                vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
                vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
                vecOffset += 256;
            }
            j--;
            index += 8*kc;
        } while (j > 0);
    }
}

void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
    acc_t acc[8];
    for (int i = 0; i < mc; i += 8) {
        for (int j = 0; j < nc; j += 8) {
            vector float fin_res[16] = {0};
            vector float vs[16] = {0};
            for (int64_t kk = 0; kk < kc; kk += 2) {
                for (int x = 0; x < 8; x++) {
                    __builtin_mma_xxsetaccz(&acc[x]);
                }
                int A_block_idx = (i/8)*(16*kc) + kk*16;
                int B_block_idx = (j/8)*(16*kc) + kk*16;
                vec_t *A_block = &vec_A[A_block_idx];
                vec_t *B_block = &vec_B[B_block_idx];
                for (int x = 0; x < 8; x++) {
                    __builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]);
                    __builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
                    __builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]);
                    __builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]);
                }
                compute_scale(ii+i, jj+j, l+kk, vs);
                int c_index = (i/8)*(8*kc) + kk*8;
                int* c_block = &comparray[c_index];
                compute(&acc[0], 0, 0, c_block, vs, fin_res);
                compute(&acc[1], 4, 4, c_block, vs, fin_res);
                compute(&acc[2], 0, 8, c_block, vs, fin_res);
                compute(&acc[3], 4, 12, c_block, vs, fin_res);

                A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
                B_block_idx = (j/8)*(16*kc) + (kk+1)*16;
                A_block = &vec_A[A_block_idx];
                B_block = &vec_B[B_block_idx];
                for (int x = 0; x < 8; x++) {
                    __builtin_mma_xvi8ger4pp(&acc[4], A_block[x], B_block[x]);
                    __builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
                    __builtin_mma_xvi8ger4pp(&acc[6], A_block[x], B_block[x+8]);
                    __builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8], B_block[x+8]);
                }
                compute_scale(ii+i, jj+j, l+kk+1, vs);
                c_index = (i/8)*(8*kc) + (kk+1)*8;
                c_block = &comparray[c_index];
                compute(&acc[4], 0, 0, c_block, vs, fin_res);
                compute(&acc[5], 4, 4, c_block, vs, fin_res);
                compute(&acc[6], 0, 8, c_block, vs, fin_res);
                compute(&acc[7], 4, 12, c_block, vs, fin_res);
            }
            if (l == 0) {
                save_res(ii+i, jj+j, 0, fin_res);
                save_res(ii+i+4, jj+j, 4, fin_res);
                save_res(ii+i, jj+j+4, 8, fin_res);
                save_res(ii+i+4, jj+j+4, 12, fin_res);
            } else {
                add_save_res(ii+i, jj+j, 0, fin_res);
                add_save_res(ii+i+4, jj+j, 4, fin_res);
                add_save_res(ii+i, jj+j+4, 8, fin_res);
                add_save_res(ii+i+4, jj+j+4, 12, fin_res);
            }
        }
    }
}

const TA *const A;
const block_q8_0 *const B;
float *C;
const int64_t k;
int64_t kc;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
@@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
 #endif

 #if defined(__MMA__)
-typedef vector unsigned char vec_t;
-typedef __vector_quad acc_t;
+#include "sgemm-ppc.h"
 #endif
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // VECTORIZED FUSED MULTIPLY ADD

@@ -1574,94 +1573,34 @@ class tinyBLAS_BF16_PPC {
 };

 template <typename TA>
-class tinyBLAS_Q0_PPC {
-public:
-tinyBLAS_Q0_PPC(int64_t k,
+tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
 const TA *A, int64_t lda,
 const block_q8_0 *B, int64_t ldb,
 float *C, int64_t ldc,
 int ith, int nth)
 : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
+kc = 64;
 }

-void matmul(int64_t m, int64_t n) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
+int mc = 64; int nc = 64;
+if (n % 8 == 0 && n < nc) {
+nc = n;
+mc = 32;
+kc = 32;
+}
+const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
+if (is_aligned) {
+this->matmul_tiled_q0(m, n, mc, nc, kc);
+} else {
 mnpack(0, m, 0, n);
 }

-private:
-
-inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
-for (int I = 0; I < RM; I++) {
-for (int J = 0; J < RN; J++) {
-*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
-}
-}
 }

+template<typename TA>
 template<int size>
-inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
+void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
-vector signed int vec_C[4];
-vector float CA[4] = {0};
-vector float res[4] = {0};
-__builtin_mma_disassemble_acc(vec_C, ACC);
-for (int i = 0; i < 4; i++) {
-CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
-res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
-fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
-}
-}
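Editor's note: the `-128.0` factor in the deleted compute() above is the usual signed/unsigned MMA compensation. One operand is biased by +128 (the XOR with 0x80 in vector_permute_store), and comparray carries the row sums needed to undo that bias. Assuming that layout, the identity applied per accumulator lane is

$$\sum_{k} a_k\,(b_k + 128) \;-\; 128\sum_{k} a_k \;=\; \sum_{k} a_k b_k,$$

with the corrected integer sum then scaled by the per-block factors in vs.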
-/* This function processes quantized data from block_q4_0 elements.
- * First we extract the two int4 values packed in each int8_t into two vectors of signed int8.
- * Then we subtract 8 from each element, mapping the unsigned nibble range [0, 15] onto the signed range [-8, 7].
- * We also compute the row sum, which is required to compensate for this conversion. */
-inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
-const vector signed char lowMask = vec_splats((signed char)0xF);
-const vector unsigned char v4 = vec_splats((unsigned char)0x4);
-const vector signed char v8 = vec_splats((signed char)0x8);
-vector signed int vsum = {0};
-vector signed int vsum2 = {0};
-c[0] = vec_and(c[1], lowMask);
-c[1] = vec_sr(c[1], v4);
-c[0] = vec_sub(c[0], v8);
-c[1] = vec_sub(c[1], v8);
-vsum = vec_sum4s(c[0], vsum);
-vsum2 = vec_sum4s(c[1], vsum2);
-vsum = vec_add(vsum, vsum2);
-*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
-}
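Editor's note: a scalar rendering of the same unpack-and-compensate step may help; the helper name is hypothetical and only illustrates the arithmetic on one 16-byte q4_0 payload (32 nibbles):

// Assumed scalar illustration: unpack 32 int4 values and compute their row sum.
static int q4_block_unpack(const uint8_t qs[16], int8_t out[32]) {
    int rowsum = 0;
    for (int i = 0; i < 16; ++i) {
        out[i]      = (int8_t)(qs[i] & 0x0F) - 8; // low nibble,  [0,15] -> [-8,7]
        out[i + 16] = (int8_t)(qs[i] >> 4) - 8;   // high nibble, [0,15] -> [-8,7]
        rowsum += out[i] + out[i + 16];
    }
    return rowsum; // feeds comparray, mirroring vec_sum4s above
}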
-
-template <typename V1, typename V2>
-inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
-vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
-vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
-vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
-vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
-V2 t1, t2, t3, t4, t5, t6, t7, t8;
-vector unsigned char xor_vector;
-uint8_t flip_vec = 0x80;
-xor_vector = vec_splats(flip_vec);
-t1 = vec_perm(s1, s2, swiz1);
-t2 = vec_perm(s1, s2, swiz2);
-t3 = vec_perm(s3, s4, swiz1);
-t4 = vec_perm(s3, s4, swiz2);
-t5 = vec_perm(t1, t3, swiz3);
-t6 = vec_perm(t1, t3, swiz4);
-t7 = vec_perm(t2, t4, swiz3);
-t8 = vec_perm(t2, t4, swiz4);
-if (flip == true) {
-t5 = vec_xor(t5, xor_vector);
-t6 = vec_xor(t6, xor_vector);
-t7 = vec_xor(t7, xor_vector);
-t8 = vec_xor(t8, xor_vector);
-}
-vec_xst(t5, 0, vecOffset);
-vec_xst(t6, 0, vecOffset+16);
-vec_xst(t7, 0, vecOffset+32);
-vec_xst(t8, 0, vecOffset+48);
-}
-
-template<int size>
-void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
 int64_t i, j;
 TA *aoffset = NULL;
 int8_t *vecOffset = NULL;

@@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
 }
 }
 }

+template<typename TA>
 template<typename VA, typename VB>
-void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
 int64_t i, j;
 block_q8_0 *aoffset = NULL;
 VA *vecOffset = NULL;

@@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
 j--;
 } while(j > 0);
 }

 if (rows & 4) {
 aoffsets[0] = aoffset;
 for (int it = 1; it < 4; it++ )

@@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
 }
 }

-void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 int m_rem = MIN(m - m0, 16);
 int n_rem = MIN(n - n0, 16);

@@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
 }

-void KERNEL_4x8(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
 vec_t vec_A[8], vec_B[16] = {0};
 acc_t acc_0, acc_1;
 std::array<int, 4> comparray {};

@@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
 aoffset += lda;
 }
 }
-compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
-compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
+compute(&acc_0, 0, 0, comparray, vs, fin_res);
+compute(&acc_1, 0, 4, comparray, vs, fin_res);
 }
 save_res(ii, jj, 0, fin_res);
 save_res(ii, jj+4, 4, fin_res);
 }

-void KERNEL_8x4(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
 vec_t vec_A[16], vec_B[8] = {0};
 acc_t acc_0, acc_1;
 std::array<int, 8> comparray {};

@@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
 aoffset += lda;
 }
 }
-compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
+compute(&acc_0, 0, 0, comparray, vs, fin_res);
+compute(&acc_1, 4, 4, comparray, vs, fin_res);
 }
 save_res(ii, jj, 0, fin_res);
 save_res(ii+4, jj, 4, fin_res);
 }

-void KERNEL_8x8(int64_t ii, int64_t jj) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
 vec_t vec_A[16], vec_B[16] = {0};
 acc_t acc_0, acc_1, acc_2, acc_3;
+acc_t acc_4, acc_5, acc_6, acc_7;
 std::array<int, 8> comparray {};
 vector float fin_res[16] = {0};
 vector float vs[16] = {0};

@@ -2046,10 +1991,10 @@ class tinyBLAS_Q0_PPC {
 aoffset += lda;
 }
 }
-compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
-compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
-compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
-compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
+compute(&acc_0, 0, 0, comparray, vs, fin_res);
+compute(&acc_1, 4, 4, comparray, vs, fin_res);
+compute(&acc_2, 0, 8, comparray, vs, fin_res);
+compute(&acc_3, 4, 12, comparray, vs, fin_res);
 }
 save_res(ii, jj, 0, fin_res);
 save_res(ii+4, jj, 4, fin_res);

@@ -2057,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
 save_res(ii+4, jj+4, 12, fin_res);
 }

-void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
+template<typename TA>
+void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
 int64_t ytiles = (m - m0) / RM;
 int64_t xtiles = (n - n0) / RN;
 int64_t tiles = xtiles * ytiles;

@@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
 }
 }

+template<typename TA>
 template <int RM, int RN>
-inline void kernel(int64_t ii, int64_t jj) {
-if constexpr(RM == 4 && RN == 8) {
-KERNEL_4x8(ii,jj);
-} else if constexpr(RM == 8 && RN == 4) {
-KERNEL_8x4(ii,jj);
-} else if constexpr(RM == 8 && RN == 8) {
-KERNEL_8x8(ii,jj);
-} else {
-assert(false && "RN/RM values not supported");
-}
-}
-
-template <int RM, int RN>
-NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 int64_t ytiles = (m - m0) / RM;
 int64_t xtiles = (n - n0) / RN;
 int64_t tiles = xtiles * ytiles;

@@ -2151,20 +2085,12 @@
 for (int64_t job = start; job < end; ++job) {
 int64_t ii = m0 + job / xtiles * RM;
 int64_t jj = n0 + job % xtiles * RN;
-kernel<RM, RN>(ii, jj);
+this->kernel<RM, RN>(ii, jj);
 }
 }

-const TA *const A;
-const block_q8_0 *const B;
-float *C;
-const int64_t k;
-const int64_t lda;
-const int64_t ldb;
-const int64_t ldc;
-const int ith;
-const int nth;
-};
+template class tinyBLAS_Q0_PPC<block_q4_0>;
+template class tinyBLAS_Q0_PPC<block_q8_0>;

 class tinyBLAS_PPC {
 public:

@@ -6,6 +6,12 @@
 #include <vecintrin.h>
 #endif

+#ifdef _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE __attribute__((__noinline__))
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
 const int64_t iih = ioh*s1 + ikh*d1 - p1;
 const int64_t iid = iod*s2 + ikd*d2 - p2;

-if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
 dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
 } else {
 const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]

@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
 ggml_compute_forward_mul_mat(params, &dst);
 }

+static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
+    return (coord + size) % size; // adding size avoids negative number weirdness
+}
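Editor's note: a quick standalone sanity check of the wrap helper (assumed test snippet; the identity holds for coord in [-size, size)):

#include <cassert>
#include <cstdint>

int main() {
    auto wrap = [](int64_t coord, int64_t size) { return (coord + size) % size; };
    assert(wrap(-1, 5) == 4); // one step left of the edge wraps to the far side
    assert(wrap( 5, 5) == 0); // one step past the right edge wraps to the start
    assert(wrap( 3, 5) == 3); // in-range coordinates pass through unchanged
    return 0;
}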

 // ggml_compute_forward_conv_2d

 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
 const ggml_tensor * kernel, // [KW, KH, IC, OC]
 const ggml_tensor * src, // [W, H, C, N]

@@ -7591,6 +7596,7 @@ void ggml_compute_forward_upscale(

 // ggml_compute_forward_pad

+template<bool circular_t>
 static void ggml_compute_forward_pad_f32(
 const ggml_compute_params * params,
 ggml_tensor * dst) {

@@ -7615,13 +7621,29 @@ static void ggml_compute_forward_pad_f32(
 const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
 const int32_t rp3 = ggml_get_op_params_i32(dst, 7);

 // TODO: optimize

 for (int64_t i2 = 0; i2 < ne2; ++i2) {
 for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
 for (int64_t i0 = 0; i0 < ne0; ++i0) {
 for (int64_t i3 = 0; i3 < ne3; ++i3) {
+// circular means wrap around on a torus, so x and y loop around
+if constexpr (circular_t) {
+const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
+const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
+const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
+const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
+
+const int64_t src_idx =
+src_i3*nb03 +
+src_i2*nb02 +
+src_i1*nb01 +
+src_i0*nb00;
+
+const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+dst_ptr[dst_idx] = *src_ptr;
+} else {
 const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
 if ((i0 >= lp0 && i0 < ne0 - rp0) \
 && (i1 >= lp1 && i1 < ne1 - rp1) \

@@ -7638,17 +7660,22 @@ static void ggml_compute_forward_pad_f32(
 }
 }
 }
+}

 void ggml_compute_forward_pad(
 const ggml_compute_params * params,
 ggml_tensor * dst) {

 const ggml_tensor * src0 = dst->src[0];
+const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
 switch (src0->type) {
 case GGML_TYPE_F32:
 {
-ggml_compute_forward_pad_f32(params, dst);
+if (circular) {
+ggml_compute_forward_pad_f32<true>(params, dst);
+} else {
+ggml_compute_forward_pad_f32<false>(params, dst);
+}
 } break;
 default:
 {

@@ -226,7 +226,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define AMD_MFMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

-#if defined(GGML_USE_HIP) && defined(RDNA4)
+#if defined(GGML_USE_HIP) && (defined(RDNA4) || defined(RDNA3))
 #define AMD_WMMA_AVAILABLE
 #endif // defined(GGML_USE_HIP) && defined(RDNA4)

@@ -294,7 +294,7 @@ static bool amd_mfma_available(const int cc) {
 }

 static bool amd_wmma_available(const int cc) {
-return GGML_CUDA_CC_IS_RDNA4(cc);
+return (GGML_CUDA_CC_IS_RDNA4(cc) || GGML_CUDA_CC_IS_RDNA3(cc));
 }

 static bool volta_mma_available(const int cc) {

@@ -463,6 +463,53 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 return x;
 }

+template<typename T, int width = WARP_SIZE>
+static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
+    const int lane_id = threadIdx.x % width;
+#pragma unroll
+    for (int offset = 1; offset < width; offset <<= 1) {
+        const T t = __shfl_up_sync(0xffffffff, x, offset, width);
+        if (lane_id >= offset) {
+            x += t;
+        }
+    }
+    return x;
+}
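Editor's note: the loop above is a shuffle-based Hillis-Steele inclusive scan; after log2(width) doubling steps, lane i holds the sum of lanes 0..i. A host-side reference with the same semantics (assumed test helper, not part of the diff):

// Inclusive prefix sum over n elements, matching the warp-level result lane by lane.
template <typename T>
void prefix_inclusive_sum_ref(T * x, int n) {
    for (int i = 1; i < n; ++i) {
        x[i] += x[i - 1];
    }
}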
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ float2 warp_prefix_inclusive_sum(float2 a) {
+    const int lane_id = threadIdx.x % width;
+#pragma unroll
+    for (int offset = 1; offset < width; offset <<= 1) {
+        const float t_x = __shfl_up_sync(0xffffffff, a.x, offset, width);
+        const float t_y = __shfl_up_sync(0xffffffff, a.y, offset, width);
+        if (lane_id >= offset) {
+            a.x += t_x;
+            a.y += t_y;
+        }
+    }
+    return a;
+}
+
+template<int width = WARP_SIZE>
+static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
+#ifdef FP16_AVAILABLE
+    const int lane_id = threadIdx.x % width;
+#pragma unroll
+    for (int offset = 1; offset < width; offset <<= 1) {
+        const half2 t = __shfl_up_sync(0xffffffff, a, offset, width);
+        if (lane_id >= offset) {
+            a = __hadd2(a, t);
+        }
+    }
+    return a;
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}

 static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
 #ifdef FP16_AVAILABLE

@@ -989,6 +1036,10 @@ struct ggml_cuda_concurrent_event {
 int n_streams = 0;
 std::unordered_map<const ggml_tensor *, int> stream_mapping;

+// Original order of nodes in this concurrent region (before interleaving)
+// Used to restore grouping for fusion within streams
+std::vector<const ggml_tensor *> original_order;
+
 const ggml_tensor * join_node;

 ggml_cuda_concurrent_event() = default;

@@ -1011,6 +1062,7 @@ struct ggml_cuda_concurrent_event {
 , fork_event(other.fork_event)
 , n_streams(other.n_streams)
 , stream_mapping(std::move(other.stream_mapping))
+, original_order(std::move(other.original_order))
 , join_node(other.join_node) {
 other.fork_event = nullptr;
 }

@@ -1121,11 +1173,9 @@ struct ggml_cuda_concurrent_event {
 };

 struct ggml_cuda_stream_context {
-std::vector<const ggml_tensor *> original_nodes;
 std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> concurrent_events;

 void reset() {
-original_nodes.clear();
 concurrent_events.clear();
 }
 };

@@ -0,0 +1,237 @@
#include <algorithm>
#include "cumsum.cuh"
#include "convert.cuh"
#include "ggml-cuda/common.cuh"
#include "ggml.h"

#ifdef GGML_CUDA_USE_CUB
# include <cub/device/device_scan.cuh>
#endif // GGML_CUDA_USE_CUB

template<typename T, int BLOCK_SIZE>
static __global__ void cumsum_cub_kernel(
    const T * __restrict__ src,
    T * __restrict__ dst,
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
    const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t s1, const int64_t s2, const int64_t s3) {
#ifdef GGML_CUDA_USE_CUB
    using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;

    __shared__ typename BlockScan::TempStorage temp_storage;
    __shared__ T block_carry; // carry from previous tile

    const int tid = threadIdx.x;

    const int64_t i1 = blockIdx.x;
    const int64_t i2 = blockIdx.y;
    const int64_t i3 = blockIdx.z;

    if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
        return;
    }

    const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
    T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;

    if (tid == 0) {
        block_carry = 0;
    }
    __syncthreads();

    for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
        int64_t idx = start + tid;
        T x = (idx < ne00) ? src_row[idx] : T(0);

        T inclusive;
        T block_total;
        BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);

        __syncthreads();

        T final_val = inclusive + block_carry;

        // store result
        if (idx < ne00) {
            dst_row[idx] = final_val;
        }

        __syncthreads();

        if (tid == 0) {
            block_carry += block_total;
        }

        __syncthreads();
    }
#else
    NO_DEVICE_CODE;
#endif // GGML_CUDA_USE_CUB
}

// Fallback kernel implementation (original)
template<typename T>
static __global__ void cumsum_kernel(
    const T * src, T * dst,
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
    const int64_t s00, const int64_t s01, const int64_t s02, const int64_t s03,
    const int64_t s0, const int64_t s1, const int64_t s2, const int64_t s3) {

    GGML_UNUSED_VARS(s00, s0);

    const int tid = threadIdx.x;
    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    const int lane = tid % warp_size;
    const int warp = tid / warp_size;
    const int warps_per_block = blockDim.x / warp_size;

    extern __shared__ float smem[];
    float * s_vals = smem;
    float * s_warp_sums = smem + blockDim.x;
    float * s_carry = smem + blockDim.x + warps_per_block;
    float * s_chunk_total = s_carry + 1;

    // Initialize carry
    if (tid == 0) {
        *s_carry = 0.0f;
    }
    __syncthreads();

    const int64_t i3 = blockIdx.z;
    const int64_t i2 = blockIdx.y;
    const int64_t i1 = blockIdx.x;
    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
    T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;

    for (int64_t start = 0; start < ne00; start += blockDim.x) {
        int64_t idx = start + tid;
        float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;

        // 1. Warp inclusive scan
        val = warp_prefix_inclusive_sum<T, warp_size>(val);
        s_vals[tid] = val;

        // Store warp total
        if (lane == warp_size - 1) {
            s_warp_sums[warp] = val;
        }
        __syncthreads();

        // 2. Exclusive scan of warp sums (warp 0 only)
        if (warp == 0) {
            float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
            float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
            if (tid < warps_per_block) {
                s_warp_sums[tid] = inc - w; // exclusive sum
            }
            if (tid == warps_per_block - 1) {
                *s_chunk_total = inc; // total sum of this chunk
            }
        }
        __syncthreads();

        float carry = *s_carry;
        float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
        if (idx < ne00) {
            dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
        }
        __syncthreads();

        // Update carry for next chunk
        if (tid == 0) {
            *s_carry += *s_chunk_total;
        }
        __syncthreads();
    }
}
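Editor's note: for validating either kernel, a minimal host-side reference over one row (assumed harness code, not part of the diff):

// Inclusive cumulative sum along the innermost dimension, f32 only.
static void cumsum_ref_f32(const float * src, float * dst, int64_t ne00) {
    float acc = 0.0f;
    for (int64_t i = 0; i < ne00; ++i) {
        acc += src[i];
        dst[i] = acc;
    }
}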

template<typename T>
static void cumsum_cuda(
    const T * src, T * dst,
    const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
    const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
    const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
    cudaStream_t stream) {

    const size_t type_size = sizeof(T);
    bool use_cub = false;
#ifdef GGML_CUDA_USE_CUB
    // Check if we can use CUB (data must be contiguous along innermost dimension)
    const bool is_contiguous = (nb00 == type_size) && (nb0 == type_size);

    if (is_contiguous) {
        use_cub = true;
    }
#endif // GGML_CUDA_USE_CUB
    dim3 grid_dims(ne01, ne02, ne03);
    const auto &info = ggml_cuda_info().devices[ggml_cuda_get_device()];
    const int warp_size = info.warp_size;
    const int num_warps = (ne00 + warp_size - 1) / warp_size;
    int block_size = num_warps * warp_size;
    block_size = std::min(block_size, CUDA_CUMSUM_BLOCK_SIZE);
    dim3 block_dims(block_size, 1, 1);
    const int warps_per_block = block_size / warp_size;
    const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);

    if (use_cub) {
        cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
            src, dst,
            ne00, ne01, ne02, ne03,
            nb01 / type_size, nb02 / type_size, nb03 / type_size,
            nb1 / type_size, nb2 / type_size, nb3 / type_size
        );
    } else {
        cumsum_kernel<<<grid_dims, block_dims, shmem_size, stream>>>(
            src, dst,
            ne00, ne01, ne02, ne03,
            nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
            nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
        );
    }
}

void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == dst->type);
    switch(src0->type) {
        case GGML_TYPE_F32:
            {
                cumsum_cuda(
                    (const float *)src0->data, (float *)dst->data,
                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
                    stream
                );
            } break;
        // We do not support those on CPU for now anyway, so comment them out because they cause errors on some CI platforms
        /*case GGML_TYPE_F16:
            {
                cumsum_cuda(
                    (const half *)src0->data, (half *)dst->data,
                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
                    stream
                );
            } break;
        case GGML_TYPE_BF16:
            {
                cumsum_cuda(
                    (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
                    stream
                );
            } break;*/
        default:
            GGML_ABORT("fatal error");
    }
}

@@ -0,0 +1,5 @@
#include "common.cuh"

#define CUDA_CUMSUM_BLOCK_SIZE 256

void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -10,6 +10,12 @@
 #define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

+// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
+// by the VKQ accumulators is effectively being shifted up by a factor of 8.
+// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
+// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
+#define FATTN_KQ_MAX_OFFSET 0.6931f
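Editor's note: the trade-off encoded by this constant follows from the shift-invariance of softmax; shifting the subtracted maximum by a constant $c$ rescales every accumulated term rather than changing the result:

$$e^{x - (m + c)} = e^{x - m}\,e^{-c}, \qquad e^{-\ln 2} = \tfrac{1}{2},$$

so each term the VKQ accumulators see is scaled down by $e^{-c}$, buying headroom at the top of the representable range at the cost of flushing more small terms below SOFTMAX_FTZ_THRESHOLD.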

 typedef void (* fattn_kernel_t)(
 const char * __restrict__ Q,
 const char * __restrict__ K,

@@ -25,7 +31,7 @@ typedef void (* fattn_kernel_t)(
 const float m1,
 const uint32_t n_head_log2,
 const float logit_softcap,
-const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
 const int32_t nb01, const int32_t nb02, const int32_t nb03,
 const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
 const int32_t nb11, const int32_t nb12, const int64_t nb13,

@@ -621,7 +627,8 @@ static __global__ void flash_attn_mask_to_KV_max(
 template<int D, int ncols1, int ncols2> // D == head size
 __launch_bounds__(D, 1)
 static __global__ void flash_attn_stream_k_fixup(
-float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11) {
+float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne03, const int ne11,
+const int nbatch_fa) {
 constexpr int ncols = ncols1*ncols2;

 const int bidx0 = blockIdx.x;

@@ -632,7 +639,7 @@ static __global__ void flash_attn_stream_k_fixup(

 const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols);

-const int iter_k = ne11 / FATTN_KQ_STRIDE;
+const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
 const int iter_j = (ne01 + (ncols1 - 1)) / ncols1;

 const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

@@ -765,7 +772,7 @@ static __global__ void flash_attn_combine_results(
 template <int DV, int ncols1, int ncols2>
 void launch_fattn(
 ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
-const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
+const int nbatch_fa, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
 ) {
 constexpr int ncols = ncols1 * ncols2;

@@ -790,8 +797,6 @@ void launch_fattn(
 GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));

 GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
-GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
-"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");

 ggml_cuda_pool & pool = ctx.pool();
 cudaStream_t main_stream = ctx.stream();

@@ -915,7 +920,7 @@ void launch_fattn(

 dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
 } else {
-const int ntiles_KQ = (K->ne[1] + KQ_row_granularity - 1) / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.

 // parallel_blocks must not be larger than what the tensor size allows:
 parallel_blocks = std::min(parallel_blocks, ntiles_KQ);

@@ -970,6 +975,9 @@ void launch_fattn(
 const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+// TODO other tensor dimensions after removal of WMMA kernel:
+const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
+
 GGML_ASSERT(block_dim.x % warp_size == 0);
 fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
 (const char *) Q->data,

@@ -980,7 +988,7 @@ void launch_fattn(
 KV_max.ptr,
 !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
 scale, max_bias, m0, m1, n_head_log2, logit_softcap,
-Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
+Q->ne[0], ne01, Q->ne[2], Q->ne[3], Q->nb[1], Q->nb[2], Q->nb[3],
 K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
 nb21, nb22, nb23,
 mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,

@@ -995,7 +1003,7 @@ void launch_fattn(

 flash_attn_stream_k_fixup<DV, ncols1, ncols2>
 <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1]);
+((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], Q->ne[3], K->ne[1], nbatch_fa);
 }
 } else if (parallel_blocks > 1) {
 const dim3 block_dim_combine(DV, 1, 1);
File diff suppressed because it is too large
@ -501,6 +501,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||||
const half2 * const __restrict__ K_h2,
|
const half2 * const __restrict__ K_h2,
|
||||||
const half2 * const __restrict__ V_h2,
|
const half2 * const __restrict__ V_h2,
|
||||||
const half * const __restrict__ mask,
|
const half * const __restrict__ mask,
|
||||||
|
const uint3 ne01,
|
||||||
const float logit_softcap,
|
const float logit_softcap,
|
||||||
const float slope,
|
const float slope,
|
||||||
T_KQ * const KQ,
|
T_KQ * const KQ,
|
||||||
|
|
@ -512,7 +513,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||||
float * const KQ_sum,
|
float * const KQ_sum,
|
||||||
T_acc * const VKQ,
|
T_acc * const VKQ,
|
||||||
const int k_VKQ_0,
|
const int k_VKQ_0,
|
||||||
const int k_VKQ_max) {
|
const int k_VKQ_max,
|
||||||
|
const int col_Q_0) {
|
||||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
||||||
constexpr int cpy_ne = cpy_nb / 4;
|
constexpr int cpy_ne = cpy_nb / 4;
|
||||||
|
|
||||||
|
|
@ -556,7 +558,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||||
// Apply logit softcap + mask, update KQ_max:
|
// Apply logit softcap + mask, update KQ_max:
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int jc0 = 0; jc0 < cpw; ++jc0) {
|
for (int jc0 = 0; jc0 < cpw; ++jc0) {
|
||||||
const int j = (jc0 + (threadIdx.y / np)*cpw)/ncols2;
|
const int j = fastmodulo(col_Q_0 + (jc0 + (threadIdx.y / np)*cpw)/ncols2, ne01);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
|
for (int i_KQ_0 = 0; i_KQ_0 < nbatch_fa; i_KQ_0 += np*warp_size) {
|
||||||
|
|
@ -570,7 +572,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
||||||
KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] += (ncols2 > 1 || mask) ?
|
KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] += (ncols2 > 1 || mask) ?
|
||||||
slope*__half2float(mask[j*stride_mask + k_VKQ_0 + i_KQ]) : 0.0f;
|
slope*__half2float(mask[j*stride_mask + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||||
|
|
||||||
KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
|
KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] + FATTN_KQ_MAX_OFFSET);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -736,7 +738,7 @@ static __global__ void flash_attn_tile(
|
||||||
const float m1,
|
const float m1,
|
||||||
const uint32_t n_head_log2,
|
const uint32_t n_head_log2,
|
||||||
const float logit_softcap,
|
const float logit_softcap,
|
||||||
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
|
const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
|
||||||
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
const int32_t nb01, const int32_t nb02, const int32_t nb03,
|
||||||
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
||||||
|
|
@ -781,11 +783,11 @@ static __global__ void flash_attn_tile(
|
||||||
const int sequence = blockIdx.z / (ne02/ncols2);
|
const int sequence = blockIdx.z / (ne02/ncols2);
|
||||||
const int head0 = blockIdx.z*ncols2 - sequence*ne02; // == blockIdx.z % (ne02/ncols2)
|
const int head0 = blockIdx.z*ncols2 - sequence*ne02; // == blockIdx.z % (ne02/ncols2)
|
||||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||||
const float * Q_f = (const float *) (Q + nb03*sequence + nb02* head0 + nb01*col_Q_0);
|
const float * Q_f = (const float *) (Q + nb03*sequence + nb02* head0);
|
||||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
||||||
const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape
|
const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape
|
||||||
|
|
||||||
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33) + nb31*col_Q_0) : nullptr;
|
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33)) : nullptr;
|
||||||
|
|
||||||
const int stride_K2 = nb11 / sizeof(half2);
|
const int stride_K2 = nb11 / sizeof(half2);
|
||||||
const int stride_V2 = nb21 / sizeof(half2);
|
const int stride_V2 = nb21 / sizeof(half2);
|
||||||
|
|
@ -842,11 +844,9 @@ static __global__ void flash_attn_tile(
|
||||||
for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
|
for (int i0 = 0; i0 < DKQp; i0 += np*warp_size*cpy_ne_D) {
|
||||||
if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
|
if (i0 + np*warp_size*cpy_ne_D <= DKQ || i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D < DKQ) {
|
||||||
float tmp_f[cpy_ne_D] = {0.0f};
|
float tmp_f[cpy_ne_D] = {0.0f};
|
||||||
if (ncols1 == 1 || col_Q_0 + j < ne01) {
|
|
||||||
ggml_cuda_memcpy_1<sizeof(tmp_f)>
|
ggml_cuda_memcpy_1<sizeof(tmp_f)>
|
||||||
(tmp_f, &Q_f[c*(nb02/sizeof(float)) + j*(nb01/sizeof(float))
|
(tmp_f, &Q_f[c*(nb02/sizeof(float)) + fastmodulo(col_Q_0 + j, ne01)*(nb01/sizeof(float))
|
||||||
+ i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
|
+ i0 + (threadIdx.y % np)*(warp_size*cpy_ne_D) + threadIdx.x*cpy_ne_D]);
|
||||||
}
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
for (int i1 = 0; i1 < cpy_ne_D; ++i1) {
|
||||||
|
|
@@ -881,23 +881,23 @@ static __global__ void flash_attn_tile(
         while (k_VKQ_0 < k_VKQ_max - nbatch_fa) {
             constexpr bool oob_check = false;
             flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
             k_VKQ_0 += gridDim.y*nbatch_fa;
         }
         if (k_VKQ_0 < k_VKQ_max) {
             constexpr bool oob_check = true;
             flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
         }
     } else {
         // Branch without out-of-bounds checks.
         for (int k_VKQ_0 = blockIdx.y*nbatch_fa; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nbatch_fa) {
             constexpr bool oob_check = false;
             flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
-                (Q_tmp, K_h2, V_h2, maskh, logit_softcap, slope, KQ, KV_tmp,
-                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max);
+                (Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
+                 stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
         }
     }
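The loop structure above is a loop-peeling pattern: all full KV batches run with `oob_check = false`, and only the final, possibly partial batch pays for per-element bounds tests. A self-contained sketch of the same pattern with hypothetical names (`accumulate_batch` and `sum_peeled` are illustrations, not ggml functions):

#include <cstdio>
#include <cuda_runtime.h>

// The constexpr flag lets the compiler drop the bounds test from the code
// generated for full batches entirely.
template <int BATCH, bool oob_check>
__device__ void accumulate_batch(const float * x, float & acc, int k0, int k_max) {
#pragma unroll
    for (int i = 0; i < BATCH; ++i) {
        const int k = k0 + i;
        if (oob_check && k >= k_max) { // compiled out when oob_check == false
            break;
        }
        acc += x[k];
    }
}

__global__ void sum_peeled(const float * x, float * out, int k_max) {
    constexpr int BATCH = 32;
    float acc = 0.0f;
    int k0 = 0;
    while (k0 < k_max - BATCH) {       // bulk: no out-of-bounds checks
        accumulate_batch<BATCH, false>(x, acc, k0, k_max);
        k0 += BATCH;
    }
    if (k0 < k_max) {                  // peeled tail: checks enabled
        accumulate_batch<BATCH, true>(x, acc, k0, k_max);
    }
    *out = acc;
}

int main() {
    const int n = 100;
    float *x, *out;
    cudaMallocManaged(&x, n*sizeof(float));
    cudaMallocManaged(&out, sizeof(float));
    for (int i = 0; i < n; ++i) {
        x[i] = 1.0f;
    }
    sum_peeled<<<1, 1>>>(x, out, n);
    cudaDeviceSynchronize();
    printf("%f\n", *out); // 100.0
    return 0;
}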
@@ -1010,13 +1010,13 @@ static __global__ void flash_attn_tile(
         const int j = jc / ncols2;
         const int c = jc % ncols2;

-        if (ncols1 > 1 && col_Q_0 + j >= ne01) {
+        if (ncols1 > 1 && col_Q_0 + j >= int(ne01.z)) {
             return;
         }

         const float scale = gridDim.y == 1 ? 1.0f/KQ_sum[jc0] : 1.0f;

-        const int j_dst_unrolled = ((sequence*ne01 + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;
+        const int j_dst_unrolled = ((sequence*int(ne01.z) + col_Q_0 + j)*ne02 + head0 + c)*gridDim.y + blockIdx.y;

 #ifdef FAST_FP16_AVAILABLE
         constexpr int cpy_ne_D = cpy_ne/2 < (DVp/2)/warp_size ? cpy_ne/2 : (DVp/2)/warp_size;
@@ -33,7 +33,7 @@ static __global__ void flash_attn_ext_vec(
         const float m1,
         const uint32_t n_head_log2,
         const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
         const int32_t nb01, const int32_t nb02, const int32_t nb03,
         const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
         const int32_t nb11, const int32_t nb12, const int64_t nb13,
@@ -150,7 +150,7 @@ static __global__ void flash_attn_ext_vec(
     float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));

     // Set memory to zero if out of bounds:
-    if (ncols > 1 && ic0 + j >= ne01) {
+    if (ncols > 1 && ic0 + j >= int(ne01.z)) {
 #pragma unroll
         for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
             const int i = i0 + threadIdx.x;
@@ -201,7 +201,7 @@ static __global__ void flash_attn_ext_vec(
             const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;

             float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
-            if (ncols == 1 || ic0 + j < ne01) {
+            if (ncols == 1 || ic0 + j < int(ne01.z)) {
                 ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
                 ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
             }
@@ -222,7 +222,7 @@ static __global__ void flash_attn_ext_vec(
 #pragma unroll
         for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
             const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
-            if (ncols == 1 || ic0 + j < ne01) {
+            if (ncols == 1 || ic0 + j < int(ne01.z)) {
                 ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ], &Q_j[i]);
                 ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
             }
@@ -266,11 +266,11 @@ static __global__ void flash_attn_ext_vec(
                 sum = logit_softcap*tanhf(sum);
             }

-            if (mask) {
+            if (mask && (ncols == 1 || ic0 + j < int(ne01.z))) {
                 sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
             }

-            KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
+            KQ_max_new[j] = fmaxf(KQ_max_new[j], sum + FATTN_KQ_MAX_OFFSET);

             if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == uint32_t(i_KQ_0)) {
                 KQ_reg[j] = sum;
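`FATTN_KQ_MAX_OFFSET` biases the running maximum used by the streaming softmax. The diff does not state the motivation; a plausible reading is numerical headroom, since the bias guarantees every intermediate exponential stays strictly below 1 while cancelling out in the final normalization. A host-side sketch of the update rule; the constant's value here is illustrative, not the one ggml defines:

#include <cmath>
#include <cstdio>

constexpr float KQ_MAX_OFFSET = 8.0f; // illustrative stand-in for FATTN_KQ_MAX_OFFSET

int main() {
    const float scores[] = {1.5f, -0.5f, 3.0f, 0.25f};
    float kq_max = -INFINITY;
    float kq_sum = 0.0f;
    float acc    = 0.0f; // weighted accumulator for one "value" channel of all ones

    for (float s : scores) {
        const float kq_max_new = fmaxf(kq_max, s + KQ_MAX_OFFSET);
        const float rescale    = expf(kq_max - kq_max_new); // <= 1 by construction
        const float p          = expf(s - kq_max_new);      // <= exp(-KQ_MAX_OFFSET) < 1
        kq_sum = kq_sum*rescale + p;
        acc    = acc*rescale + p*1.0f;
        kq_max = kq_max_new;
    }
    printf("softmax-weighted sum: %f (expected 1.0)\n", acc/kq_sum);
    return 0;
}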
@@ -412,7 +412,7 @@ static __global__ void flash_attn_ext_vec(

 #pragma unroll
     for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-        if (ncols > 1 && ic0 + j_VKQ >= ne01) {
+        if (ncols > 1 && ic0 + j_VKQ >= int(ne01.z)) {
             break;
         }
@@ -479,7 +479,7 @@ static __global__ void flash_attn_ext_vec(
             if (gridDim.y == 1) {
                 dst_val /= KQ_sum[j_VKQ];
             }
-            dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
+            dst[(((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
         }
     }
@@ -489,8 +489,8 @@ static __global__ void flash_attn_ext_vec(
     }

-    if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
-        dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < int(ne01.z))) {
+        dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
     }
 #else
     GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
@@ -38,14 +38,14 @@ static __global__ void flash_attn_ext_f16(
         const float m1,
         const uint32_t n_head_log2,
         const float logit_softcap,
-        const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
+        const int32_t ne00, const uint3 ne01, const int32_t ne02, const int32_t ne03,
         const int32_t nb01, const int32_t nb02, const int32_t nb03,
         const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
         const int32_t nb11, const int32_t nb12, const int64_t nb13,
         const int32_t nb21, const int32_t nb22, const int64_t nb23,
         const int32_t ne31, const int32_t ne32, const int32_t ne33,
         const int32_t nb31, const int32_t nb32, const int64_t nb33) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
+#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
@@ -149,7 +149,7 @@ static __global__ void flash_attn_ext_f16(
             if (i0 + warp_size > D && i >= D) {
                 break;
             }
-            KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f;
+            KQ[j*D_padded + i] = ic0 + j < int(ne01.z) ? Q_f[j*stride_Q + i] * scale : 0.0f;
         }
     }
@@ -218,8 +218,9 @@ static __global__ void flash_attn_ext_f16(
             for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += warp_size) {
                 const int k = k0 + threadIdx.x;

-                KQ_f_tmp[k0/warp_size] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
-                KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
+                KQ_f_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ?
+                    __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
+                KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size] + FATTN_KQ_MAX_OFFSET);
             }
             KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);
@@ -270,7 +271,7 @@ static __global__ void flash_attn_ext_f16(
             for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += warp_size) {
                 const int k = k0 + threadIdx.x;

-                KQ2_tmp[k0/warp_size] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
+                KQ2_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                 KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/warp_size]);
             }
             KQ_max_new = __half2half2(warp_reduce_max<warp_size>(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@@ -431,7 +432,7 @@ static __global__ void flash_attn_ext_f16(
 #pragma unroll
     for (int j0 = 0; j0 < ncols; j0 += nwarps) {
         const int j_VKQ = j0 + threadIdx.y;
-        if (ic0 + j_VKQ >= ne01) {
+        if (ic0 + j_VKQ >= int(ne01.z)) {
             return;
         }
@@ -442,7 +443,7 @@ static __global__ void flash_attn_ext_f16(
             KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]);
         }

-        const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
+        const int j_dst_unrolled = ((sequence*int(ne01.z) + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;

 #pragma unroll
         for (int i0 = 0; i0 < D; i0 += warp_size) {
@@ -481,7 +482,7 @@ static __global__ void flash_attn_ext_f16(
                        ne31, ne32, ne33,
                        nb31, nb32, nb33);
     NO_DEVICE_CODE;
-#endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN)))
+#endif // defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
 }

 constexpr int get_max_power_of_2(int x) {
@@ -2,9 +2,9 @@

 #include "common.cuh"

-#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#if defined(GGML_USE_MUSA)
 #define GGML_USE_WMMA_FATTN
-#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA)
+#endif // defined(GGML_USE_MUSA)

 #if defined(GGML_HIP_ROCWMMA_FATTN)
 #if defined(CDNA) && (ROCWMMA_VERSION_MAJOR < 2 || ROCWMMA_VERSION_MINOR > 0 || ROCWMMA_VERSION_PATCH > 0)
@@ -12,13 +12,13 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con
     const ggml_tensor * Q = dst->src[0];

     if constexpr (ncols2 <= 8) {
-        if (Q->ne[1] <= 8/ncols2) {
+        if (turing_mma_available(cc) && Q->ne[1] <= 8/ncols2) {
             ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
             return;
         }
     }

-    if (Q->ne[1] <= 16/ncols2) {
+    if (turing_mma_available(cc) && Q->ne[1] <= 16/ncols2) {
         ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
         return;
     }
@@ -41,7 +41,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
     float max_bias = 0.0f;
     memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

-    const bool use_gqa_opt = mask && max_bias == 0.0f;
+    const bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;

     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
     const int gqa_ratio = Q->ne[2] / K->ne[2];
@@ -275,8 +275,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     // For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
     const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;

-    // If Turing tensor cores available, use them:
-    if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
+    // If Turing tensor cores are available, use them:
+    if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
         if (can_use_vector_kernel) {
             if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
                 if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@@ -297,7 +297,21 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                 return BEST_FATTN_KERNEL_VEC;
             }
         }
+        return BEST_FATTN_KERNEL_MMA_F16;
+    }
+
+    if (volta_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
+        int gqa_ratio_eff = 1;
+        const int ncols2_max = Q->ne[0] == 576 ? 16 : 8;
+        while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
+            gqa_ratio_eff *= 2;
+        }
+        if (can_use_vector_kernel && Q->ne[1] * gqa_ratio_eff <= 2) {
+            return BEST_FATTN_KERNEL_VEC;
+        }
+        if (Q->ne[1] * gqa_ratio_eff <= 16) {
+            return BEST_FATTN_KERNEL_TILE; // On Volta tensor cores are only faster for sufficiently large matrices.
+        }
         return BEST_FATTN_KERNEL_MMA_F16;
     }
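The Volta branch's `gqa_ratio_eff` loop computes the largest power of two dividing `gqa_ratio`, clamped to `ncols2_max`; `Q->ne[1] * gqa_ratio_eff` then estimates how many query columns a tile effectively gets to batch. The selection rule, extracted into a standalone sketch:

#include <cassert>

// Largest power of two dividing gqa_ratio, clamped to ncols2_max
// (16 for head size 576, otherwise 8 in the hunk above).
static int effective_gqa_ratio(int gqa_ratio, int ncols2_max) {
    int eff = 1;
    while (gqa_ratio % (2*eff) == 0 && eff < ncols2_max) {
        eff *= 2;
    }
    return eff;
}

int main() {
    assert(effective_gqa_ratio(12, 8) == 4); // 12 = 4*3 -> 4
    assert(effective_gqa_ratio( 8, 8) == 8);
    assert(effective_gqa_ratio( 7, 8) == 1); // odd ratio gets no batching
    assert(effective_gqa_ratio(32, 8) == 8); // capped by ncols2_max
    return 0;
}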
@@ -0,0 +1,37 @@
+#include "fill.cuh"
+#include "convert.cuh"
+
+#define CUDA_FILL_BLOCK_SIZE 256
+
+template <typename T>
+static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
+    const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= k) {
+        return;
+    }
+    dst[i] = value;
+}
+
+void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    float value;
+    memcpy(&value, dst->op_params, sizeof(float));
+
+    const int64_t k = ggml_nelements(dst);
+    const int64_t num_blocks = (k + CUDA_FILL_BLOCK_SIZE - 1) / CUDA_FILL_BLOCK_SIZE;
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((float *) dst_d, k, value);
+            break;
+        case GGML_TYPE_F16:
+            fill_kernel<<<num_blocks, CUDA_FILL_BLOCK_SIZE, 0, stream>>>((half *) dst_d, k, ggml_cuda_cast<half>(value));
+            break;
+        default:
+            GGML_ABORT("unsupported type");
+    }
+}
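The new fill op is a plain one-thread-per-element kernel with a ceil-div launch. A standalone program exercising the same kernel shape outside of ggml (error handling omitted for brevity):

#include <cstdio>
#include <cuda_runtime.h>

#define FILL_BLOCK_SIZE 256

// Same shape as fill_kernel above: one thread per element, excess threads exit.
template <typename T>
__global__ void fill_kernel(T * dst, const int64_t k, const T value) {
    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = value;
}

int main() {
    const int64_t k = 1000;
    float * d = nullptr;
    cudaMalloc(&d, k*sizeof(float));

    const int64_t num_blocks = (k + FILL_BLOCK_SIZE - 1) / FILL_BLOCK_SIZE; // ceil-div
    fill_kernel<<<num_blocks, FILL_BLOCK_SIZE>>>(d, k, 3.5f);

    float last = 0.0f;
    cudaMemcpy(&last, d + k - 1, sizeof(float), cudaMemcpyDeviceToHost);
    printf("dst[k-1] = %f\n", last); // 3.5
    cudaFree(d);
    return 0;
}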
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_fill(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -54,6 +54,9 @@
 #include "ggml-cuda/set-rows.cuh"
 #include "ggml-cuda/pad_reflect_1d.cuh"
 #include "ggml-cuda/solve_tri.cuh"
+#include "ggml-cuda/tri.cuh"
+#include "ggml-cuda/cumsum.cuh"
+#include "ggml-cuda/fill.cuh"
 #include "ggml.h"

 #include <algorithm>
@@ -2701,6 +2704,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
+        case GGML_OP_CUMSUM:
+            ggml_cuda_op_cumsum(ctx, dst);
+            break;
+        case GGML_OP_TRI:
+            ggml_cuda_op_tri(ctx, dst);
+            break;
         case GGML_OP_RWKV_WKV6:
             ggml_cuda_op_rwkv_wkv6(ctx, dst);
             break;
@@ -2722,6 +2731,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SOLVE_TRI:
             ggml_cuda_op_solve_tri(ctx, dst);
             break;
+        case GGML_OP_FILL:
+            ggml_cuda_op_fill(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -3238,9 +3250,56 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             }
         }
         if (should_launch_concurrent_events) {
-            //Restore the original graph to enable fusion within the streams
-            cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_nodes.data());
-            cgraph->n_nodes = (int) stream_ctx.original_nodes.size();
+            // Restore original node order within each concurrent region to enable fusion within streams
+            std::unordered_map<const ggml_tensor *, int> node_to_idx;
+            node_to_idx.reserve(cgraph->n_nodes);
+            for (int i = 0; i < cgraph->n_nodes; ++i) {
+                node_to_idx[cgraph->nodes[i]] = i;
+            }
+
+            for (auto & [fork_node, event] : stream_ctx.concurrent_events) {
+                // Find positions of all nodes from this event in the current graph
+                std::vector<int> positions;
+                positions.reserve(event.original_order.size());
+
+                bool all_found = true;
+                for (const ggml_tensor * orig_node : event.original_order) {
+                    auto it = node_to_idx.find(orig_node);
+                    if (it != node_to_idx.end()) {
+                        positions.push_back(it->second);
+                    } else {
+                        all_found = false;
+                        break;
+                    }
+                }
+
+                if (!all_found || positions.size() != event.original_order.size()) {
+                    continue;
+                }
+
+                // Sort positions to get contiguous range
+                std::vector<int> sorted_positions = positions;
+                std::sort(sorted_positions.begin(), sorted_positions.end());
+
+                bool is_contiguous = true;
+                for (size_t i = 1; i < sorted_positions.size(); ++i) {
+                    if (sorted_positions[i] != sorted_positions[i-1] + 1) {
+                        is_contiguous = false;
+                        break;
+                    }
+                }
+
+                if (!is_contiguous) {
+                    continue;
+                }
+
+                // Restore original order at the sorted positions
+                int start_pos = sorted_positions[0];
+                for (size_t i = 0; i < event.original_order.size(); ++i) {
+                    cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
+                }
+            }
         }

     for (int i = 0; i < cgraph->n_nodes; i++) {
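The replacement no longer swaps the whole node array back; it restores each concurrent region in place, and only when all of the region's nodes still exist and sit at contiguous positions in the current graph. The same logic, reduced to integer node IDs so it can be tested in isolation:

#include <algorithm>
#include <cassert>
#include <unordered_map>
#include <vector>

// Rewrite the region in place only if all of its nodes occupy a contiguous range.
static bool restore_region(std::vector<int> & nodes, const std::vector<int> & original_order) {
    std::unordered_map<int, int> node_to_idx;
    for (int i = 0; i < (int) nodes.size(); ++i) {
        node_to_idx[nodes[i]] = i;
    }

    std::vector<int> positions;
    for (int n : original_order) {
        auto it = node_to_idx.find(n);
        if (it == node_to_idx.end()) {
            return false; // a node was fused away or removed
        }
        positions.push_back(it->second);
    }

    std::sort(positions.begin(), positions.end());
    for (size_t i = 1; i < positions.size(); ++i) {
        if (positions[i] != positions[i-1] + 1) {
            return false; // interleaved with nodes from another region
        }
    }

    for (size_t i = 0; i < original_order.size(); ++i) {
        nodes[positions[0] + i] = original_order[i];
    }
    return true;
}

int main() {
    std::vector<int> nodes = {0, 3, 1, 4, 2, 5};  // two branches interleaved at 3,1,4,2
    assert(!restore_region(nodes, {1, 2, 3, 9})); // missing node: graph left untouched
    assert( restore_region(nodes, {1, 2, 3, 4})); // contiguous range restored in place
    assert((nodes == std::vector<int>{0, 1, 2, 3, 4, 5}));
    return 0;
}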
@@ -3805,14 +3864,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     // store {fork_idx, join_idx}
     std::vector<std::pair<int, int>> concurrent_node_ranges;

-    // save the original nodes
-    std::vector<const ggml_tensor *> original_nodes;
-    original_nodes.reserve(cgraph->n_nodes);
-    for (int i = 0; i < cgraph->n_nodes; ++i) {
-        original_nodes.push_back(cgraph->nodes[i]);
-    }
-    cuda_ctx->stream_context().original_nodes = std::move(original_nodes);
-
     for (const auto & [root_node, count] : fan_out) {
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];
@@ -3917,6 +3968,13 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             continue;
         }

+        // Save the original order of nodes in this region before interleaving.
+        // This is used later to restore grouping for fusion within streams.
+        concurrent_event.original_order.reserve(total_branch_nodes);
+        for (int i = fork_node_idx + 1; i < join_node_idx; ++i) {
+            concurrent_event.original_order.push_back(cgraph->nodes[i]);
+        }
+
         std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> & concurrent_events = cuda_ctx->stream_context().concurrent_events;
         GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end());
         concurrent_events.emplace(root_node, std::move(concurrent_event));
@@ -4563,6 +4621,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
         case GGML_OP_OPT_STEP_ADAMW:
         case GGML_OP_OPT_STEP_SGD:
+        case GGML_OP_FILL:
+        case GGML_OP_CUMSUM:
+        case GGML_OP_TRI:
             return true;
         case GGML_OP_SOLVE_TRI:
             return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;
@@ -68,10 +68,31 @@ static __device__ __forceinline__ half2 ggml_cuda_movmatrix(const half2 x) {

 namespace ggml_cuda_mma {

+    // Some architectures like Volta or CDNA3 perform multiple matrix multiplications per warp in parallel,
+    // effectively the warp is being split into subgroups of threads that each perform a single mma instruction.
+    // In those cases the data can be split in different ways across the warp.
+    enum data_layout {
+        // By default the data uses the I direction as its major dimension and the J direction as its minor dimension.
+        // For the A/C matrices this means I major == row major, J major == column major.
+        // For the B matrix this means I major == column major, J major == row major.
+        // MIRRORED == Each data value is held exactly once per thread subgroup.
+        DATA_LAYOUT_I_MAJOR          = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell.
+        DATA_LAYOUT_I_MAJOR_MIRRORED = 10,
+        DATA_LAYOUT_J_MAJOR_MIRRORED = 20,
+    };
+    // Implemented mma combinations are:
+    // - (I_MAJOR, I_MAJOR)          -> I_MAJOR
+    // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
+    // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
+
+    template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
+    struct tile {};
+
     template <int I_, int J_, typename T>
-    struct tile {
+    struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR> {
         static constexpr int I = I_;
         static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;

 #if defined(AMD_MFMA_AVAILABLE)
         static constexpr int ne = I * J / 64;
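Turning `tile` into an empty primary template with one specialization per `data_layout` means an unsupported (I, J, T, layout) combination becomes a compile-time error rather than a silently wrong index mapping. A reduced model of the dispatch:

#include <cstdio>

// Empty primary template: only explicitly specialized layouts exist as usable types.
enum data_layout { I_MAJOR, I_MAJOR_MIRRORED };

template <int I, int J, data_layout dl = I_MAJOR>
struct tile {};

template <int I, int J>
struct tile<I, J, I_MAJOR> {
    static constexpr int ne = I*J/32;      // one fragment slice per lane of a 32-thread warp
};

template <int I, int J>
struct tile<I, J, I_MAJOR_MIRRORED> {
    static constexpr int ne = I*J/(32/4);  // mirrored: each 8-thread subgroup holds the whole tile
};

int main() {
    printf("%d %d\n", tile<8, 4>::ne, tile<8, 4, I_MAJOR_MIRRORED>::ne); // 1 4
    // A combination without a specialization has no ::ne and fails to compile.
    return 0;
}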
@@ -131,9 +152,9 @@ namespace ggml_cuda_mma {
         static __device__ __forceinline__ int get_i(const int l) {
             if constexpr (I == 32 && J == 8) {
 #ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
-                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (l & 2) | (threadIdx.x % 2);
+                return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (l & 2) + (threadIdx.x % 2);
 #else
-                return (l & 2) | (threadIdx.x & ~2);
+                return (l & 2) + (threadIdx.x & ~2);
 #endif // GGML_CUDA_MMA_NO_VOLTA_PERM
             } else {
                 NO_DEVICE_CODE;
@@ -143,7 +164,7 @@ namespace ggml_cuda_mma {

         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 32 && J == 8) {
-                return (threadIdx.x & 2) | (l & (4 + 1));
+                return (threadIdx.x & 2) + (l & (4 + 1));
             } else {
                 NO_DEVICE_CODE;
                 return -1;
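Every `|` replaced by `+` in these index helpers combined bit ranges that never overlap, so the arithmetic result is unchanged; the plausible motivation (not stated in the diff) is that additions fold into surrounding address computations more readily than bitwise ORs. An exhaustive check for one of the patterns:

#include <cassert>

// ((l/2)*8) contributes only bits >= 3, (t/4) only bits 0..2, so OR and ADD agree.
int main() {
    for (int l = 0; l < 8; ++l) {
        for (int t = 0; t < 32; ++t) {
            const int a = ((l / 2) * 8) | (t / 4); // old form
            const int b = ((l / 2) * 8) + (t / 4); // new form, identical for disjoint bits
            assert(a == b);
        }
    }
    return 0;
}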
@@ -152,6 +173,9 @@ namespace ggml_cuda_mma {
 #elif defined(AMD_WMMA_AVAILABLE)
 #if defined(RDNA4)
         static constexpr int ne = I * J / 32;
+#elif defined(RDNA3)
+        static constexpr int ne = (I == 16 && J == 16) ? I * J / 32 : I * J / 16;
+#endif // defined(RDNA4)
         T x[ne] = {0};

         static constexpr __device__ bool supported() {
@@ -161,7 +185,11 @@ namespace ggml_cuda_mma {

         static __device__ __forceinline__ int get_i(const int l) {
             if constexpr (I == 16 && J == 16) {
+#if defined(RDNA4)
                 return 8 * (threadIdx.x / 16) + l;
+#elif defined(RDNA3)
+                return 2 * l + (threadIdx.x / 16);
+#endif // defined(RDNA4)
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -176,7 +204,6 @@ namespace ggml_cuda_mma {
                 return -1;
             }
         }
-#endif
 #else
         static constexpr int ne = I * J / 32;
         T x[ne] = {0};
@@ -196,9 +223,9 @@ namespace ggml_cuda_mma {
             } else if constexpr (I == 8 && J == 8) {
                 return threadIdx.x / 4;
             } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 8) | (threadIdx.x / 4);
+                return ((l / 2) * 8) + (threadIdx.x / 4);
             } else if constexpr (I == 16 && J == 16) {
-                return (((l / 2) % 2) * 8) | (threadIdx.x / 4);
+                return (((l / 2) % 2) * 8) + (threadIdx.x / 4);
             } else if constexpr (I == 32 && J == 8) {
                 return tile<16, 8, T>::get_i(l); // Memory layout simply repeated with same pattern in i direction.
             } else {
@@ -211,11 +238,11 @@ namespace ggml_cuda_mma {
             if constexpr (I == 8 && J == 4) {
                 return threadIdx.x % 4;
             } else if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
             } else if constexpr (I == 16 && J == 8) {
-                return ((threadIdx.x % 4) * 2) | (l % 2);
+                return ((threadIdx.x % 4) * 2) + (l % 2);
             } else if constexpr (I == 16 && J == 16) {
-                return ((l / 4) * 8) | ((threadIdx.x % 4) * 2) | (l % 2);
+                return ((l / 4) * 8) + ((threadIdx.x % 4) * 2) + (l % 2);
             } else if constexpr (I == 32 && J == 8) {
                 return tile<16, 8, T>::get_j(l); // Memory layout simply repeated with same pattern in i direction.
             } else {
@@ -227,26 +254,24 @@ namespace ggml_cuda_mma {
     };

     template <int I_, int J_>
-    struct tile<I_, J_, half2> {
+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR> {
         static constexpr int I = I_;
         static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;

 #if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-        static constexpr int ne = I == 8 && J == 8 ? I * J / (WARP_SIZE/4) : I * J / WARP_SIZE;
+        static constexpr int ne = I * J / WARP_SIZE;
         half2 x[ne] = {{0.0f, 0.0f}};

         static constexpr __device__ bool supported() {
-            if (I == 8 && J == 8) return true;
-            if (I == 32 && J == 8) return true;
+            if (I == 32 && J == 4) return true;
             return false;
         }

         static __device__ __forceinline__ int get_i(const int l) {
-            if constexpr (I == 8 && J == 8) {
-                return ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
-            } else if constexpr (I == 32 && J == 8) {
+            if constexpr (I == 32 && J == 4) {
 #ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
-                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
+                return (((threadIdx.x % 16) / 4) * 8) + ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
 #else
                 return threadIdx.x;
 #endif // GGML_CUDA_MMA_NO_VOLTA_PERM
@@ -257,7 +282,7 @@ namespace ggml_cuda_mma {
         }

         static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr ((I == 8 || I == 32) && J == 8) {
+            if constexpr (I == 32 && J == 4) {
                 return l;
             } else {
                 NO_DEVICE_CODE;
@@ -265,6 +290,7 @@ namespace ggml_cuda_mma {
             }
         }
 #elif defined(AMD_WMMA_AVAILABLE)
+
         static constexpr int ne = I * J / 32;
         half2 x[ne] = {{0.0f, 0.0f}};
@@ -307,11 +333,11 @@ namespace ggml_cuda_mma {
             if constexpr (I == 8 && J == 8) {
                 return threadIdx.x / 4;
             } else if constexpr (I == 16 && J == 4) {
-                return (l * 8) | (threadIdx.x / 4);
+                return (l * 8) + (threadIdx.x / 4);
             } else if constexpr (I == 16 && J == 8) {
-                return ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l % 2) * 8) + (threadIdx.x / 4);
             } else if constexpr (I == 32 && J == 8) {
-                return ((l / 4) * 16) | ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l / 4) * 16) + ((l % 2) * 8) + (threadIdx.x / 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -320,13 +346,13 @@ namespace ggml_cuda_mma {

         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
             } else if constexpr (I == 16 && J == 4) {
                 return threadIdx.x % 4;
             } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 4) | (threadIdx.x % 4);
+                return ((l / 2) * 4) + (threadIdx.x % 4);
             } else if constexpr (I == 32 && J == 8) {
-                return ((l & 2) * 2) | (threadIdx.x % 4);
+                return ((l & 2) * 2) + (threadIdx.x % 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -336,14 +362,15 @@ namespace ggml_cuda_mma {
     };

     template <int I_, int J_>
-    struct tile<I_, J_, nv_bfloat162> {
+    struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR> {
         static constexpr int I = I_;
         static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR;
+        static constexpr int ne = I * J / WARP_SIZE;

-#if defined(AMD_WMMA_AVAILABLE)
-        static constexpr int ne = I * J / 32;
         nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

+#if defined(AMD_WMMA_AVAILABLE)
         static constexpr __device__ bool supported() {
             if (I == 16 && J == 8) return true;
             return false;
@@ -367,9 +394,6 @@ namespace ggml_cuda_mma {
             }
         }
 #else
-        static constexpr int ne = I * J / WARP_SIZE;
-        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
-
         static constexpr __device__ bool supported() {
             if (I == 8 && J == 8) return true;
             if (I == 16 && J == 4) return true;
@@ -381,9 +405,9 @@ namespace ggml_cuda_mma {
             if constexpr (I == 8 && J == 8) {
                 return threadIdx.x / 4;
             } else if constexpr (I == 16 && J == 4) {
-                return (l * 8) | (threadIdx.x / 4);
+                return (l * 8) + (threadIdx.x / 4);
             } else if constexpr (I == 16 && J == 8) {
-                return ((l % 2) * 8) | (threadIdx.x / 4);
+                return ((l % 2) * 8) + (threadIdx.x / 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -392,11 +416,11 @@ namespace ggml_cuda_mma {

         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 8 && J == 8) {
-                return (l * 4) | (threadIdx.x % 4);
+                return (l * 4) + (threadIdx.x % 4);
             } else if constexpr (I == 16 && J == 4) {
                 return threadIdx.x % 4;
             } else if constexpr (I == 16 && J == 8) {
-                return ((l / 2) * 4) | (threadIdx.x % 4);
+                return ((l / 2) * 4) + (threadIdx.x % 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -405,6 +429,73 @@ namespace ggml_cuda_mma {
 #endif // defined(AMD_WMMA_AVAILABLE)
     };

+    template <int I_, int J_>
+    struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
+        static constexpr int I = I_;
+        static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
+        static constexpr int ne = I * J / (WARP_SIZE/4);
+
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            if (I == 8 && J == 4) return true;
+            return false;
+        }
+
+        static __device__ __forceinline__ int get_i(const int /*l*/) {
+            if constexpr (I == 8 && J == 4) {
+                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return l;
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+    };
+
+    template <int I_, int J_>
+    struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
+        static constexpr int I = I_;
+        static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
+        static constexpr int ne = I * J / (WARP_SIZE/4);
+
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            if (I == 8 && J == 4) return true;
+            return false;
+        }
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return ((l / 2) * 4) + (threadIdx.x % 4);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 8 && J == 4) {
+                return ((threadIdx.x / 16) * 2) + (l % 2);
+            } else {
+                NO_DEVICE_CODE;
+                return -1;
+            }
+        }
+    };
+
+#if defined(TURING_MMA_AVAILABLE)
     template <int I, int J>
     static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
         tile<I, J/2, half2> ret;
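The `get_i`/`get_j` formulas of the two new mirrored 8x4 tiles are easier to audit when enumerated on the host. A sketch replicating them in plain C++ so the mappings can be printed without a GPU; the formulas are copied verbatim from the specializations above, with WARP_SIZE assumed to be 32:

#include <cstdio>

static int i_major_mirrored_i(int lane, int /*l*/) { return ((lane / 16) * 4) + (lane % 4); }
static int i_major_mirrored_j(int /*lane*/, int l) { return l; }

static int j_major_mirrored_i(int lane, int l) { return ((l / 2) * 4) + (lane % 4); }
static int j_major_mirrored_j(int lane, int l) { return ((lane / 16) * 2) + (l % 2); }

int main() {
    // ne == I*J/(WARP_SIZE/4) == 4: each 8-lane subgroup mirrors the whole 8x4 tile.
    for (int lane = 0; lane < 32; lane += 7) { // a few representative lanes
        for (int l = 0; l < 4; ++l) {
            printf("lane %2d l %d  I-major-mirrored -> (%d,%d)  J-major-mirrored -> (%d,%d)\n",
                   lane, l,
                   i_major_mirrored_i(lane, l), i_major_mirrored_j(lane, l),
                   j_major_mirrored_i(lane, l), j_major_mirrored_j(lane, l));
        }
    }
    return 0;
}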
@@ -422,9 +513,26 @@ namespace ggml_cuda_mma {

         return ret;
     }
+#else // Volta
+    template <int I, int J>
+    static __device__ __forceinline__ tile<I, J/2, half2> get_half2(const tile<I, J, float> & tile_float) {
+        tile<I, J/2, half2> ret;
+#pragma unroll
+        for (int l0 = 0; l0 < tile_float.ne; l0 += 4) {
+            ret.x[l0/2 + 0] = make_half2(tile_float.x[l0 + 0], tile_float.x[l0 + 1]);
+            ret.x[l0/2 + 1] = make_half2(tile_float.x[l0 + 2], tile_float.x[l0 + 3]);
+
+            // On Volta FP16 and FP32 tiles have a different memory layout,
+            // for the conversion threads with an offset of 2 need to exchange half their values:
+            ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)] = __shfl_xor_sync(
+                0xFFFFFFFF, ret.x[l0/2 + (((threadIdx.x % 4) / 2) ^ 1)], 2, WARP_SIZE);
+        }
+        return ret;
+    }
+#endif // defined(TURING_MMA_AVAILABLE)

-    template <int I, int J, typename T>
-    static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+    template <int I, int J, typename T, data_layout dl>
+    static __device__ __forceinline__ void load_generic(tile<I, J, T, dl> & t, const T * __restrict__ xs0, const int stride) {
 #if defined(AMD_MFMA_AVAILABLE)
     if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
 #pragma unroll
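The Volta conversion above swaps half of each register pair between lanes whose IDs differ in bit 1. A minimal standalone kernel showing `__shfl_xor_sync` performing exactly that kind of partner exchange:

#include <cstdio>
#include <cuda_runtime.h>

// laneMask == 2 exchanges a register between lanes whose IDs differ only in
// bit 1 (0<->2, 1<->3, 4<->6, ...), which is the pairing used in get_half2.
__global__ void xor_exchange() {
    const int lane = threadIdx.x;
    int v = lane * 100;
    v = __shfl_xor_sync(0xFFFFFFFF, v, 2, 32);
    if (lane < 8) {
        printf("lane %d now holds %d\n", lane, v); // lane 0 -> 200, lane 2 -> 0, ...
    }
}

int main() {
    xor_exchange<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}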
@@ -443,17 +551,33 @@ namespace ggml_cuda_mma {
     } else if constexpr (std::is_same_v<T, int>) {
         if constexpr (I == 16 && J == 4) {
             int64_t * xi = (int64_t *) t.x;
+#if defined(RDNA4)
             const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
             xi[0] = xs[0];
+#elif defined(RDNA3)
+            static_assert(tile<I,J,T>::ne >= 4, "fragment too small");
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
+            xi[0] = xs[0];
+            xi[1] = xs[1];
+#endif // defined(RDNA4)
         } else if constexpr (I == 16 && J == 8) {
             int64_t * xi = (int64_t *) t.x;
+#if defined(RDNA4)
             const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
             xi[0] = xs[0];

             const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
             xi[1] = xs1[0];
+#elif defined(RDNA3)
+            static_assert(tile<I,J,T>::ne >= 8, "fragment too small");
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
+            // contiguous four 64-bit chunks per lane for the wider RDNA3 fragment
+            xi[0] = xs[0];
+            xi[1] = xs[1];
+            const int64_t * xs1 = xs + 2;
+            xi[2] = xs1[0];
+            xi[3] = xs1[1];
+#endif // defined(RDNA4)
         } else {
             NO_DEVICE_CODE;
         }
@@ -511,18 +635,6 @@ namespace ggml_cuda_mma {
             : "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
             : "l"(xs));
 #else
-#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-        GGML_UNUSED_VARS(t, xs0, stride);
-        NO_DEVICE_CODE;
-#else
-        load_generic(t, xs0, stride);
-#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
-#endif // TURING_MMA_AVAILABLE
-    }
-
-    template <typename T>
-    static __device__ __forceinline__ void load_ldmatrix(
-        tile<32, 8, T> & t, const T * __restrict__ xs0, const int stride) {
 #if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
 #if 1
         // TODO: more generic handling
@@ -533,9 +645,31 @@ namespace ggml_cuda_mma {
         load_generic(t, xs0, stride);
 #endif // 1
 #else
-        tile<16, 8, T> * t16 = (tile<16, 8, T> *) &t;
-        load_ldmatrix(t16[0], xs0 +  0*stride, stride);
-        load_ldmatrix(t16[1], xs0 + 16*stride, stride);
+        load_generic(t, xs0, stride);
+#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+#endif // TURING_MMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
+        ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & t, const half2 * __restrict__ xs0, const int stride) {
+#pragma unroll
+        for (int l0 = 0; l0 < t.ne; l0 += 2) {
+            ggml_cuda_memcpy_1<2*sizeof(half2)>(t.x + l0, xs0 + t.get_i(l0)*stride + t.get_j(l0));
+        }
+    }
+
+    static __device__ __forceinline__ void load_ldmatrix(
+            tile<32, 4, half2> & t, const half2 * __restrict__ xs0, const int stride) {
+#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+        ggml_cuda_memcpy_1<4*sizeof(half2)>(t.x, xs0 + t.get_i(0)*stride);
+#else
+        GGML_UNUSED_VARS(t, xs0, stride);
+        NO_DEVICE_CODE;
 #endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
     }
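The new loaders lean on `ggml_cuda_memcpy_1<N>`, which copies N bytes as a single vectorized transaction. A sketch of such a helper in the same spirit; the exact ggml implementation may differ, but the idea is to turn a small compile-time-sized copy into one aligned vector load/store:

#include <cuda_fp16.h>
#include <cuda_runtime.h>

template <int nbytes>
static __device__ __forceinline__ void memcpy_1(void * dst, const void * src) {
    static_assert(nbytes == 4 || nbytes == 8 || nbytes == 16, "unsupported size");
    if constexpr (nbytes == 4) {
        *(int  *) dst = *(const int  *) src;
    } else if constexpr (nbytes == 8) {
        *(int2 *) dst = *(const int2 *) src;
    } else {
        *(int4 *) dst = *(const int4 *) src;
    }
}

__global__ void copy_row(half2 * dst, const half2 * src) {
    // 4 half2 values == 16 bytes: one int4 transaction per thread.
    memcpy_1<4*sizeof(half2)>(dst + 4*threadIdx.x, src + 4*threadIdx.x);
}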
||||||
|
|
@ -747,12 +881,14 @@ namespace ggml_cuda_mma {
|
||||||
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
|
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
|
||||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||||
#elif defined(AMD_WMMA_AVAILABLE)
|
#elif defined(AMD_WMMA_AVAILABLE)
|
||||||
|
#if defined(RDNA4)
|
||||||
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
|
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
|
||||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||||
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
|
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
|
||||||
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
|
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
|
||||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
|
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||||
|
#endif // RDNA4
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED_VARS(D, A, B);
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
|
|
@ -762,12 +898,14 @@ namespace ggml_cuda_mma {
|
||||||
static __device__ __forceinline__ void mma(
|
static __device__ __forceinline__ void mma(
|
||||||
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
|
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
|
||||||
#if defined(AMD_WMMA_AVAILABLE)
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
#if defined(RDNA4)
|
||||||
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
|
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
|
||||||
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
using floatx8_t = __attribute__((ext_vector_type(8))) float;
|
||||||
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
|
||||||
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
|
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
|
||||||
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
|
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
|
||||||
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
|
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
|
||||||
|
#endif // RDNA4
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED_VARS(D, A, B);
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
|
|
@@ -796,14 +934,14 @@ namespace ggml_cuda_mma {
 #endif // defined(CDNA3)

 #elif defined(AMD_WMMA_AVAILABLE)
-    using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
-    int32x2_t * a_vec = (int32x2_t *) A.x;
-    int32x2_t * b_vec = (int32x2_t *) B.x;
-
     using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
     int32x8_t * acc = (int32x8_t *) D.x;

 #if defined(RDNA4)
+    using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
+    int32x2_t * a_vec = (int32x2_t *) A.x;
+    int32x2_t * b_vec = (int32x2_t *) B.x;
+
     acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
         true,
@@ -822,7 +960,30 @@ namespace ggml_cuda_mma {
         acc[0],
         true
     );
-#endif // defined(RDNA4)
+#elif defined(RDNA3)
+    using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+    int32x4_t * a_vec = (int32x4_t *) A.x;
+    int32x4_t * b_vec = (int32x4_t *) B.x;
+
+    acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
+        true,
+        a_vec[0],
+        true,
+        b_vec[0],
+        acc[0],
+        true
+    );
+
+    acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
+        true,
+        a_vec[1],
+        true,
+        b_vec[1],
+        acc[0],
+        true
+    );
+#endif // RDNA4
 #else
     GGML_UNUSED_VARS(D, A, B);
|
||||||
template <typename T1, typename T2, int J, int K>
|
template <typename T1, typename T2, int J, int K>
|
||||||
static __device__ __forceinline__ void mma(
|
static __device__ __forceinline__ void mma(
|
||||||
tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
|
tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
|
||||||
tile<16, J, T1> * D16 = (tile<16, J, T1> *) &D;
|
tile <16, J, T1> * D16 = reinterpret_cast< tile<16, J, T1> *>(&D);
|
||||||
tile<16, K, T2> * A16 = (tile<16, K, T2> *) &A;
|
const tile<16, K, T2> * A16 = reinterpret_cast<const tile<16, K, T2> *>(&A);
|
||||||
mma(D16[0], A16[0], B);
|
mma(D16[0], A16[0], B);
|
||||||
mma(D16[1], A16[1], B);
|
mma(D16[1], A16[1], B);
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ void mma(
|
static __device__ __forceinline__ void mma(
|
||||||
tile<32, 8, float> & D, const tile<32, 8, half2> & A, const tile<8, 8, half2> & B) {
|
tile<32, 8, float> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> & B) {
|
||||||
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||||
const int * Axi = (const int *) A.x;
|
const int * Axi = (const int *) A.x;
|
||||||
const int * Bxi = (const int *) B.x;
|
const int * Bxi = (const int *) B.x;
|
||||||
|
|
@ -880,32 +1041,42 @@ namespace ggml_cuda_mma {
|
||||||
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
|
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
|
||||||
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
||||||
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
|
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
|
||||||
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
|
|
||||||
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
|
|
||||||
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
|
||||||
: "r"(Axi[4]), "r"(Axi[5]), "r"(Bxi[4]), "r"(Bxi[5]));
|
|
||||||
asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
|
|
||||||
"{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
|
|
||||||
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
|
|
||||||
: "r"(Axi[6]), "r"(Axi[7]), "r"(Bxi[6]), "r"(Bxi[7]));
|
|
||||||
#else
|
#else
|
||||||
tile <16, 8, float> * D16 = reinterpret_cast<tile <16, 8, float> *>(&D);
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
const tile<16, 8, half2> * A16 = reinterpret_cast<const tile<16, 8, half2> *>(&A);
|
NO_DEVICE_CODE;
|
||||||
mma(D16[0], A16[0], B);
|
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||||
mma(D16[1], A16[1], B);
|
}
|
||||||
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
|
||||||
|
static __device__ __forceinline__ void mma(
|
||||||
|
tile<32, 4, half2> & D, const tile<32, 4, half2> & A, const tile<8, 4, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> & B) {
|
||||||
|
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
|
||||||
|
const int * Axi = (const int *) A.x;
|
||||||
|
const int * Bxi = (const int *) B.x;
|
||||||
|
int * Dxi = (int *) D.x;
|
||||||
|
asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
|
||||||
|
"{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
|
||||||
|
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
|
||||||
|
: "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]), "r"(Bxi[1]));
|
||||||
|
asm("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 "
|
||||||
|
"{%0, %1, %2, %3}, {%4, %5}, {%6, %7}, {%0, %1, %2, %3};"
|
||||||
|
: "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3])
|
||||||
|
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
|
||||||
|
#else
|
||||||
|
GGML_UNUSED_VARS(D, A, B);
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ void mma(
|
static __device__ __forceinline__ void mma(
|
||||||
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
|
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
|
||||||
#if defined(AMD_WMMA_AVAILABLE)
|
#if defined(AMD_WMMA_AVAILABLE)
|
||||||
|
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
||||||
|
int32x8_t * acc = (int32x8_t *) D.x;
|
||||||
|
#if defined(RDNA4)
|
||||||
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
|
||||||
int32x2_t * a_vec = (int32x2_t *) A.x;
|
int32x2_t * a_vec = (int32x2_t *) A.x;
|
||||||
int32x2_t * b_vec = (int32x2_t *) B.x;
|
int32x2_t * b_vec = (int32x2_t *) B.x;
|
||||||
|
|
||||||
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
|
|
||||||
int32x8_t * acc = (int32x8_t *) D.x;
|
|
||||||
|
|
||||||
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
|
||||||
true,
|
true,
|
||||||
a_vec[0],
|
a_vec[0],
|
||||||
|
|
@ -914,12 +1085,25 @@ static __device__ __forceinline__ void mma(
|
||||||
acc[0],
|
acc[0],
|
||||||
false
|
false
|
||||||
);
|
);
|
||||||
|
#elif defined(RDNA3)
|
||||||
|
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
|
||||||
|
int32x4_t * a_vec = (int32x4_t *) A.x;
|
||||||
|
int32x4_t * b_vec = (int32x4_t *) B.x;
|
||||||
|
|
||||||
|
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
|
||||||
|
true,
|
||||||
|
a_vec[0],
|
||||||
|
true,
|
||||||
|
b_vec[0],
|
||||||
|
acc[0],
|
||||||
|
false
|
||||||
|
);
|
||||||
|
#endif // RDNA4
|
||||||
#else
|
#else
|
||||||
GGML_UNUSED(D);
|
GGML_UNUSED(D);
|
||||||
GGML_UNUSED(A);
|
GGML_UNUSED(A);
|
||||||
GGML_UNUSED(B);
|
GGML_UNUSED(B);
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
#endif
|
#endif // AMD_WMMA_AVAILABLE
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
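For readers unfamiliar with the AMD WMMA builtins used above: each call multiplies a 16x16 int8 tile by a 16x16 int8 tile and adds into a 16x16 int32 accumulator; the boolean arguments select signed operands, and the trailing flag selects saturating accumulation (per our reading of the builtin). Below is a minimal scalar model of one such accumulate, assuming row-major A and a B packing indexed by output column; this is an illustration, not the wave-distributed register layout the hardware actually uses.

#include <cstdint>

// Scalar reference of a single 16x16x16 int8 WMMA accumulate: C += A * B^T.
// Assumptions: signed 8-bit inputs, 32-bit accumulators, no saturation.
static void wmma_iu8_16x16x16_ref(const int8_t A[16][16], const int8_t B[16][16], int32_t C[16][16]) {
    for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
            int32_t acc = C[i][j];
            for (int k = 0; k < 16; ++k) {
                acc += (int32_t) A[i][k] * (int32_t) B[j][k]; // B indexed by output column
            }
            C[i][j] = acc;
        }
    }
}

Chaining two builtin calls through the accumulator, as the RDNA3 path does with a_vec[0]/b_vec[0] followed by a_vec[1]/b_vec[1], simply extends this K reduction across the two halves of the operand registers.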
@@ -160,9 +160,9 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
         case GGML_TYPE_F32:
             return ampere_mma_available(cc);
         case GGML_TYPE_F16:
-            return volta_mma_available(cc) || turing_mma_available(cc) || amd_wmma_available(cc);
+            return volta_mma_available(cc) || turing_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
         case GGML_TYPE_BF16:
-            return ampere_mma_available(cc) || amd_wmma_available(cc);
+            return ampere_mma_available(cc) || (amd_wmma_available(cc) && GGML_CUDA_CC_IS_RDNA4(cc));
         default:
             return false;
     }
@@ -37,23 +37,19 @@ static __global__ void mul_mat_f(
     typedef tile<16, 8, T>            tile_A;
     typedef tile<tile_B_I, 8, T>      tile_B;
     typedef tile<16, tile_C_J, float> tile_C;

-    constexpr bool a_supported = tile_A::supported();
-    constexpr bool b_supported = tile_B::supported();
-    constexpr bool c_supported = tile_C::supported();
-    constexpr bool supported   = a_supported && b_supported && c_supported;
 #else
-    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
-    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
-    constexpr bool supported      = I_16_supported || I_32_supported;
-
-    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
-
-    typedef tile<I_preferred, 8, T>     tile_A;
+#ifdef VOLTA_MMA_AVAILABLE
+    if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
+    typedef tile<32, 4, T,     DATA_LAYOUT_I_MAJOR>          tile_A;
+    typedef tile< 8, 4, T,     DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
+    typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR>          tile_C;
+#else
+    typedef tile<16, 8, T>     tile_A;
     typedef tile<8, 8, T>      tile_B;
-    typedef tile<I_preferred, 8, float> tile_C;
-
+    typedef tile<16, 8, float> tile_C;
+#endif // VOLTA_MMA_AVAILABLE
 #endif // defined(AMD_WMMA_AVAILABLE)
-    if constexpr (!supported) {
+    if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
         NO_DEVICE_CODE;
         return;
     }

@@ -248,6 +244,9 @@ static __global__ void mul_mat_f(
             }
         }
     }
+#ifdef VOLTA_MMA_AVAILABLE
+    }
+#endif //VOLTA_MMA_AVAILABLE
 #else
     GGML_UNUSED_VARS(x, y, ids, dst,
         ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,

@@ -278,27 +277,24 @@ static __global__ void mul_mat_f_ids(
     typedef tile<16, 8, T>            tile_A;
     typedef tile<tile_B_I, 8, T>      tile_B;
     typedef tile<16, tile_C_J, float> tile_C;

-    constexpr bool a_supported = tile_A::supported();
-    constexpr bool b_supported = tile_B::supported();
-    constexpr bool c_supported = tile_C::supported();
-    constexpr bool supported   = a_supported && b_supported && c_supported;
 #else
-    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
-    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
-    constexpr bool supported      = I_16_supported || I_32_supported;
-
-    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
-
-    typedef tile<I_preferred, 8, T>     tile_A;
+#ifdef VOLTA_MMA_AVAILABLE
+    if constexpr (!std::is_same_v<T, half2>) {NO_DEVICE_CODE;} else {
+    typedef tile<32, 4, T,     DATA_LAYOUT_I_MAJOR>          tile_A;
+    typedef tile< 8, 4, T,     DATA_LAYOUT_I_MAJOR_MIRRORED> tile_B;
+    typedef tile<32, 8, float, DATA_LAYOUT_I_MAJOR>          tile_C;
+#else
+    typedef tile<16, 8, T>     tile_A;
     typedef tile<8, 8, T>      tile_B;
-    typedef tile<I_preferred, 8, float> tile_C;
-
+    typedef tile<16, 8, float> tile_C;
+#endif // VOLTA_MMA_AVAILABLE
 #endif // defined(AMD_WMMA_AVAILABLE)
-    if constexpr (!supported) {
+    if constexpr (!tile_A::supported() || !tile_B::supported() || !tile_C::supported()) {
         NO_DEVICE_CODE;
         return;
     }

     constexpr int warp_size     = ggml_cuda_get_physical_warp_size();
     constexpr int tile_k_padded = warp_size + 4;
     constexpr int ntA           = rows_per_block / tile_A::I;

@@ -517,6 +513,9 @@ static __global__ void mul_mat_f_ids(
             }
         }
     }
+#ifdef VOLTA_MMA_AVAILABLE
+    }
+#endif // VOLTA_MMA_AVAILABLE
 #else
     GGML_UNUSED_VARS(x, y, ids_src_compact, ids_dst_compact, expert_bounds, dst,
         ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
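The reworked guard replaces a precomputed `supported` constant with direct `tile_X::supported()` queries inside `if constexpr`, so each (arch, type, layout) instantiation either compiles the full kernel body or collapses to NO_DEVICE_CODE before any invalid MMA instruction is emitted. A hedged sketch of the pattern with a stand-in tile type (the real `tile` template lives elsewhere in ggml-cuda):

// Illustrative only: a stand-in tile that reports compile-time support.
template <int I, int J, typename T>
struct tile_sketch {
    static constexpr bool supported() {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
        return true;  // e.g. a tensor-core-capable arch
#else
        return false; // older arch: the kernel body below is discarded
#endif
    }
};

template <typename T>
static __global__ void kernel_sketch() {
    typedef tile_sketch<16, 8, T> tile_A;
    if constexpr (!tile_A::supported()) {
        return; // stands in for NO_DEVICE_CODE; dead branch is compiled out
    }
    // ... MMA work would follow here, compiled only when supported ...
}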
@@ -307,10 +307,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
     }

     if (amd_wmma_available(cc)) {
-        if (GGML_CUDA_CC_IS_RDNA4(cc)) {
-            return true;
-        }
+        return true;
     }

-    return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
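The simplification above folds RDNA3 into the unconditional WMMA fast path, so the DP4A batch-size cutoff now only applies to CDNA. The shape of the heuristic, as a hedged standalone sketch (names illustrative, not the real call sites):

// Sketch of the gating logic: archs with a fast MMA path always take MMQ;
// otherwise the dp4a path is only profitable below a batch-size threshold.
static bool should_use_mmq_sketch(bool has_fast_mma, bool is_cdna, long ne11, long max_dp4a_batch) {
    if (has_fast_mma) {
        return true; // tensor-core MMQ wins at any batch size
    }
    return !is_cdna || ne11 < max_dp4a_batch; // dp4a only up to the threshold
}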
@@ -1542,8 +1542,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
                 tile_C Cm;
                 if (k01 >= MMQ_TILE_NE_K * 3/4) {
                     tile_A A1;
-                    A1.x[0] = 0x01010101;
-                    A1.x[1] = 0x01010101;
+#pragma unroll
+                    for (int l = 0; l < tile_A::ne; ++l) {
+                        A1.x[l] = 0x01010101;
+                    }
                     mma(Cm, A1, B);
                 }
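The constant 0x01010101 packs four int8 ones into one 32-bit word, so A1 is an all-ones tile and `mma(Cm, A1, B)` yields plain sums over B's K dimension (the q2_K correction term); the loop over `tile_A::ne` just generalizes the fill to any tile size. A scalar model of the packed building block:

#include <cstdint>

// Reference for a dp4a-style packed int8 dot product: four signed bytes of a
// times four signed bytes of b, added into acc.
static int32_t dp4a_ref(uint32_t a_packed, uint32_t b_packed, int32_t acc) {
    for (int i = 0; i < 4; ++i) {
        const int8_t a = (int8_t) ((a_packed >> (8 * i)) & 0xFF);
        const int8_t b = (int8_t) ((b_packed >> (8 * i)) & 0xFF);
        acc += (int32_t) a * (int32_t) b;
    }
    return acc;
}
// dp4a_ref(0x01010101, b, 0) == sum of b's four signed bytes.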
@@ -1,9 +1,17 @@
 #include "pad.cuh"

+#include <stdint.h>
+
+__device__ __forceinline__ int64_t wrap_around(int64_t coord, int64_t size) {
+    // + size ensures negatives are handled properly
+    return (coord + size) % size;
+}
+
 static __global__ void pad_f32(const float * src, float * dst,
     const int lp0, const int rp0, const int lp1, const int rp1,
     const int lp2, const int rp2, const int lp3, const int rp3,
-    const int ne0, const int ne1, const int ne2, const int ne3) {
+    const int ne0, const int ne1, const int ne2, const int ne3,
+    const bool circular) {
     // blockIdx.z: i3*ne2+i2
     // blockIdx.y: i1
     // blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE

@@ -12,15 +20,15 @@ static __global__ void pad_f32(const float * src, float * dst,
     int i1 = blockIdx.y;
     int i2 = blockIdx.z % ne2;
     int i3 = blockIdx.z / ne2;

     if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
         return;
     }

-    // operation
     const int64_t dst_idx = i3 * (ne0 * ne1 * ne2) + i2 * (ne0 * ne1) + i1 * ne0 + i0;
-    if ((i0 >= lp0 && i0 < ne0 - rp0) &&
-        (i1 >= lp1 && i1 < ne1 - rp1) &&
-        (i2 >= lp2 && i2 < ne2 - rp2) &&
-        (i3 >= lp3 && i3 < ne3 - rp3)) {
+
+    if (!circular) {
+        if ((i0 >= lp0 && i0 < ne0 - rp0) && (i1 >= lp1 && i1 < ne1 - rp1) && (i2 >= lp2 && i2 < ne2 - rp2) &&
+            (i3 >= lp3 && i3 < ne3 - rp3)) {
             const int64_t i00 = i0 - lp0;
             const int64_t i01 = i1 - lp1;

@@ -37,14 +45,35 @@ static __global__ void pad_f32(const float * src, float * dst,
             dst[dst_idx] = 0.0f;
         }
     }
+    // circular means on a torus, so x and y wrap around
+    else {
+        const int64_t ne00 = ne0 - lp0 - rp0;
+        const int64_t ne01 = ne1 - lp1 - rp1;
+        const int64_t ne02 = ne2 - lp2 - rp2;
+        const int64_t ne03 = ne3 - lp3 - rp3;
+
+        const int64_t i00 = wrap_around(i0 - lp0, ne00);
+        const int64_t i01 = wrap_around(i1 - lp1, ne01);
+        const int64_t i02 = wrap_around(i2 - lp2, ne02);
+        const int64_t i03 = wrap_around(i3 - lp3, ne03);
+
+        const int64_t src_idx = i03 * (ne00 * ne01 * ne02) + i02 * (ne00 * ne01) + i01 * ne00 + i00;
+
+        dst[dst_idx] = src[src_idx];
+    }
+}

 static void pad_f32_cuda(const float * src, float * dst,
     const int lp0, const int rp0, const int lp1, const int rp1,
     const int lp2, const int rp2, const int lp3, const int rp3,
-    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
+    const int ne0, const int ne1, const int ne2, const int ne3,
+    const bool circular, cudaStream_t stream) {
     int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2 * ne3);
-    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3);
+    pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst,
+        lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
+        ne0, ne1, ne2, ne3, circular);
 }

 void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

@@ -65,8 +94,10 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int32_t rp2 = ((const int32_t *) (dst->op_params))[5];
     const int32_t lp3 = ((const int32_t *) (dst->op_params))[6];
     const int32_t rp3 = ((const int32_t *) (dst->op_params))[7];
+    const int32_t circular = ((const int32_t *) (dst->op_params))[8];

     pad_f32_cuda(src0_d, dst_d,
         lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
-        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
+        dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
+        (bool) circular, stream);
 }
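The new circular mode treats the unpadded source as periodic, so every destination coordinate maps back into the source through wrap_around. A host-side 1D reference of the same semantics; note that the kernel's `(coord + size) % size` is sufficient as long as the left padding does not exceed the source extent, while the double-modulo below is fully general:

#include <cstdint>
#include <vector>

// Fully general wrap for any negative coordinate.
static int64_t wrap_around_ref(int64_t coord, int64_t size) {
    return ((coord % size) + size) % size;
}

// 1D circular padding: output index i reads src at (i - lp) wrapped into [0, ne_src).
static std::vector<float> pad_circular_1d(const std::vector<float> & src, int lp, int rp) {
    const int64_t ne_src = (int64_t) src.size();
    std::vector<float> dst(src.size() + lp + rp);
    for (int64_t i = 0; i < (int64_t) dst.size(); ++i) {
        dst[i] = src[wrap_around_ref(i - lp, ne_src)];
    }
    return dst;
}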
@@ -3,7 +3,6 @@
 #include "solve_tri.cuh"

 #define MAX_N_FAST 64
-#define MAX_K_FAST 32

 // ======================
 // Fast Kernel (n <= 64, k <= 32) - Warp-based parallel reduction

@@ -48,65 +47,58 @@ static __global__ void solve_tri_f32_fast(const float * __restrict__ A,
     float * X_batch = (float *) (X + i02 * nb2 + i03 * nb3);

     __shared__ float sA[MAX_N_FAST * MAX_N_FAST];
-    __shared__ float sXt[MAX_N_FAST * (MAX_K_FAST + 1)];

     const int offset = threadIdx.x + threadIdx.y * blockDim.x;

 #pragma unroll
     for (int i = 0; i < n * n; i += k * WARP_SIZE) {
-        int i0 = i + offset;
+        const int i0 = i + offset;
         if (i0 < n * n) {
             sA[i0] = A_batch[i0];
         }
     }

-    const int rows_per_warp = (n + WARP_SIZE - 1) / WARP_SIZE;
-
-#pragma unroll
-    for (int i = 0; i < rows_per_warp; i++) {
-        const int i0 = lane + i * WARP_SIZE;
-        if (i0 < n) {
-            sXt[col_idx * n + i0] = B_batch[i0 * k + col_idx];
-        }
-    }
-
     __syncthreads();

+    float x_low  = (lane < n) ? B_batch[lane * k + col_idx] : 0.0f;
+    float x_high = (WARP_SIZE + lane < n) ? B_batch[(WARP_SIZE + lane) * k + col_idx] : 0.0f;
+
+    const int half      = WARP_SIZE;
+    const int nrows_low = (n < half) ? n : half;
+
 #pragma unroll
-    for (int row = 0; row < n; ++row) {
+    for (int row = 0; row < nrows_low; ++row) {
         float sum = 0.0f;
-        {
-            int j = lane;
-            if (j < row) {
-                sum += sA[row * n + j] * sXt[col_idx * n + j];
-            }
-        }
-        if (row >= WARP_SIZE) {
-            int j = WARP_SIZE + lane;
-            if (j < row) {
-                sum += sA[row * n + j] * sXt[col_idx * n + j];
-            }
-        }
+        if (lane < row) {
+            sum += sA[row * n + lane] * x_low;
+        }

         sum = warp_reduce_sum(sum);

-        if (lane == 0) {
-            const float b_val  = sXt[col_idx * n + row];
-            const float a_diag = sA[row * n + row];
-            // no safeguards for division by zero because that indicates corrupt
-            // data anyway
-            sXt[col_idx * n + row] = (b_val - sum) / a_diag;
+        if (lane == row) {
+            x_low = (x_low - sum) / sA[row * n + row];
         }
     }

-    __syncthreads();
-
 #pragma unroll
-    for (int i = 0; i < rows_per_warp; i++) {
-        const int i0 = lane + i * WARP_SIZE;
-        if (i0 < n) {
-            X_batch[i0 * k + col_idx] = sXt[col_idx * n + i0];
+    for (int row = half; row < n; ++row) {
+        float sum = sA[row * n + lane] * x_low;
+        const int j = half + lane;
+        if (j < row) {
+            sum += sA[row * n + j] * x_high;
+        }
+        sum = warp_reduce_sum(sum);
+
+        if (lane == row - half) {
+            x_high = (x_high - sum) / sA[row * n + row];
+        }
+    }
+
+#pragma unroll
+    for (int rr = 0; rr < 2; ++rr) {
+        const int row = rr * WARP_SIZE + lane;
+        if (row < n) {
+            const float val = (row < half) ? x_low : x_high;
+            X_batch[row * k + col_idx] = val;
         }
     }
 }
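The rewrite keeps the per-column solution in two registers per lane (x_low for rows 0..31, x_high for rows 32..63) instead of a shared-memory transpose, but the math is standard forward substitution. A CPU reference of the recurrence being evaluated:

#include <vector>

// Solve L*x = b for a lower-triangular, row-major n*n matrix A:
// x[row] = (b[row] - sum_{j<row} A[row][j] * x[j]) / A[row][row].
static std::vector<float> solve_lower_tri_ref(const std::vector<float> & A,
                                              const std::vector<float> & b, int n) {
    std::vector<float> x(n);
    for (int row = 0; row < n; ++row) {
        float sum = 0.0f;
        for (int j = 0; j < row; ++j) {
            sum += A[row * n + j] * x[j];
        }
        // no safeguard against a zero diagonal, matching the kernel
        x[row] = (b[row] - sum) / A[row * n + row];
    }
    return x;
}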
@@ -0,0 +1,136 @@
+#include "common.cuh"
+#include "convert.cuh"
+#include "tri.cuh"
+#include "ggml.h"
+
+template<typename T, bool prefix_keep, int add_to_split>
+static __global__ void tri_kernel(
+        const T * src, T * dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
+        const int64_t nb0,  const int64_t nb1,  const int64_t nb2,  const int64_t nb3) {
+    const int64_t i3 = blockIdx.z;
+    const int64_t i2 = blockIdx.y;
+    const int64_t i1 = blockIdx.x;
+    const int64_t split_point = i1 + add_to_split;
+
+    GGML_UNUSED_VARS(nb00, nb0);
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    const T * src_row = src + i1*nb01 + i2*nb02 + i3*nb03;
+    T       * dst_row = dst + i1*nb1  + i2*nb2  + i3*nb3;
+
+    if constexpr (prefix_keep) {
+        for (int64_t i0 = threadIdx.x; i0 < split_point; i0 += blockDim.x) {
+            dst_row[i0] = src_row[i0];
+        }
+        for (int64_t i0 = threadIdx.x + split_point; i0 < ne00; i0 += blockDim.x) {
+            dst_row[i0] = ggml_cuda_cast<T, float>(0.0f);
+        }
+    } else {
+        for (int64_t i0 = threadIdx.x; i0 < split_point; i0 += blockDim.x) {
+            dst_row[i0] = ggml_cuda_cast<T, float>(0.0f);
+        }
+        for (int64_t i0 = threadIdx.x + split_point; i0 < ne00; i0 += blockDim.x) {
+            dst_row[i0] = src_row[i0];
+        }
+    }
+}
+
+template<typename T>
+static void tri_cuda(
+        const T * src, T * dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
+        const int64_t nb0,  const int64_t nb1,  const int64_t nb2,  const int64_t nb3,
+        const ggml_tri_type ttype,
+        cudaStream_t stream) {
+
+    dim3 block_dims(CUDA_TRI_BLOCK_SIZE, 1, 1);
+    dim3 grid_dims(ne01, ne02, ne03);
+    const size_t type_size = sizeof(T);
+
+    const int  add_to_split = (ttype == GGML_TRI_TYPE_LOWER_DIAG || ttype == GGML_TRI_TYPE_UPPER) ? 1 : 0;
+    const bool prefix_keep  = (ttype == GGML_TRI_TYPE_LOWER || ttype == GGML_TRI_TYPE_LOWER_DIAG);
+
+    if (prefix_keep) {
+        if (add_to_split == 0) {
+            tri_kernel<T, true, 0><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst,
+                ne00, ne01, ne02, ne03,
+                nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
+                nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
+            );
+        } else { // only 0 and 1 supported
+            tri_kernel<T, true, 1><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst,
+                ne00, ne01, ne02, ne03,
+                nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
+                nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
+            );
+        }
+    } else {
+        if (add_to_split == 0) {
+            tri_kernel<T, false, 0><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst,
+                ne00, ne01, ne02, ne03,
+                nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
+                nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
+            );
+        } else {
+            tri_kernel<T, false, 1><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst,
+                ne00, ne01, ne02, ne03,
+                nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
+                nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
+            );
+        }
+    }
+}
+
+void ggml_cuda_op_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    const ggml_tri_type ttype = static_cast<ggml_tri_type>(ggml_get_op_params_i32(dst, 0));
+
+    GGML_ASSERT(src0->type == dst->type);
+
+    switch(src0->type) {
+        case GGML_TYPE_F32:
+            {
+                tri_cuda(
+                    (const float *)src0->data, (float *)dst->data,
+                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
+                    ttype, stream
+                );
+            } break;
+        case GGML_TYPE_F16:
+            {
+                tri_cuda(
+                    (const half *)src0->data, (half *)dst->data,
+                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
+                    ttype, stream
+                );
+            } break;
+        case GGML_TYPE_BF16:
+            {
+                tri_cuda(
+                    (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
+                    src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
+                    src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                    dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
+                    ttype, stream
+                );
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+}

@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_TRI_BLOCK_SIZE 256
+
+void ggml_cuda_op_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
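tri_kernel reduces all four triangle variants to one split point per row: split_point = i1 + add_to_split, with prefix_keep deciding whether columns before the split are copied or zeroed. Reading the dispatch in tri_cuda, LOWER keeps the strict lower part, LOWER_DIAG includes the diagonal, and the UPPER variants mirror this. A per-row CPU model of the masking:

// For row i1: keep columns [0, split_point) when prefix_keep, zero them otherwise.
static void tri_row_ref(const float * src_row, float * dst_row, long ne00,
                        long i1, bool prefix_keep, int add_to_split) {
    const long split_point = i1 + add_to_split;
    for (long i0 = 0; i0 < ne00; ++i0) {
        const bool in_prefix = i0 < split_point;
        dst_row[i0] = (in_prefix == prefix_keep) ? src_row[i0] : 0.0f;
    }
}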
@@ -24,9 +24,6 @@ struct ggml_metal_command_buffer {
 };

 struct ggml_metal {
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
-
     ggml_metal_device_t  dev;
     ggml_metal_library_t lib;

@@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
     // init context
     ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));

-    res->device = ggml_metal_device_get_obj(dev);
+    id<MTLDevice> device = ggml_metal_device_get_obj(dev);

-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

     // TODO: would it be better to have one queue for the backend and one queue for the device?
     //       the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
     //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
-    res->queue = ggml_metal_device_get_queue(dev);
-    if (res->queue == nil) {
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
+    if (queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
     }

@@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
 void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     @autoreleasepool {
         // wrap the source data into a Metal buffer
-        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                  length:size
                                 options:MTLResourceStorageModeShared];

@@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,

         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

         [encoder copyFromBuffer:buf_src

@@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,

 void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     @autoreleasepool {
-        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                  length:size
                                 options:MTLResourceStorageModeShared
                             deallocator:nil];

@@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te

         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

         [encoder copyFromBuffer:bid_src.metal

@@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // number of threads in addition to the main thread
     const int n_cb = ctx->n_cb;

+    // keep the memory wired
+    ggml_metal_device_rsets_keep_alive(ctx->dev);
+
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes

@@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *

     if (!ctx->capture_started) {
         // create capture scope
-        ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];

         MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
         descriptor.captureObject = ctx->capture_scope;

@@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
         }
     }

+    // short-hand
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+
     // the main thread commits the first few commands immediately
     // cmd_buf[n_cb]
     {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
         [cmd_buf retain];

         if (ctx->cmd_bufs[n_cb].obj) {

@@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // prepare the rest of the command buffers asynchronously (optional)
     // cmd_buf[0.. n_cb)
     for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
         [cmd_buf retain];

         if (ctx->cmd_bufs[cb_idx].obj) {

@@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
 }

 bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
-    GGML_ASSERT(ctx->device != nil);
+    GGML_ASSERT(ctx->dev != nil);

-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+    id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+
+    return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }

 void ggml_metal_capture_next_compute(ggml_metal_t ctx) {

(File diff suppressed because it is too large.)
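The common thread through the hunks above: the Metal context no longer caches its device and queue handles; every call re-fetches them from the owning ggml_metal_device_t, leaving a single source of truth. A hedged C++ sketch of the ownership pattern (names illustrative, not the real API):

// Minimal model: the context caches only the owner, never the owner's resources.
struct device_t {
    void * queue = nullptr; // stands in for id<MTLCommandQueue>
};

static void * device_get_queue(device_t * dev) {
    return dev->queue; // single source of truth
}

struct context_t {
    device_t * dev = nullptr;
};

static void context_submit(context_t * ctx) {
    void * queue = device_get_queue(ctx->dev); // re-fetched per call, never stale
    (void) queue; // ... encode and commit work here ...
}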
@@ -35,20 +35,6 @@ typedef struct ggml_metal_pipeline * ggml_metal_pipeline_t;
 ggml_metal_pipeline_t ggml_metal_pipeline_init(void);
 void                  ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline);

-void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg);
-int  ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline);
-
-void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0);
-int  ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline);
-
-void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1);
-int  ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline);
-
-void   ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem);
-size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline);
-
-int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline);
-
 // a collection of pipelines
 typedef struct ggml_metal_pipelines * ggml_metal_pipelines_t;

@@ -58,6 +44,19 @@ void ggml_metal_pipelines_free(ggml_metal_pipelines_t ppls);
 void ggml_metal_pipelines_add(ggml_metal_pipelines_t ppls, const char * name, ggml_metal_pipeline_t pipeline);
 ggml_metal_pipeline_t ggml_metal_pipelines_get(ggml_metal_pipelines_t ppls, const char * name);

+struct ggml_metal_pipeline_with_params {
+    ggml_metal_pipeline_t pipeline;
+
+    int nsg;
+
+    int nr0;
+    int nr1;
+
+    size_t smem;
+};
+
+int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline);
+
 //
 // MTLCommandBuffer wrapper
 //

@@ -76,7 +75,7 @@ void ggml_metal_encoder_free(ggml_metal_encoder_t encoder);
 void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name);
 void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder);

-void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline);
+void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline);

 void ggml_metal_encoder_set_bytes (ggml_metal_encoder_t encoder, void * data, size_t size, int idx);
 void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx);

@@ -100,66 +99,67 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev

 void ggml_metal_library_free(ggml_metal_library_t lib);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline    (ggml_metal_library_t lib, const char * name);
-ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline    (ggml_metal_library_t lib, const char * name);
+struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_base              (ggml_metal_library_t lib, enum ggml_op op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cpy               (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pool_2d           (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_get_rows          (ggml_metal_library_t lib, enum ggml_type tsrc);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_set_rows          (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_repeat            (ggml_metal_library_t lib, enum ggml_type tsrc);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_unary             (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_glu               (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum               (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum_rows          (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_blk        (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_add        (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_soft_max          (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv          (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan          (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0    (ggml_metal_library_t lib, int ne02, int ne20);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax            (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort           (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort_merge     (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k             (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k_merge       (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin               (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm           (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm        (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope              (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col            (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d           (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale           (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad               (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d    (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_arange            (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_adamw    (ggml_metal_library_t lib, const struct ggml_tensor * op);
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_sgd      (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base              (ggml_metal_library_t lib, enum ggml_op op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cpy               (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d           (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows          (ggml_metal_library_t lib, enum ggml_type tsrc);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows          (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat            (ggml_metal_library_t lib, enum ggml_type tsrc);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary             (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu               (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum               (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum_rows          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_blk        (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add        (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri               (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv              (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext        (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0    (ggml_metal_library_t lib, int ne02, int ne20);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id         (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argmax            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort           (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge     (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k             (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge       (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin               (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm           (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm        (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope              (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d           (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale           (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad               (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d    (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange            (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw    (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd      (ggml_metal_library_t lib, const struct ggml_tensor * op);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad(
        ggml_metal_library_t lib,
        const struct ggml_tensor * op,
        bool  has_mask,
        int32_t ncpsg);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_blk(
        ggml_metal_library_t lib,
        const struct ggml_tensor * op,
        int32_t nqptg,
        int32_t ncpsg);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
        ggml_metal_library_t lib,
        const struct ggml_tensor * op,
        bool  has_mask,

@@ -169,7 +169,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
        bool  has_kvpad,
        int32_t nsg);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec(
        ggml_metal_library_t lib,
        const struct ggml_tensor * op,
        bool  has_mask,

@@ -180,12 +180,22 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
        int32_t nsg,
        int32_t nwg);

-ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
        ggml_metal_library_t lib,
        const struct ggml_tensor * op,
        int32_t dv,
        int32_t nwg);

+// MTLResidencySet wrapper
+
+typedef void * ggml_metal_rset_t;
+
+// a collection of residency sets (non-owning)
+typedef struct ggml_metal_rsets * ggml_metal_rsets_t;
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void);
+void               ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
+
 //
 // device
 //

@@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue

 ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);

+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
+void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
 bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
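The header refactor replaces per-field getters and setters on the opaque pipeline handle with a plain struct returned by value, so one lookup hands back the pipeline and its dispatch hints atomically and no further synchronization is needed once the call returns. A hedged sketch of the idea; the field meanings are inferred from the removed accessors and the old "suggested dispatch sizes" comment:

// Value type bundling a compiled pipeline with its dispatch parameters.
typedef struct pipeline * pipeline_t; // opaque handle, as in the header

struct pipeline_with_params {
    pipeline_t pipeline;
    int        nsg;  // simdgroups per threadgroup (inferred meaning)
    int        nr0;  // rows per thread, dim 0     (inferred meaning)
    int        nr1;  // rows per thread, dim 1     (inferred meaning)
    size_t     smem; // threadgroup memory, bytes  (inferred meaning)
};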
||||||
|
|
@@ -1,7 +1,6 @@
 #import "ggml-metal-device.h"
 
 #import "ggml-impl.h"
-#import "ggml-threading.h"
 
 #include <Foundation/Foundation.h>
 
@@ -75,14 +74,6 @@ void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
 
 struct ggml_metal_pipeline {
     id<MTLComputePipelineState> obj;
-
-    // suggested dispatch sizes
-    int nsg;
-
-    int nr0;
-    int nr1;
-
-    size_t smem;
 };
 
 ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
 
@@ -90,10 +81,6 @@ ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
 
     *res = (struct ggml_metal_pipeline) {
         /*.obj  =*/ nil,
-        /*.nsg  =*/ 0,
-        /*.nr0  =*/ 0,
-        /*.nr1  =*/ 0,
-        /*.smem =*/ 0,
     };
 
     return res;
 
@@ -105,40 +92,8 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
     free(pipeline);
 }
 
-void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg) {
-    pipeline->nsg = nsg;
-}
-
-int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
-    return pipeline->nsg;
-}
-
-void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
-    pipeline->nr0 = nr0;
-}
-
-int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
-    return pipeline->nr0;
-}
-
-void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
-    pipeline->nr1 = nr1;
-}
-
-int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
-    return pipeline->nr1;
-}
-
-void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
-    pipeline->smem = smem;
-}
-
-size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
-    return pipeline->smem;
-}
-
-int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
-    return pipeline->obj.maxTotalThreadsPerThreadgroup;
+int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
+    return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
 }
 
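With the suggested dispatch sizes gone from the cached ggml_metal_pipeline, lookups now return them by value in ggml_metal_pipeline_with_params, and the setter/getter pairs above disappear. A rough C sketch of the shape of this pattern (field meanings follow the removed "suggested dispatch sizes" comment; the real struct is defined in the header):

    // sketch: a shared cached object plus per-lookup parameters, returned by value
    struct pipeline_with_params {
        struct pipeline * pipeline; // cached and shared, owned by the library
        int    nr0;                 // suggested dispatch size, dim 0
        int    nr1;                 // suggested dispatch size, dim 1
        int    nsg;                 // suggested number of simdgroups
        size_t smem;                // threadgroup memory size, in bytes
    };

    // callers read fields directly: res.nr0, res.nsg, res.smem, ...

Since the parameters are no longer mutable state on the shared pipeline object, two call sites that want different suggestions can no longer race through the setters.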
 struct ggml_metal_library {
 
@@ -146,6 +101,8 @@ struct ggml_metal_library {
     id<MTLDevice> device;
 
     ggml_metal_pipelines_t pipelines; // cache of compiled pipelines
 
+    NSLock * lock;
 };
 
 ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
 
@@ -299,6 +256,7 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) {
     res->obj = library;
     res->device = device;
     res->pipelines = ggml_metal_pipelines_init();
+    res->lock = [NSLock new];
 
     return res;
 }
 
@@ -365,6 +323,7 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
     res->obj = library;
     res->device = device;
     res->pipelines = ggml_metal_pipelines_init();
+    res->lock = [NSLock new];
 
     return res;
 }
 
@@ -380,25 +339,46 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
 
     ggml_metal_pipelines_free(lib->pipelines);
 
+    [lib->lock release];
+
     free(lib);
 }
 
-ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
-    return ggml_metal_pipelines_get(lib->pipelines, name);
-}
-
-ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
-    // note: the pipelines are cached in the library per device, so they are shared across all metal contexts
-    ggml_critical_section_start();
-
-    ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
-    if (res) {
-        ggml_critical_section_end();
-
-        return res;
-    }
-
-    res = ggml_metal_pipeline_init();
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
+    [lib->lock lock];
+
+    struct ggml_metal_pipeline_with_params res = {
+        /*.pipeline =*/ nil,
+        /*.nr0      =*/ 0,
+        /*.nr1      =*/ 0,
+        /*.nsg      =*/ 0,
+        /*.smem     =*/ 0,
+    };
+
+    res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
+
+    [lib->lock unlock];
+
+    return res;
+}
+
+struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
+    struct ggml_metal_pipeline_with_params res = {
+        /*.pipeline =*/ nil,
+        /*.nr0      =*/ 0,
+        /*.nr1      =*/ 0,
+        /*.nsg      =*/ 0,
+        /*.smem     =*/ 0,
+    };
+
+    [lib->lock lock];
+
+    res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
+    if (res.pipeline) {
+        [lib->lock unlock];
+
+        return res;
+    }
 
     @autoreleasepool {
         NSError * error = nil;
 
@@ -414,36 +394,53 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
             mtl_function = [lib->obj newFunctionWithName:base_func constantValues:cv->obj error:&error];
         }
         if (!mtl_function) {
-            ggml_critical_section_end();
+            [lib->lock unlock];
 
             GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name);
             if (error) {
                 GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
             }
 
-            return nil;
+            return res;
         }
 
-        res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
+        id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
 
         [mtl_function release];
 
-        GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
-                (int) res->obj.maxTotalThreadsPerThreadgroup,
-                (int) res->obj.threadExecutionWidth);
+        if (!obj) {
+            [lib->lock unlock];
+
+            GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
+            if (error) {
+                GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
+            }
+
+            return res;
+        }
+
+        GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
+                (void *) obj,
+                (int) obj.maxTotalThreadsPerThreadgroup,
+                (int) obj.threadExecutionWidth);
 
-        if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
-            ggml_critical_section_end();
+        if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
+            [obj release];
+
+            [lib->lock unlock];
 
             GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
 
-            return nil;
+            return res;
         }
 
-        ggml_metal_pipelines_add(lib->pipelines, name, res);
+        res.pipeline = ggml_metal_pipeline_init();
+        res.pipeline->obj = obj;
+
+        ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
     }
 
-    ggml_critical_section_end();
+    [lib->lock unlock];
 
     return res;
 }
 
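The rewritten lookup/compile pair swaps the process-global ggml_critical_section_start/end (and the ggml-threading.h include) for a per-library NSLock held across lookup, compile, and insert, so each named pipeline is compiled at most once. The same get-or-compile shape in self-contained portable C, as a sketch (the fixed-size table and compile() stand in for the pipelines map and the Metal calls):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    #define CACHE_MAX 64

    typedef struct {
        pthread_mutex_t lock;   // one lock per cache, not per process
        const char *    names[CACHE_MAX];
        void *          vals [CACHE_MAX];
        int             n;
    } cache_t;

    static void * compile(const char * name) {
        return strdup(name); // stand-in for the Metal compilation calls
    }

    void * cache_get_or_compile(cache_t * c, const char * name) {
        pthread_mutex_lock(&c->lock);

        for (int i = 0; i < c->n; ++i) {
            if (strcmp(c->names[i], name) == 0) {
                void * hit = c->vals[i];
                pthread_mutex_unlock(&c->lock);
                return hit; // cache hit: same object shared by all callers
            }
        }

        void * p = compile(name); // may fail and return NULL
        if (p != NULL && c->n < CACHE_MAX) {
            c->names[c->n] = strdup(name);
            c->vals [c->n] = p;
            c->n++;
        }

        pthread_mutex_unlock(&c->lock); // every exit path unlocks, like the [lib->lock unlock] calls above
        return p;
    }

The trade-off is that unrelated lookups serialize behind one lock while a kernel compiles; the gain over the old code is that the lock is per-library instead of process-wide.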
@@ -485,8 +482,8 @@ void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
     [encoder->obj popDebugGroup];
 }
 
-void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline) {
-    [encoder->obj setComputePipelineState:pipeline->obj];
+void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
+    [encoder->obj setComputePipelineState:pipeline.pipeline->obj];
 }
 
 void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
 
@@ -521,11 +518,106 @@ struct ggml_metal_device {
     // ref: https://github.com/ggml-org/llama.cpp/pull/15906
     id<MTLCommandQueue> mtl_queue;
 
+    ggml_metal_rsets_t rsets;
+
     ggml_metal_library_t library;
 
     struct ggml_metal_device_props props;
 };
 
+//
+// MTLResidenceSet wrapper
+//
+
+struct ggml_metal_rsets {
+    NSLock * lock;
+
+    NSMutableArray * data;
+
+    // number of seconds since the last graph computation
+    // keep the residency sets wired for that amount of time to avoid being collected by the OS
+    int keep_alive_s;
+
+    // background heartbeat thread to keep the residency sets alive
+    atomic_bool d_stop;
+    atomic_int  d_loop;
+
+    dispatch_group_t d_group;
+};
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void) {
+    ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
+
+    res->lock = [[NSLock alloc] init];
+    res->data = [[NSMutableArray alloc] init];
+
+    // by default keep the memory wired for 3 minutes
+    res->keep_alive_s = 3*60;
+
+    const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
+    if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
+        res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
+    }
+
+    if (res->keep_alive_s <= 0) {
+        res->keep_alive_s = 3*60;
+    }
+
+    GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
+
+    atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
+    atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
+
+    res->d_group = dispatch_group_create();
+
+    // start a background thread that periodically requests residency for all the currently active sets in the collection
+    // the requests stop after a certain amount of time (keep_alive_s) of inactivity
+    dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
+    dispatch_group_async(res->d_group, d_queue, ^{
+#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
+        if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
+            while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
+                if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
+                    [res->lock lock];
+
+                    for (int i = 0; i < (int) res->data.count; ++i) {
+                        [res->data[i] requestResidency];
+                    }
+
+                    atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
+
+                    [res->lock unlock];
+                }
+
+                // half a second
+                usleep(500 * 1000);
+            }
+        }
+#endif
+    });
+
+    return res;
+}
+
+void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
+    if (rsets == NULL) {
+        return;
+    }
+
+    // note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
+    GGML_ASSERT([rsets->data count] == 0);
+
+    atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
+
+    dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
+    dispatch_release(rsets->d_group);
+
+    [rsets->data release];
+    [rsets->lock release];
+
+    free(rsets);
+}
+
 ggml_metal_device_t ggml_metal_device_init(void) {
     ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
 
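The heartbeat in ggml_metal_rsets_init separates "keep memory wired" from graph execution: d_loop is armed to 2*keep_alive_s ticks, each iteration sleeps 500 ms, so residency requests keep flowing for roughly keep_alive_s seconds after the last arm and then stop until re-armed. The same stop-flag/countdown skeleton in portable C, as a sketch with nothing Metal-specific:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <unistd.h>

    typedef struct {
        atomic_bool stop; // set once at shutdown
        atomic_int  loop; // re-armed on activity; counts down to 0 while idle
    } heartbeat_t;

    static void * heartbeat_main(void * arg) {
        heartbeat_t * hb = (heartbeat_t *) arg;

        while (!atomic_load_explicit(&hb->stop, memory_order_relaxed)) {
            if (atomic_load_explicit(&hb->loop, memory_order_relaxed) > 0) {
                // periodic work goes here (in the diff: requestResidency on each set)
                atomic_fetch_sub_explicit(&hb->loop, 1, memory_order_relaxed);
            }
            usleep(500 * 1000); // one tick = half a second
        }
        return NULL;
    }

    // start:  pthread_create(&tid, NULL, heartbeat_main, &hb);
    // re-arm: atomic_store_explicit(&hb->loop, 2 * keep_alive_s, memory_order_relaxed);
    // stop:   atomic_store_explicit(&hb->stop, true, memory_order_relaxed); pthread_join(tid, NULL);

Relaxed ordering is sufficient because the counter is only a rate limiter; in the real code the NSLock taken inside the loop is what orders access to the shared array of sets.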
@@ -611,8 +703,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
         GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
         dev->props.has_tensor = false;
     } else {
-        ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
-        if (!ppl) {
+        struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+        if (!ppl.pipeline) {
             GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
             dev->props.has_tensor = false;
         }
 
@@ -661,8 +753,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
         GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
         dev->props.has_bfloat = false;
     } else {
-        ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
-        if (!ppl) {
+        struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
+        if (!ppl.pipeline) {
             GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
             dev->props.has_bfloat = false;
         }
 
@@ -694,7 +786,11 @@ ggml_metal_device_t ggml_metal_device_init(void) {
         GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
     }
 
-    // --------------------------------------------------
+    if (dev->props.use_residency_sets) {
+        dev->rsets = ggml_metal_rsets_init();
+    } else {
+        dev->rsets = nil;
+    }
 
     // print MTL GPU family:
     GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
 
@@ -747,6 +843,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 void ggml_metal_device_free(ggml_metal_device_t dev) {
     assert(dev != NULL);
 
+    ggml_metal_rsets_free(dev->rsets);
+
     ggml_metal_library_free(dev->library);
     dev->library = NULL;
 
@@ -775,6 +873,42 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
     return dev->library;
 }
 
+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+    if (rset == nil) {
+        return;
+    }
+
+    GGML_ASSERT(dev->rsets);
+
+    [dev->rsets->lock lock];
+
+    [dev->rsets->data addObject:rset];
+
+    [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+    if (rset == nil) {
+        return;
+    }
+
+    GGML_ASSERT(dev->rsets);
+
+    [dev->rsets->lock lock];
+
+    [dev->rsets->data removeObject:rset];
+
+    [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
+    if (dev->rsets == NULL) {
+        return;
+    }
+
+    atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
+}
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         *total = dev->mtl_device.recommendedMaxWorkingSetSize;
 
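Worth spelling out the units in ggml_metal_device_rsets_keep_alive: d_loop counts heartbeat ticks, not seconds. With the 500 ms sleep per tick, a small sketch of the conversion:

    // sketch: env-configured seconds -> heartbeat ticks (500 ms each)
    static int keep_alive_ticks(int keep_alive_s) {
        const int tick_ms = 500;                 // matches usleep(500 * 1000) in the loop
        return (keep_alive_s * 1000) / tick_ms;  // == 2 * keep_alive_s
    }

so GGML_METAL_RESIDENCY_KEEP_ALIVE_S is honored in wall-clock seconds even though the counter is stored in ticks.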
@@ -820,6 +954,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
             case GGML_UNARY_OP_HARDSWISH:
             case GGML_UNARY_OP_HARDSIGMOID:
             case GGML_UNARY_OP_EXP:
+            case GGML_UNARY_OP_SOFTPLUS:
+            case GGML_UNARY_OP_EXPM1:
                 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             default:
                 return false;
 
@@ -852,6 +988,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
         case GGML_OP_ACC:
         case GGML_OP_REPEAT:
         case GGML_OP_SCALE:
+        case GGML_OP_FILL:
         case GGML_OP_CONV_TRANSPOSE_1D:
             return true;
         case GGML_OP_CONV_TRANSPOSE_2D:
 
@@ -869,6 +1006,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
             return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SUM:
             return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
+        case GGML_OP_TRI:
+            return ggml_is_contiguous_rows(op->src[0]);
         case GGML_OP_SUM_ROWS:
         case GGML_OP_CUMSUM:
         case GGML_OP_MEAN:
 
@@ -898,6 +1037,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op) {
         case GGML_OP_POOL_2D:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_PAD:
+            // TODO: add circular padding support for metal, see https://github.com/ggml-org/llama.cpp/pull/16985
+            if (ggml_get_op_params_i32(op, 8) != 0) {
+                return false;
+            }
+
             return (ggml_get_op_params_i32(op, 0) == 0) && (ggml_get_op_params_i32(op, 2) == 0) &&
                    (ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
         case GGML_OP_PAD_REFLECT_1D:
 
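The GGML_OP_PAD gate now rejects two cases separately: circular mode (parameter index 8, per the linked PR) and any non-zero leading padding (the even indices 0/2/4/6). A condensed sketch of the same predicate, with the parameter layout assumed from the indices used here:

    #include <stdbool.h>
    #include <stdint.h>

    // assumed layout: [lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, mode]
    static bool metal_supports_pad(const int32_t params[9]) {
        if (params[8] != 0) {
            return false; // circular padding is not implemented on Metal yet
        }
        // only trailing ("after") padding is supported: leading pads must be zero
        return params[0] == 0 && params[2] == 0 && params[4] == 0 && params[6] == 0;
    }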
@@ -1063,9 +1207,8 @@ struct ggml_metal_buffer {
     // note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
     id rset;
 
-    // pointers to global device objects
-    id<MTLDevice> device;
-    id<MTLCommandQueue> queue;
+    // pointers to global device
+    ggml_metal_device_t dev;
 };
 
 static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
 
@@ -1108,7 +1251,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
     desc.initialCapacity = buf->n_buffers;
 
     NSError * error;
-    buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
+    buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
     if (error) {
         GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
         [desc release];
 
@@ -1169,6 +1312,8 @@ static void * ggml_metal_host_malloc(size_t n) {
 ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
     ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
 
+    res->dev = dev;
+
     const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
 
@@ -1193,9 +1338,6 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
 
     res->owned = true;
 
-    res->device = ggml_metal_device_get_obj(dev);
-    res->queue  = ggml_metal_device_get_queue(dev);
-
     res->n_buffers = 1;
 
     if (res->all_data != NULL) {
 
@@ -1204,12 +1346,12 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
 
     if (size_aligned > 0) {
         if (props_dev->use_shared_buffers && shared) {
-            res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
+            res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
                                                length:size_aligned
                                                options:MTLResourceStorageModeShared
                                                deallocator:nil];
         } else {
-            res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+            res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
         }
     }
 
@@ -1230,6 +1372,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
         return NULL;
     }
 
+    ggml_metal_device_rsets_add(dev, res->rset);
+
     //ggml_metal_log_allocated_size(device, size_aligned);
 
     return res;
 
@@ -1238,6 +1382,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
 ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
 
+    res->dev = dev;
+
     res->all_data = ptr;
     res->all_size = size;
 
@@ -1260,9 +1406,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    res->device = ggml_metal_device_get_obj(dev);
-    res->queue  = ggml_metal_device_get_queue(dev);
-
     const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
 
     // the buffer fits into the max buffer size allowed by the device
 
@@ -1272,7 +1415,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
         res->buffers[res->n_buffers].metal = nil;
 
         if (size_aligned > 0) {
-            res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
             if (res->buffers[res->n_buffers].metal == nil) {
                 GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
 
@@ -1281,7 +1424,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
             }
         }
 
-        ggml_metal_log_allocated_size(res->device, size_aligned);
+        ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
 
         ++res->n_buffers;
     } else {
 
@@ -1299,7 +1442,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
             res->buffers[res->n_buffers].metal = nil;
 
             if (size_step_aligned > 0) {
-                res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
                 if (res->buffers[res->n_buffers].metal == nil) {
                     GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
 
@@ -1308,7 +1451,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
                 }
             }
 
-            ggml_metal_log_allocated_size(res->device, size_step_aligned);
+            ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
 
             if (i + size_step < size) {
                 GGML_LOG_INFO("\n");
 
@@ -1326,10 +1469,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
         return NULL;
     }
 
+    ggml_metal_device_rsets_add(dev, res->rset);
+
     return res;
 }
 
 void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
+    ggml_metal_device_rsets_rm(buf->dev, buf->rset);
+
     for (int i = 0; i < buf->n_buffers; i++) {
         [buf->buffers[i].metal release];
     }
 
@@ -1366,8 +1513,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
     struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
     bid_dst.offs += offset;
 
-    id<MTLCommandQueue> queue = buf->queue;
-    id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
     {
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
@@ -1393,7 +1539,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
     @autoreleasepool {
         // src
         void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
-        id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
+        id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
                                                length:size
                                                options:MTLResourceStorageModeShared
                                                deallocator:nil];
 
@@ -1408,8 +1554,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
         // this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
         dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
 
-        id<MTLCommandQueue> queue = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
@@ -1451,15 +1596,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
     bid_src.offs += offset;
 
     // dst
-    id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
+    id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
                                            length:size
                                            options:MTLResourceStorageModeShared
                                            deallocator:nil];
 
     GGML_ASSERT(buf_dst);
 
-    id<MTLCommandQueue> queue = buf->queue;
-    id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+    id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
     {
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
@@ -1485,8 +1629,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
     }
 
     @autoreleasepool {
-        id<MTLCommandQueue> queue = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
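Across these buffer hunks the cached id<MTLDevice>/id<MTLCommandQueue> handles are replaced by a single back-pointer to the owning ggml_metal_device_t, reached as buf->dev->mtl_device and buf->dev->mtl_queue at the use sites; that same back-pointer is what lets ggml_metal_buffer_free deregister its residency set. The ownership shape as a plain C sketch (names are illustrative):

    // sketch: one non-owning back-pointer instead of two cached handles
    typedef struct device device_t; // owns queue, library, and the rsets collection

    typedef struct buffer {
        device_t * dev;  // must outlive the buffer
        void *     rset; // registered with dev on init, removed on free
    } buffer_t;

    // init: buf->dev = dev; ...; device_rsets_add(dev, buf->rset);
    // free: device_rsets_rm(buf->dev, buf->rset); then release the Metal objects

One pointer also means a future change to how the device stores its queue does not have to touch every buffer call site.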
@@ -182,6 +182,10 @@ typedef struct {
     float bias;
 } ggml_metal_kargs_scale;
 
+typedef struct {
+    float val;
+} ggml_metal_kargs_fill;
+
 typedef struct {
     float min;
     float max;
 
@@ -831,6 +835,25 @@ typedef struct {
     float slope;
 } ggml_metal_kargs_leaky_relu;
 
+typedef struct {
+    int32_t  ne00;
+    int32_t  ne01;
+    int32_t  ne02;
+    int32_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    int32_t  ne0;
+    int32_t  ne1;
+    int32_t  ne2;
+    int32_t  ne3;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+} ggml_metal_kargs_tri;
+
 typedef struct {
     int32_t ne00;
     int32_t ne01;
 
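These kernel-argument structs are copied to the GPU byte-for-byte via ggml_metal_encoder_set_bytes, so the host layout must match the struct declared in the Metal shader. A sketch of a compile-time guard for that, using the new ggml_metal_kargs_fill as the example (the assert is illustrative, not part of the diff):

    #include <stdint.h>

    typedef struct {
        float val;
    } ggml_metal_kargs_fill;

    // the shader-side struct must be a single float as well; padding drift would desync them
    _Static_assert(sizeof(ggml_metal_kargs_fill) == 4, "kargs_fill layout mismatch");

The field order in ggml_metal_kargs_tri follows the same logic: each run of four int32_t extents (16 bytes) before the uint64_t strides keeps the 8-byte members naturally aligned with no implicit padding.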
@@ -286,6 +286,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_scale(ctx, idx);
             } break;
+        case GGML_OP_FILL:
+            {
+                n_fuse = ggml_metal_op_fill(ctx, idx);
+            } break;
         case GGML_OP_CLAMP:
             {
                 n_fuse = ggml_metal_op_clamp(ctx, idx);
 
@@ -414,6 +418,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
             } break;
+        case GGML_OP_TRI:
+            {
+                n_fuse = ggml_metal_op_tri(ctx, idx);
+            } break;
         case GGML_OP_FLASH_ATTN_EXT:
             {
                 n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
 
@@ -524,7 +532,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
         /*.dim =*/ dim,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
+    auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
@@ -550,7 +558,7 @@ int ggml_metal_op_repeat(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
     GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
+    auto pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
 
     ggml_metal_kargs_repeat args = {
         /*.ne00 =*/ ne00,
 
@@ -616,7 +624,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
     // TODO: make a simpler cpy_bytes kernel
 
     //const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+    auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
 
     ggml_metal_kargs_cpy args = {
         /*.nk0 =*/ ne00,
 
@@ -679,7 +687,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
         /*.o1 =*/ { 0 },
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
+    auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
@@ -721,7 +729,42 @@ int ggml_metal_op_scale(ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op), 2);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_fill(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS( int32_t, ne,  op,         ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    const float val = ggml_get_op_params_f32(op, 0);
+
+    ggml_metal_kargs_fill args = {
+        /*.val =*/ val
+    };
+
+    int64_t n = ggml_nelements(op);
+
+    if (n % 4 == 0) {
+        n /= 4;
+    }
+
+    auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
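Both ggml_metal_op_scale and the new ggml_metal_op_fill shrink the dispatch by 4x whenever the element count allows, on the assumption that ggml_metal_library_get_pipeline_unary selects a float4 kernel variant exactly in that case. The host-side arithmetic as a sketch:

    #include <stdint.h>

    // threads to launch for n elements, assuming a float4 kernel is picked when n % 4 == 0
    static int64_t unary_dispatch_size(int64_t n) {
        if (n % 4 == 0) {
            return n / 4; // each thread handles one float4
        }
        return n;         // scalar fallback: one element per thread
    }

The check is on the total element count rather than the row length, which is consistent with how these element-wise kernels index their data as a flat array.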
@@ -760,7 +803,7 @@ int ggml_metal_op_clamp(ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
@@ -789,7 +832,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
         n /= 4;
     }
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
 
@@ -817,7 +860,7 @@ int ggml_metal_op_glu(ggml_metal_op_t ctx, int idx) {
         GGML_ASSERT(ggml_are_same_shape(op->src[0], op->src[1]));
     }
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
 
     const int32_t swp = ggml_get_op_params_i32(op, 1);
     const float alpha = ggml_get_op_params_f32(op, 2);
 
@@ -870,7 +913,7 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
         /*.np =*/ n,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
 
     int nth = 32; // SIMD width
 
@@ -925,7 +968,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
         /*.nb3 =*/ nb3,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
 
     int nth = 32; // SIMD width
 
@@ -936,7 +979,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
     nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
     nth = std::min(nth, ne00);
 
-    const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
@@ -963,7 +1006,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
     GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
 
-    ggml_metal_pipeline_t pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
+    auto pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
 
     int nth = 1;
     while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
 
@@ -1060,7 +1103,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
     ggml_metal_op_concurrency_reset(ctx);
 
     {
-        ggml_metal_pipeline_t pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
+        auto pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
 
         ggml_metal_kargs_cumsum_add args = {
             /*.ne00 =*/ ne00,
 
@@ -1106,7 +1149,7 @@ int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
     GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
+    auto pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
 
     ggml_metal_kargs_get_rows args = {
         /*.ne00t =*/ ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
 
@@ -1151,7 +1194,7 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
     GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
+    auto pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
 
     const int32_t nk0 = ne0/ggml_blck_size(op->type);
 
@@ -1252,7 +1295,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
         /*.n_head_log2 =*/ n_head_log2,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
 
     int nth = 32; // SIMD width
 
@@ -1266,7 +1309,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
         }
     }
 
-    const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
 
@@ -1322,7 +1365,7 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
         /*.nb2 =*/ nb2,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
 
@@ -1409,11 +1452,11 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
         /*.nb0 =*/ nb0,
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
 
     GGML_ASSERT(d_state <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
 
-    const size_t sms = ggml_metal_pipeline_get_smem(pipeline);
+    const size_t smem = pipeline.smem;
 
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
 
@@ -1426,7 +1469,7 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[6]), 7);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op), 8);
 
-    ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
 
     ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
 
@@ -1449,7 +1492,7 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
     const int64_t C = op->ne[0];
     const int64_t H = op->src[0]->ne[1];
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
+    auto pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
 
     int ida = 0;
 
@@ -1485,7 +1528,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
     GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
     GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
+    auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
 
     GGML_ASSERT(ne00 % ggml_blck_size(op->src[0]->type) == 0);
 
@@ -1592,7 +1635,7 @@ int ggml_metal_op_pool_2d(ggml_metal_op_t ctx, int idx) {
         /* .np = */ np
     };
 
-    ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
+    auto pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
 
     const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
     const int ntg = (np + nth - 1) / nth;
 
@@ -1701,7 +1744,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
                 GGML_ABORT("unsupported ne11");
         };
 
-        ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
+        auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
 
         ggml_metal_kargs_mul_mv_ext args = {
             /*.ne00 =*/ ne00,
 
@@ -1748,7 +1791,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
        // default: break;
        //}
 
-        ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
+        auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
 
         ggml_metal_kargs_mul_mm args = {
             /*.ne00 =*/ ne00,
 
@@ -1773,18 +1816,18 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
         ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
         ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op), 3);
 
-        const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+        const size_t smem = pipeline.smem;
 
         ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
         ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
     } else {
-        ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
+        auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
 
-        const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
-        const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
-        const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
+        const int nr0 = pipeline.nr0;
+        const int nr1 = pipeline.nr1;
+        const int nsg = pipeline.nsg;
 
-        const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+        const size_t smem = pipeline.smem;
 
         ggml_metal_kargs_mul_mv args = {
             /*.ne00 =*/ ne00,
 
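With nr0, nr1, and nsg read straight off the returned struct, the dispatch math that consumes them is visible at the call site instead of behind getters. A sketch of the usual grid computation, assuming a threadgroup covers nr0 rows of dim 0 and nr1 rows of dim 1 (which is what the ceil-divides in this file express):

    #include <stddef.h>

    typedef struct {
        int    nr0;  // rows of dim 0 covered per threadgroup
        int    nr1;  // rows of dim 1 covered per threadgroup
        int    nsg;  // simdgroups per threadgroup -> nsg*32 threads
        size_t smem; // threadgroup memory to bind before dispatching
    } pipeline_params;

    static int ceil_div(int a, int b) {
        return (a + b - 1) / b;
    }

    // threadgroups for an ne01 x ne11 output, per the pattern used in this file
    static void grid_for(const pipeline_params * p, int ne01, int ne11, int * tg0, int * tg1) {
        *tg0 = ceil_div(ne01, p->nr0);
        *tg1 = ceil_div(ne11, p->nr1);
    }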
@ -1915,9 +1958,9 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
|
||||||
nb21,
|
nb21,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
|
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
|
||||||
|
|
||||||
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
|
const size_t smem = pipeline.smem;
|
||||||
|
|
||||||
GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||||
|
|
||||||
|
|
@@ -1938,7 +1981,7 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
 ggml_metal_op_concurrency_reset(ctx);

 {
-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);

 ggml_metal_kargs_mul_mm_id args = {
 /*.ne00 =*/ ne00,
@@ -1967,20 +2010,20 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
 ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
 ggml_metal_encoder_set_buffer (enc, bid_dst, 5);

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);

 ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
 }
 } else {
-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);

-const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
-const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
-const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
+const int nr0 = pipeline.nr0;
+const int nr1 = pipeline.nr1;
+const int nsg = pipeline.nsg;

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 ggml_metal_kargs_mul_mv_id args = {
 /*.nei0 =*/ ne20,
@@ -2064,7 +2107,7 @@ int ggml_metal_op_add_id(ggml_metal_op_t ctx, int idx) {
 /*.nb21 =*/ nb21,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
+auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -2308,7 +2351,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 /*.nb33 =*/nb33,
 };

-ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);

 ggml_metal_encoder_set_pipeline(enc, pipeline0);
 ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2339,7 +2382,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 /*.nb33 =*/ nb33,
 };

-ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
+auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);

 ggml_metal_encoder_set_pipeline(enc, pipeline0);
 ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2424,7 +2467,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 /*.logit_softcap =*/ logit_softcap,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
+auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -2476,7 +2519,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 /*.nb33 =*/nb33,
 };

-ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
+auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);

 ggml_metal_encoder_set_pipeline(enc, pipeline0);
 ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2578,7 +2621,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 /*.logit_softcap =*/ logit_softcap,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
+auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);

 GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));

@@ -2630,7 +2673,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 nrows,
 };

-ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
+auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);

 ggml_metal_encoder_set_pipeline(enc, pipeline0);
 ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2762,7 +2805,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
 // the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
 bid_src1.offs = 0;

-ggml_metal_pipeline_t pipeline = nullptr;
+struct ggml_metal_pipeline_with_params pipeline;

 if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
 GGML_ASSERT(ggml_is_contiguous(op->src[0]));
@@ -2835,7 +2878,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
 /*.eps =*/ eps,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);

 while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
 nth *= 2;
@@ -2844,7 +2887,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
 nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
 nth = std::min(nth, ne00/4);

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 const int64_t nrows = ggml_nrows(op->src[0]);

@@ -2887,7 +2930,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
 /*.eps =*/ eps,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);

 int nth = 32; // SIMD width
 //while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
@@ -2897,7 +2940,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
 //nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
 //nth = std::min(nth, ne00/4);

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3022,7 +3065,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
 }
 }

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
+auto pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);

 int nth = 32; // SIMD width

@@ -3033,7 +3076,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
 nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
 nth = std::min(nth, args.ne00_t);

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
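Note: the `nth` sizing in the norm ops above (and in several ops below) grows the threadgroup width in powers of two until it covers the row or hits the pipeline limit. A minimal stand-alone C++ sketch of the same logic, with `max_tg` standing in for `ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)` and hypothetical input sizes:

    #include <algorithm>
    #include <cstdio>

    // Sketch of the threadgroup sizing used by the norm ops above.
    static int pick_nth(int ne00, int max_tg) {
        int nth = 32; // start at SIMD width
        while (nth < ne00/4 && nth < max_tg) {
            nth *= 2; // grow in powers of two
        }
        nth = std::min(nth, max_tg);
        nth = std::min(nth, ne00/4); // never exceed the work available per row
        return nth;
    }

    int main() {
        printf("%d\n", pick_nth(4096, 1024)); // -> 1024
        printf("%d\n", pick_nth(  96, 1024)); // -> 24
    }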
@@ -3127,7 +3170,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
 /* src2 =*/ op->src[2] != nullptr,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3199,7 +3242,7 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
 /*.KHW =*/ KH * KW,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);

 GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));

@@ -3270,7 +3313,7 @@ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
 /*.d1 =*/ d1,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);

 int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
 nth = std::min(nth, 256);
@@ -3325,7 +3368,7 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
 /*.nb1 =*/ nb1,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3377,7 +3420,7 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
 /*.nb2 =*/ nb2,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3433,7 +3476,7 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
 /*.sf3 =*/ sf3
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);

 const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);

@@ -3477,7 +3520,7 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
 /*.nb3 =*/ nb3
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);

 const int nth = std::min(1024, ne0);

@@ -3523,7 +3566,7 @@ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
 /*.p1 =*/ ((const int32_t *)(op->op_params))[1]
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);

 const int nth = std::min(1024, ne0);

@@ -3560,7 +3603,7 @@ int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {

 const int nth = std::min(1024, ne0);

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_arange(lib, op);

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3591,7 +3634,7 @@ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) {
 /*.max_period =*/ max_period,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);

 const int nth = std::max(1, std::min(1024, dim/2));

@@ -3621,7 +3664,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
 /*.nb01 = */ nb01,
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);

 const int64_t nrows = ggml_nrows(op->src[0]);

@@ -3630,7 +3673,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
 nth *= 2;
 }

-const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
+const size_t smem = pipeline.smem;

 ggml_metal_encoder_set_pipeline(enc, pipeline);
 ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3657,7 +3700,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
 GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
 GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);

 // bitonic sort requires the number of elements to be power of 2
 int nth = 1;
@@ -3706,7 +3749,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {

 ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);

-ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
+auto pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);

 int len = nth;

@@ -3764,7 +3807,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
 GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
 GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);

 // bitonic sort requires the number of elements to be power of 2
 int nth = 1;
@@ -3818,7 +3861,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {

 ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);

-ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);
+auto pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);

 int len = args.top_k;

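Note: argsort and top_k pad their per-threadgroup work to a power of two because a bitonic sorting network is only defined for power-of-two sequence lengths; the `_merge` pipelines then combine the sorted chunks. A small C++ sketch of the rounding step, independent of the Metal code:

    #include <cstdio>

    // Bitonic sort operates on power-of-two sequences, so round the
    // chunk length up to the next power of two (as the argsort/top_k
    // ops above do when choosing nth).
    static int next_pow2(int n) {
        int p = 1;
        while (p < n) {
            p *= 2;
        }
        return p;
    }

    int main() {
        printf("%d\n", next_pow2(1000)); // -> 1024
        printf("%d\n", next_pow2(1024)); // -> 1024
    }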
@@ -3881,7 +3924,7 @@ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
 /*.slope =*/ slope
 };

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);

 int64_t n = ggml_nelements(op);

@@ -3899,6 +3942,57 @@ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
 return 1;
 }

+int ggml_metal_op_tri(ggml_metal_op_t ctx, int idx) {
+ggml_tensor * op = ctx->node(idx);
+
+ggml_metal_library_t lib = ctx->lib;
+ggml_metal_encoder_t enc = ctx->enc;
+
+GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
+GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
+GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+
+ggml_metal_kargs_tri args = {
+/*.ne00 =*/ ne00,
+/*.ne01 =*/ ne01,
+/*.ne02 =*/ ne02,
+/*.ne03 =*/ ne03,
+/*.nb00 =*/ nb00,
+/*.nb01 =*/ nb01,
+/*.nb02 =*/ nb02,
+/*.nb03 =*/ nb03,
+/*.ne0 =*/ ne0,
+/*.ne1 =*/ ne1,
+/*.ne2 =*/ ne2,
+/*.ne3 =*/ ne3,
+/*.nb0 =*/ nb0,
+/*.nb1 =*/ nb1,
+/*.nb2 =*/ nb2,
+/*.nb3 =*/ nb3,
+};
+
+auto pipeline = ggml_metal_library_get_pipeline_tri(lib, op);
+
+int nth = 32; // SIMD width
+
+while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+nth *= 2;
+}
+
+nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+nth = std::min(nth, ne00);
+
+ggml_metal_encoder_set_pipeline(enc, pipeline);
+ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+
+ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+return 1;
+}
+
 int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
 ggml_tensor * op = ctx->node(idx);

@@ -3910,7 +4004,7 @@ int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
 GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
 GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);

 const int64_t np = ggml_nelements(op->src[0]);
 ggml_metal_kargs_opt_step_adamw args = {
@@ -3946,7 +4040,7 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) {
 GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
 GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);

-ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
+auto pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);

 const int64_t np = ggml_nelements(op->src[0]);
 ggml_metal_kargs_opt_step_sgd args = {

@@ -47,6 +47,7 @@ int ggml_metal_op_concat (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_repeat (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_acc (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_scale (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_fill (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_clamp (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_unary (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_glu (ggml_metal_op_t ctx, int idx);
@@ -83,6 +84,7 @@ int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_argsort (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_top_k (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx);
@@ -1249,6 +1249,22 @@ kernel void kernel_scale_f32_4(
 dst[tpig] = src0[tpig] * args.scale + args.bias;
 }

+kernel void kernel_fill_f32(
+constant ggml_metal_kargs_fill & args,
+device const float * src0,
+device float * dst,
+uint tpig[[thread_position_in_grid]]) {
+dst[tpig] = args.val;
+}
+
+kernel void kernel_fill_f32_4(
+constant ggml_metal_kargs_fill & args,
+device const float4 * src0,
+device float4 * dst,
+uint tpig[[thread_position_in_grid]]) {
+dst[tpig] = args.val;
+}
+
 kernel void kernel_clamp_f32(
 constant ggml_metal_kargs_clamp & args,
 device const float * src0,
@@ -1595,6 +1611,36 @@ kernel void kernel_exp_f32_4(
 dst[tpig] = exp(src0[tpig]);
 }

+kernel void kernel_softplus_f32(
+device const float * src0,
+device float * dst,
+uint tpig[[thread_position_in_grid]]) {
+device const float & x = src0[tpig];
+dst[tpig] = select(log(1.0f + exp(x)), x, x > 20.0f);
+}
+
+kernel void kernel_softplus_f32_4(
+device const float4 * src0,
+device float4 * dst,
+uint tpig[[thread_position_in_grid]]) {
+device const float4 & x = src0[tpig];
+dst[tpig] = select(log(1.0f + exp(x)), x, x > 20.0f);
+}
+
+kernel void kernel_expm1_f32(
+device const float * src0,
+device float * dst,
+uint tpig[[thread_position_in_grid]]) {
+dst[tpig] = exp(src0[tpig]) - 1.0f;
+}
+
+kernel void kernel_expm1_f32_4(
+device const float4 * src0,
+device float4 * dst,
+uint tpig[[thread_position_in_grid]]) {
+dst[tpig] = exp(src0[tpig]) - 1.0f;
+}
+
 kernel void kernel_reglu_f32(
 constant ggml_metal_kargs_glu & args,
 device const char * src0,
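Note: the softplus kernels pass large inputs through unchanged because exp(x) overflows float long before activations stop growing, while for x > 20 the value log(1 + exp(x)) already equals x to float precision. A scalar C++ sketch of the same guard:

    #include <cmath>
    #include <cstdio>

    // Same numeric guard as kernel_softplus_f32: for large x,
    // log(1 + exp(x)) == x to float precision and exp(x) would
    // overflow, so return x directly.
    static float softplus(float x) {
        return x > 20.0f ? x : std::log(1.0f + std::exp(x));
    }

    int main() {
        printf("%f\n", softplus(0.0f));  // -> 0.693147 (log 2)
        printf("%f\n", softplus(50.0f)); // -> 50.0, no overflow
    }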
@@ -1943,6 +1989,75 @@ typedef decltype(kernel_cumsum_add<float>) kernel_cumsum_add_t;

 template [[host_name("kernel_cumsum_add_f32")]] kernel kernel_cumsum_add_t kernel_cumsum_add<float>;

+template<uint32_t ttype>
+bool _ggml_vec_tri_cmp(const int i, const int r);
+
+template<>
+bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER */ 3>(const int i, const int r) {
+return i < r;
+}
+
+template<>
+bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER_DIAG */ 2>(const int i, const int r) {
+return i <= r;
+}
+
+template<>
+bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER */ 1>(const int i, const int r) {
+return i > r;
+}
+
+template<>
+bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER_DIAG */ 0>(const int i, const int r) {
+return i >= r;
+}
+
+template<typename T, int ttype>
+kernel void kernel_tri(
+constant ggml_metal_kargs_tri & args,
+device const char * src0,
+device const char * dst,
+uint3 tgpig[[threadgroup_position_in_grid]],
+ushort3 tpitg[[thread_position_in_threadgroup]],
+ushort3 ntg[[threads_per_threadgroup]]) {
+const int i3 = tgpig.z;
+const int i2 = tgpig.y;
+const int i1 = tgpig.x;
+
+if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
+return;
+}
+
+device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
+device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);
+
+// Each thread is a single element of the row if ne00 < max threads per
+// threadgroup, so this will loop once for each index that this thread is
+// responsible for
+for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
+// Use the comparison as a mask for branchless
+dst_row[i0] = static_cast<T>(_ggml_vec_tri_cmp<ttype>(i0, i1)) * src_row[i0];
+}
+}
+
+typedef decltype(kernel_tri<float, 0>) kernel_tri_t;
+
+template [[host_name("kernel_tri_f32_0")]] kernel kernel_tri_t kernel_tri<float, 0>;
+template [[host_name("kernel_tri_f32_1")]] kernel kernel_tri_t kernel_tri<float, 1>;
+template [[host_name("kernel_tri_f32_2")]] kernel kernel_tri_t kernel_tri<float, 2>;
+template [[host_name("kernel_tri_f32_3")]] kernel kernel_tri_t kernel_tri<float, 3>;
+template [[host_name("kernel_tri_f16_0")]] kernel kernel_tri_t kernel_tri<half, 0>;
+template [[host_name("kernel_tri_f16_1")]] kernel kernel_tri_t kernel_tri<half, 1>;
+template [[host_name("kernel_tri_f16_2")]] kernel kernel_tri_t kernel_tri<half, 2>;
+template [[host_name("kernel_tri_f16_3")]] kernel kernel_tri_t kernel_tri<half, 3>;
+#if defined(GGML_METAL_HAS_BF16)
+template [[host_name("kernel_tri_bf16_0")]] kernel kernel_tri_t kernel_tri<bfloat, 0>;
+template [[host_name("kernel_tri_bf16_1")]] kernel kernel_tri_t kernel_tri<bfloat, 1>;
+template [[host_name("kernel_tri_bf16_2")]] kernel kernel_tri_t kernel_tri<bfloat, 2>;
+template [[host_name("kernel_tri_bf16_3")]] kernel kernel_tri_t kernel_tri<bfloat, 3>;
+#endif
+
 template<typename T>
 kernel void kernel_soft_max(
 constant ggml_metal_kargs_soft_max & args,
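Note: kernel_tri avoids a per-element branch by converting the row/column comparison to 0 or 1 and multiplying, so every lane in the SIMD group executes the same instructions. A CPU-side C++ sketch of the same masking, using the ttype encoding of the specializations above (0=upper+diag, 1=upper, 2=lower+diag, 3=lower):

    #include <cstdio>

    // Branchless triangular masking, mirroring _ggml_vec_tri_cmp /
    // kernel_tri: keep element (i0, i1) iff the comparison holds,
    // multiplying by the 0/1 result instead of branching.
    static bool tri_cmp(int ttype, int i, int r) {
        switch (ttype) {
            case 3:  return i <  r; // lower
            case 2:  return i <= r; // lower + diagonal
            case 1:  return i >  r; // upper
            default: return i >= r; // upper + diagonal (ttype 0)
        }
    }

    static void tri_row(int ttype, int i1, const float * src, float * dst, int ne00) {
        for (int i0 = 0; i0 < ne00; ++i0) {
            dst[i0] = float(tri_cmp(ttype, i0, i1)) * src[i0];
        }
    }

    int main() {
        const float src[4] = {1, 2, 3, 4};
        float dst[4];
        tri_row(/*lower+diag*/ 2, /*row*/ 1, src, dst, 4);
        printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // -> 1 2 0 0
    }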
@@ -3083,6 +3083,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
 case GGML_OP_REPEAT:
 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
 case GGML_OP_PAD:
+// TODO: add circular padding support for opencl, see https://github.com/ggml-org/llama.cpp/pull/16985
+if (ggml_get_op_params_i32(op, 8) != 0) {
+return false;
+}
 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
 case GGML_OP_UPSCALE: {
 ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
@@ -128,6 +128,7 @@ struct rpc_msg_device_count_rsp {
 struct rpc_msg_get_alloc_size_req {
 uint32_t device;
 rpc_tensor tensor;
+rpc_tensor srcs[GGML_MAX_SRC];
 };

 struct rpc_msg_get_alloc_size_rsp {
@@ -572,6 +573,11 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {

 static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
 rpc_tensor result;
+if (!tensor) {
+memset(&result, 0, sizeof(result));
+return result;
+}
+
 result.id = reinterpret_cast<uint64_t>(tensor);
 result.type = tensor->type;
 if (tensor->buffer) {
@@ -753,23 +759,41 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
 }

 static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+// should we query the remote server for the actual size
+bool rpc_get = false;
+
 // See comments in init_tensor.
-if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
+rpc_get |= ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr);
+
+// ops that require additional memory for fleeting data on certain backends
+// ref: https://github.com/ggml-org/llama.cpp/pull/15966
+rpc_get |= tensor->op == GGML_OP_FLASH_ATTN_EXT;
+rpc_get |= tensor->op == GGML_OP_MUL_MAT_ID;
+
+if (rpc_get) {
 ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
 auto sock = get_socket(buft_ctx->endpoint);

-rpc_msg_get_alloc_size_req request;
-request.device = buft_ctx->device;
-request.tensor = serialize_tensor(tensor);
+rpc_msg_get_alloc_size_req request = {
+/*.device =*/ buft_ctx->device,
+/*.tensor =*/ serialize_tensor(tensor),
+/*.srcs =*/ {},
+};

+// .get_alloc_size could be a function of the tensor's srcs, so we must serialize them as well
+for (int i = 0; i < GGML_MAX_SRC; i++) {
+request.srcs[i] = serialize_tensor(tensor->src[i]);
+}
+
+// TODO: cache the alloc responses to avoid extra RPC calls?
 rpc_msg_get_alloc_size_rsp response;
 bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
 RPC_STATUS_ASSERT(status);

 return response.alloc_size;
-} else {
-return ggml_nbytes(tensor);
 }
+
+return ggml_nbytes(tensor);
 }

 static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {

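Note: the client now ships all of `tensor->src[]` with the request because for ops like FLASH_ATTN_EXT and MUL_MAT_ID the backend's get_alloc_size can depend on the source tensors, not just the destination. The wire convention is simple: a zeroed rpc_tensor (id == 0) marks an absent src, which is what the new null check in serialize_tensor produces and what the server tests before deserializing. A hedged C++ sketch of that convention, with a simplified stand-in struct rather than the real rpc_tensor:

    #include <cstdint>
    #include <cstring>

    // Simplified stand-in for rpc_tensor, enough to show the
    // convention: id == 0 means "no tensor in this slot".
    struct wire_tensor {
        uint64_t id;
        // ... remaining serialized fields elided ...
    };

    static wire_tensor serialize(const void * t) {
        wire_tensor w;
        if (!t) {
            memset(&w, 0, sizeof(w)); // absent src -> all zeros, id == 0
            return w;
        }
        w.id = reinterpret_cast<uint64_t>(t); // non-null -> non-zero id
        return w;
    }

    // Server side: only deserialize slots whose id is non-zero,
    // mirroring the `request.srcs[i].id != 0` check in get_alloc_size.
    static bool present(const wire_tensor & w) {
        return w.id != 0;
    }

    int main() {
        int x = 0;
        wire_tensor a = serialize(&x);
        wire_tensor b = serialize(nullptr);
        return (present(a) && !present(b)) ? 0 : 1;
    }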
@@ -1017,7 +1041,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
 }
 ggml_backend_buffer_type_t buft;
 struct ggml_init_params params {
-/*.mem_size =*/ ggml_tensor_overhead(),
+/*.mem_size =*/ ggml_tensor_overhead()*(1 + GGML_MAX_SRC),
 /*.mem_buffer =*/ NULL,
 /*.no_alloc =*/ true,
 };
@@ -1025,12 +1049,18 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
 ggml_context_ptr ctx_ptr { ggml_init(params) };
 GGML_ASSERT(ctx_ptr != nullptr);
 ggml_context * ctx = ctx_ptr.get();
-ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);

+ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
 if (tensor == nullptr) {
 GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
 return false;
 }
+for (int i = 0; i < GGML_MAX_SRC; i++) {
+if (request.srcs[i].id != 0) {
+tensor->src[i] = deserialize_tensor(ctx, &request.srcs[i]);
+}
+}
+
 LOG_DBG("[%s] device: %d, buffer: %p, data: %p\n", __func__, dev_id, (void*)tensor->buffer, tensor->data);
 if (tensor->buffer == nullptr) {
 //No buffer allocated.
@@ -1227,7 +1257,8 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
 char hash_str[17];
 snprintf(hash_str, sizeof(hash_str), "%016" PRIx64, hash);
 fs::path cache_file = fs::path(cache_dir) / hash_str;
-if (!fs::exists(cache_file)) {
+std::error_code ec;
+if (!fs::exists(cache_file, ec)) {
 return false;
 }
 std::ifstream ifs(cache_file, std::ios::binary);
@@ -2,6 +2,13 @@
 #include "dequantize.hpp"
 #include "presets.hpp"

+#if defined(__INTEL_LLVM_COMPILER)
+#if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
+#include <sycl/ext/oneapi/bfloat16.hpp>
+#define GGML_SYCL_HAS_BF16
+#endif
+#endif
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
 const sycl::nd_item<3> &item_ct1) {
@@ -566,6 +573,10 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
 return dequantize_row_iq4_nl_sycl;
 case GGML_TYPE_F32:
 return convert_unary_sycl<float>;
+#ifdef GGML_SYCL_HAS_BF16
+case GGML_TYPE_BF16:
+return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
+#endif
 default:
 return nullptr;
 }
@@ -627,6 +638,10 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
 return dequantize_row_iq4_nl_sycl;
 case GGML_TYPE_F16:
 return convert_unary_sycl<sycl::half>;
+#ifdef GGML_SYCL_HAS_BF16
+case GGML_TYPE_BF16:
+return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
+#endif
 default:
 return nullptr;
 }
@@ -636,6 +651,10 @@ to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
 switch (type) {
 case GGML_TYPE_F32:
 return convert_unary_nc_sycl<float>;
+#ifdef GGML_SYCL_HAS_BF16
+case GGML_TYPE_BF16:
+return convert_unary_nc_sycl<sycl::ext::oneapi::bfloat16>;
+#endif
 default:
 return nullptr;
 }
@@ -4613,6 +4613,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
 case GGML_OP_ACC:
 return true;
 case GGML_OP_PAD:
+// TODO: add circular padding support for sycl, see https://github.com/ggml-org/llama.cpp/pull/16985
+if (ggml_get_op_params_i32(op, 8) != 0) {
+return false;
+}
 return ggml_is_contiguous(op->src[0]);
 case GGML_OP_LEAKY_RELU:
 case GGML_OP_TIMESTEP_EMBEDDING:

(One file's diff was suppressed because it is too large.)
@@ -32,22 +32,12 @@ layout(push_constant) uniform parameter {
 uint32_t Cin;
 uint32_t N;

-// Tensor spatial sizes: kernel, input, output
-uint32_t KW;
-uint32_t KH;
+// Tensor spatial sizes: input, output
 uint32_t W;
 uint32_t H;
 uint32_t OW;
 uint32_t OH;

-// Parameters: stride, padding, dilation - 0=y, 1=x
-uint32_t s0;
-uint32_t s1;
-uint32_t p0;
-uint32_t p1;
-uint32_t d0;
-uint32_t d1;
-
 // Strides in elements
 uint32_t nb01;
 uint32_t nb02;
@@ -77,13 +67,14 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
 layout(constant_id = 4) const uint TS_K = 8;
 layout(constant_id = 5) const uint use_collectives = 1;
 layout(constant_id = 6) const uint SHMEM_PAD = 4;
+// Stride, padding, dilation
 layout(constant_id = 7) const uint s0 = 1;
 layout(constant_id = 8) const uint s1 = 1;
 layout(constant_id = 9) const uint p0 = 0;
 layout(constant_id = 10) const uint p1 = 0;
 layout(constant_id = 11) const uint d0 = 1;
 layout(constant_id = 12) const uint d1 = 1;
+// Kernel spatial sizes
 layout(constant_id = 13) const uint KW = 1;
 layout(constant_id = 14) const uint KH = 1;
@@ -138,7 +129,7 @@ P,Q=OH,OW
 */

 uint32_t B_idx_K = gl_WorkGroupID.x;
-uint32_t B_idx_NPQ = gl_WorkGroupID.y;
+uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512;

 uint32_t T_y = tid / NT_NPQ;
 uint32_t T_x = tid % NT_NPQ;
@@ -178,6 +169,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
 #endif

 void main() {
+if (B_idx_NPQ * BS_NPQ >= NPQ) {
+return;
+}
+
 #ifdef COOPMAT2
 coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
 matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
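Note: Vulkan caps each workgroup-count dimension (commonly at 65535), so very large NPQ ranges cannot be dispatched along y alone; the shader change folds gl_WorkGroupID.z back into the NPQ block index, and the early-out added to main() discards the padding groups of the last z slice. A host-side C++ sketch of the matching dispatch split, assuming the y cap of 512 implied by the shader's `* 512` factor:

    #include <cstdint>
    #include <cstdio>

    // Split a 1-D block count into (y, z) dispatch dimensions so that
    // y stays within a fixed cap. The shader reconstructs the linear
    // index as gl_WorkGroupID.y + gl_WorkGroupID.z * 512 and returns
    // early for the out-of-range groups in the last z slice.
    int main() {
        const uint32_t Y_CAP    = 512;   // assumed cap matching the shader constant
        const uint32_t n_blocks = 70000; // hypothetical number of NPQ blocks

        uint32_t dispatch_y = n_blocks < Y_CAP ? n_blocks : Y_CAP;
        uint32_t dispatch_z = (n_blocks + Y_CAP - 1) / Y_CAP;

        // dispatch_y * dispatch_z >= n_blocks; the excess groups exit early.
        printf("y=%u z=%u covers %u >= %u\n", dispatch_y, dispatch_z,
               dispatch_y * dispatch_z, n_blocks);
    }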
@@ -7,35 +7,85 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

 FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];

-void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
+const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+// Compute starting index in matrix B for this superblock
 const uint y_idx = i * QUANT_K + 32 * ib32;

 uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+
+// Precompute indices for quantization lookup tables
+const uint qh_base = 2 * ib32;
+const uint qs_base = 4 * ib32;
+const uint sc_index = ib32 / 2;
+const uint sc_shift = 6 * (ib32 & 1);
+
+// Loop over rows in the superblock
 [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+// Load per-block scales and shift for quantization
 const uint16_t[4] scales = data_a[ibi].scales;
 const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12;
 const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x);
+const uint sc = data_a[ibi].scales[sc_index] >> sc_shift;

-const uint sc = data_a[ibi].scales[ib32 / 2] >> (6 * (ib32 & 1));
+// Temporary caches for decoding
+FLOAT_TYPE dl_cache[4];
+uint16_t gvf_cache[4];
+float delta_cache[4];
+
+// Precompute the multiplier and lookup values for 4 sub-blocks
 [[unroll]] for (uint l = 0; l < 4; ++l) {
-const uint qh = data_a[ibi].qh[2 * ib32 + l / 2] >> (4 * (l&1));
-const uint qs = data_a[ibi].qs[4 * ib32 + l];
-const float delta = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
-const float dl = d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1);
-
-const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
+dl_cache[l] = FLOAT_TYPE(d * (2 * bitfieldExtract(sc, 3 * int(l / 2), 3) + 1));
+const uint qh = data_a[ibi].qh[qh_base + l / 2] >> (4 * (l & 1));
+const uint qs = data_a[ibi].qs[qs_base + l];
+gvf_cache[l] = iq1s_grid[qs | ((qh & 7) << 8)];
+delta_cache[l] = ((qh & 8) != 0) ? -IQ1M_DELTA : IQ1M_DELTA;
+}

+// Loop over columns of the output
 [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
-vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+// Compute base index for matrix B
+const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx) / 4;
+vec4 b_vals[8];

-FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-[[unroll]] for (int k = 0; k < 4; ++k) {
-sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
-fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
+// Load 8 vec4 values from matrix B
+[[unroll]] for (int idx = 0; idx < 8; ++idx) {
+b_vals[idx] = vec4(data_b_v4[base_b_idx + idx]);
 }
-temp[j][n] = fma(dl, sum, temp[j][n]);
+
+FLOAT_TYPE col_sum = FLOAT_TYPE(0.0);
+
+// Loop over sub-blocks
+[[unroll]] for (uint l = 0; l < 4; ++l) {
+const uint16_t grid = gvf_cache[l];
+const float dl = dl_cache[l];
+
+// Decode 8 2-bit fbits from gvf_cache
+float f0 = float(bitfieldExtract(grid, 0, 2));
+float f1 = float(bitfieldExtract(grid, 2, 2));
+float f2 = float(bitfieldExtract(grid, 4, 2));
+float f3 = float(bitfieldExtract(grid, 6, 2));
+float f4 = float(bitfieldExtract(grid, 8, 2));
+float f5 = float(bitfieldExtract(grid, 10, 2));
+float f6 = float(bitfieldExtract(grid, 12, 2));
+float f7 = float(bitfieldExtract(grid, 14, 2));
+
+// Pack into vec4 for vectorized FMA
+const vec4 fbits_v0 = vec4(f0, f1, f2, f3);
+const vec4 fbits_v1 = vec4(f4, f5, f6, f7);
+const vec4 delta_v = vec4(delta_cache[l]);
+
+// Vectorized fused multiply-add
+vec4 sum_v = fma(b_vals[2*l + 0], fbits_v0 + delta_v, vec4(0.0));
+sum_v = fma(b_vals[2*l + 1], fbits_v1 + delta_v, sum_v);
+
+// Horizontal add to get scalar sum
+FLOAT_TYPE sum = sum_v.x + sum_v.y + sum_v.z + sum_v.w;
+
+// Accumulate to column sum
+col_sum = fma(dl, sum, col_sum);
+}
+// Write result to temporary buffer
+temp[j][n] += col_sum;
 }
 ibi += num_blocks_per_row;
 }
(Some files were not shown because too many files have changed in this diff.)