Merge branch 'master' into imatrix

Ed Addario 2025-10-01 19:06:48 +01:00
commit a28ee30310
No known key found for this signature in database
GPG Key ID: E7875815A3230993
276 changed files with 13460 additions and 4931 deletions


@@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.2.0
+ARG MUSA_VERSION=rc4.3.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

.github/workflows/build-amd.yml (new file)

@@ -0,0 +1,52 @@
name: CI (AMD)
on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
paths: [
'.github/workflows/build-amd.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp',
'**/*.cu',
'**/*.cuh',
'**/*.comp'
]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
ggml-ci-x64-amd-vulkan:
runs-on: [self-hosted, Linux, X64, AMD]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-amd-rocm:
runs-on: [self-hosted, Linux, X64, AMD]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
amd-smi static
GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp


@@ -141,97 +141,6 @@ jobs:
# cmake --build build --config Release -j $(nproc)
ubuntu-24-ppc64el-cpu-cross:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: Setup PowerPC64le
run: |
sudo dpkg --add-architecture ppc64el
# Add arch-specific repositories for non-amd64 architectures
cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
EOF
sudo apt-get update || true ;# Prevent failure due to missing URLs.
sudo apt-get install -y --no-install-recommends \
build-essential \
gcc-14-powerpc64le-linux-gnu \
g++-14-powerpc64le-linux-gnu
- name: Build
run: |
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DCMAKE_SYSTEM_NAME=Linux \
-DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
# ubuntu-24-ppc64el-vulkan-cross:
# runs-on: ubuntu-24.04
# steps:
# - uses: actions/checkout@v4
# - name: Setup PowerPC64le
# run: |
# sudo dpkg --add-architecture ppc64el
# # Add arch-specific repositories for non-amd64 architectures
# cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
# deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
# EOF
# sudo apt-get update || true ;# Prevent failure due to missing URLs.
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# glslc \
# gcc-14-powerpc64le-linux-gnu \
# g++-14-powerpc64le-linux-gnu \
# libvulkan-dev:ppc64el
# - name: Build
# run: |
# cmake -B build -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_VULKAN=ON \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
# -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
# cmake --build build --config Release -j $(nproc)
debian-13-loongarch64-cpu-cross:
runs-on: ubuntu-24.04
container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
@@ -344,3 +253,47 @@ jobs:
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
ubuntu-24-riscv64-cpu-spacemit-ime-cross:
runs-on: ubuntu-24.04
env:
SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
SPACEMIT_IME_TOOLCHAIN_PATH: "spacemit-toolchain-linux-glibc-x86_64"
steps:
- uses: actions/checkout@v4
- name: Cache Toolchain
uses: actions/cache@v4
id: cache-spacemit-ime-cross-toolchain
with:
path: ./${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
key: ${{ runner.os }}-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
- name: Setup Toolchain
if: steps.cache-spacemit-ime-cross-toolchain.outputs.cache-hit != 'true'
run: |
wget --quiet --no-check-certificate https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}.tar.xz -O ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
mkdir -p ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
tar xf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz -C ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }} --strip-components=1
rm -rf ${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}.tar.xz
- name: Build
run: |
export RISCV_ROOT_PATH=${PWD}/${{ env.SPACEMIT_IME_TOOLCHAIN_PATH }}
cmake -B build -DLLAMA_CURL=OFF \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_OPENMP=OFF \
-DLLAMA_BUILD_EXAMPLES=ON \
-DLLAMA_BUILD_TOOLS=ON \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
cmake --build build --config Release -j $(nproc)


@@ -58,3 +58,63 @@
-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
# debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
# runs-on: [self-hosted, RISCV64]
# steps:
# - name: Install prerequisites
# run: |
# sudo apt-get update || true
# sudo apt-get install -y libatomic1
# - uses: actions/checkout@v4
# - name: Setup Riscv
# run: |
# sudo apt-get update || true
# sudo apt-get install -y --no-install-recommends \
# build-essential \
# gcc-14-riscv64-linux-gnu \
# g++-14-riscv64-linux-gnu \
# ccache \
# cmake
# sudo apt-get upgrade binutils -y
# - name: Setup ccache
# run: |
# mkdir -p $HOME/.ccache
# ccache -M 5G -d $HOME/.ccache
# export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
# export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
# echo "$GITHUB_WORKSPACE"
# echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
# echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
# echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
# echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
# - name: Build
# run: |
# cmake -B build \
# -DLLAMA_CURL=OFF \
# -DCMAKE_BUILD_TYPE=Release \
# -DGGML_OPENMP=OFF \
# -DLLAMA_BUILD_EXAMPLES=ON \
# -DLLAMA_BUILD_TOOLS=ON \
# -DLLAMA_BUILD_TESTS=OFF \
# -DCMAKE_SYSTEM_NAME=Linux \
# -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
# -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
# -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
# -DCMAKE_C_COMPILER_LAUNCHER=ccache \
# -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
# -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
# -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
# -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
# -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
# -DGGML_RVV=ON \
# -DGGML_RV_ZFH=ON \
# -DGGML_RV_ZICBOP=ON \
# -DGGML_CPU_RISCV64_SPACEMIT=ON \
# -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
# cmake --build build --config Release -j $(nproc)


@@ -192,6 +192,10 @@ jobs:
os: ubuntu-22.04
- build: 'arm64'
os: ubuntu-22.04-arm
- build: 's390x'
os: ubuntu-24.04-s390x
- build: 'ppc64le'
os: ubuntu-24.04-ppc64le
runs-on: ${{ matrix.os }}
@@ -203,14 +207,31 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
-key: ubuntu-cpu-cmake
+key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
-- name: Dependencies
-id: depends
+- name: Build Dependencies
+id: build_depends
run: |
sudo apt-get update
-sudo apt-get install build-essential libcurl4-openssl-dev
+sudo apt-get install -y --no-install-recommends \
python3 python3-pip python3-dev \
libjpeg-dev build-essential libcurl4-openssl-dev \
git-lfs
- name: Python Dependencies
id: python_depends
run: |
python3 -m pip install --upgrade pip
pip3 install ./gguf-py
- name: Swap Endianness
id: endianness
if: ${{ matrix.build == 's390x' }}
run: |
for f in models/*.gguf; do
echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
done
- name: Build
id: cmake_build
@@ -228,6 +249,7 @@ jobs:
- name: Test llama2c conversion
id: llama2c_test
if: ${{ matrix.build != 's390x' }}
run: |
cd build
echo "Fetch tokenizer"
@@ -237,6 +259,15 @@ jobs:
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
if: ${{ matrix.build == 's390x' }}
run: |
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@@ -475,7 +506,7 @@ jobs:
ubuntu-22-cmake-musa:
runs-on: ubuntu-22.04
-container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
steps:
- name: Clone
@@ -1191,11 +1222,12 @@ jobs:
- name: Clone
uses: actions/checkout@v4
-- name: ccache
-uses: ggml-org/ccache-action@v1.2.16
-with:
-key: android-build
-evict-old-files: 1d
+# Disabled due to size (400MB) and always 0 cache hits
+# - name: ccache
+# uses: ggml-org/ccache-action@v1.2.16
+# with:
+# key: android-build
# evict-old-files: 1d
- name: Set up JDK
uses: actions/setup-java@v3
@@ -1430,34 +1462,6 @@ jobs:
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-x64-amd-vulkan:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
#
# ggml-ci-x64-amd-rocm:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]


@@ -28,7 +28,7 @@ jobs:
push_to_registry:
name: Push Docker image to Docker Hub
-runs-on: ubuntu-22.04
+runs-on: ${{ matrix.config.runs_on }}
env:
COMMIT_SHA: ${{ github.sha }}
strategy:
@@ -39,12 +39,12 @@ jobs:
# Note: the arm64 images are failing, which prevents the amd64 images from being built
# https://github.com/ggml-org/llama.cpp/issues/11888
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
+- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
-- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
+- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
-- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
+- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:
@@ -54,6 +54,7 @@ jobs:
fetch-depth: 0 # preserve git history, so we can determine the build number
- name: Set up QEMU
if: ${{ matrix.config.tag != 's390x' }}
uses: docker/setup-qemu-action@v3
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
@@ -68,22 +69,19 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
-- name: Determine tag name
+- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Determine image tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
REPO_NAME="${{ github.event.repository.name }}"
# determine tag name postfix (build number, commit hash)
if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
TAG_POSTFIX="-b${BUILD_NUMBER}"
else
SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
fi
# list all tags possible
if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
TYPE=""
@@ -91,17 +89,19 @@ jobs:
TYPE="-${{ matrix.config.tag }}"
fi
PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+CACHETAGS="${PREFIX}buildcache${TYPE}"
+FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
echo "cache_output_tags=$CACHETAGS" # print out for debugging
echo "full_output_tags=$FULLTAGS" # print out for debugging echo "full_output_tags=$FULLTAGS" # print out for debugging
echo "light_output_tags=$LIGHTTAGS" # print out for debugging echo "light_output_tags=$LIGHTTAGS" # print out for debugging
echo "server_output_tags=$SERVERTAGS" # print out for debugging echo "server_output_tags=$SERVERTAGS" # print out for debugging
env: env:
GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Free Disk Space (Ubuntu)
@@ -134,11 +134,14 @@ jobs:
target: full
provenance: false
# using github experimental cache
-cache-from: type=gha
-cache-to: type=gha,mode=max
+#cache-from: type=gha
+#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Light Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
@@ -153,11 +156,14 @@ jobs:
target: light
provenance: false
# using github experimental cache
-cache-from: type=gha
-cache-to: type=gha,mode=max
+#cache-from: type=gha
+#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
- name: Build and push Server Docker image (tagged + versioned)
if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
@@ -172,8 +178,37 @@ jobs:
target: server
provenance: false
# using github experimental cache
-cache-from: type=gha
-cache-to: type=gha,mode=max
+#cache-from: type=gha
+#cache-to: type=gha,mode=max
# return to this if the experimental github cache is having issues
#cache-to: type=local,dest=/tmp/.buildx-cache
#cache-from: type=local,src=/tmp/.buildx-cache
# using registry cache (no storage limit)
cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
create_tag:
name: Create and push git tag
runs-on: ubuntu-22.04
permissions:
contents: write
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Determine source tag name
id: srctag
uses: ./.github/actions/get-tag-name
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
- name: Create and push git tag
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
git tag ${{ steps.srctag.outputs.name }} || exit 0
git push origin ${{ steps.srctag.outputs.name }} || exit 0


@@ -150,7 +150,7 @@ jobs:
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
-key: ubuntu-cpu-cmake
+key: ubuntu-cpu-cmake-${{ matrix.build }}
evict-old-files: 1d
- name: Dependencies

.gitignore

@@ -149,6 +149,6 @@ poetry.toml
/run-chat.sh
.ccache/
-# Code Workspace
+# IDE
*.code-workspace
.windsurf/


@@ -1,7 +0,0 @@
---
trigger: manual
---
#### Tailwind & CSS
- We are using Tailwind v4 which uses oklch colors so we now want to refer to the CSS vars directly, without wrapping it with any color function like `hsla/hsl`, `rgba` etc.


@@ -1,48 +0,0 @@
---
trigger: manual
---
# Coding rules
## Svelte & SvelteKit
### Services vs Stores Separation Pattern
#### `lib/services/` - Pure Business Logic
- **Purpose**: Stateless business logic and external communication
- **Contains**:
- API calls to external services (ApiService)
- Pure business logic functions (ChatService, etc.)
- **Rules**:
- NO Svelte runes ($state, $derived, $effect)
- NO reactive state management
- Pure functions and classes only
- Can import types but not stores
- Focus on "how" - implementation details
#### `lib/stores/` - Reactive State Management
- **Purpose**: Svelte-specific reactive state with runes
- **Contains**:
- Reactive state classes with $state, $derived, $effect
- Database operations (DatabaseStore)
- UI-focused state management
- Store orchestration logic
- **Rules**:
- USE Svelte runes for reactivity
- Import and use services for business logic
- NO direct database operations
- NO direct API calls (use services)
- Focus on "what" - reactive state for UI
#### Enforcement
- Services should be testable without Svelte
- Stores should leverage Svelte's reactivity system
- Clear separation: services handle data, stores handle state
- Services can be reused across multiple stores
#### Misc
- Always use `let` for $derived state variables


@@ -1,9 +0,0 @@
---
trigger: manual
---
# Automated Tests
## General rules
- NEVER include any test code in the production code - we should always have it in a separate dedicated files


@@ -1,7 +0,0 @@
---
trigger: manual
---
## TypeScript
- Add JSDocs for functions


@@ -92,6 +92,7 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" OFF)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
# Required for relocatable CMake package


@@ -14,6 +14,7 @@
/common/build-info.* @ggerganov
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/http.* @angt
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/sampling.* @ggerganov
@@ -50,6 +51,7 @@
/ggml/src/ggml-blas/ @slaren
/ggml/src/ggml-common.h @ggerganov @slaren
/ggml/src/ggml-cpu/ @ggerganov @slaren
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
/ggml/src/ggml-cuda/common.cuh @slaren
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
@@ -59,8 +61,10 @@
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov
/ggml/src/ggml-threading.* @ggerganov @slaren
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
@@ -89,6 +93,7 @@
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/rpc/ @rgerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/webui/ @allozaur
@@ -103,4 +108,5 @@
/LICENSE @ggerganov
/README.md @ggerganov
/SECURITY.md @ggerganov
/build-xcframework.sh @danbev
requirements*.txt @CISC


@@ -422,6 +422,7 @@ echo "Building for iOS devices..."
cmake -B build-ios-device -G Xcode \
"${COMMON_CMAKE_ARGS[@]}" \
-DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_SYSROOT=iphoneos \
-DCMAKE_OSX_ARCHITECTURES="arm64" \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \


@@ -21,7 +21,7 @@ docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
-mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:


@@ -114,6 +114,7 @@ if [ ! -z ${GG_BUILD_NO_SVE} ]; then
# arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
fi
## helpers
# download a file if it does not exist or if it is outdated


@@ -0,0 +1,29 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv64)
set(CMAKE_SYSTEM_VERSION 1)
if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
else()
set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
if (DEFINED ENV{RISCV_ROOT_PATH})
file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
else()
message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
endif()
set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
endif()
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")


@@ -56,6 +56,7 @@ add_library(${TARGET} STATIC
common.h
console.cpp
console.h
http.h
json-partial.cpp
json-partial.h
json-schema-to-grammar.cpp
@@ -87,7 +88,43 @@ if (LLAMA_CURL)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif ()
+endif()
if (LLAMA_OPENSSL)
find_package(OpenSSL)
if (OpenSSL_FOUND)
include(CheckCSourceCompiles)
set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
check_c_source_compiles("
#include <openssl/opensslv.h>
#if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
# if OPENSSL_VERSION_NUMBER < 0x1010107f
# error bad version
# endif
#else
# if OPENSSL_VERSION_NUMBER < 0x30000000L
# error bad version
# endif
#endif
int main() { return 0; }
" OPENSSL_VERSION_SUPPORTED)
set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
if (OPENSSL_VERSION_SUPPORTED)
message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
find_library(SECURITY_FRAMEWORK Security REQUIRED)
target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
endif()
endif()
else()
message(STATUS "OpenSSL not found, SSL support disabled")
endif()
endif()
if (LLAMA_LLGUIDANCE)
include(ExternalProject)


@@ -32,11 +32,11 @@
#include <thread>
#include <vector>
//#define LLAMA_USE_CURL
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#else
#include "http.h"
#endif
#ifdef __linux__
@@ -52,6 +52,13 @@
#endif
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
// isatty
#if defined(_WIN32)
#include <io.h>
#else
#include <unistd.h>
#endif
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
@@ -98,6 +105,14 @@ static void write_file(const std::string & fname, const std::string & content) {
}
}
static bool is_output_a_tty() {
#if defined(_WIN32)
return _isatty(_fileno(stdout));
#else
return isatty(1);
#endif
}
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) { common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples); this->examples = std::move(examples);
return *this; return *this;
@ -215,12 +230,55 @@ struct common_hf_file_res {
std::string mmprojFile; std::string mmprojFile;
}; };
-#ifdef LLAMA_USE_CURL
-bool common_has_curl() {
-return true;
+static void write_etag(const std::string & path, const std::string & etag) {
+const std::string etag_path = path + ".etag";
+write_file(etag_path, etag);
+LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
}
static std::string read_etag(const std::string & path) {
std::string none;
const std::string etag_path = path + ".etag";
if (std::filesystem::exists(etag_path)) {
std::ifstream etag_in(etag_path);
if (!etag_in) {
LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
return none;
}
std::string etag;
std::getline(etag_in, etag);
return etag;
}
// no etag file, but maybe there is an old .json
// remove this code later
const std::string metadata_path = path + ".json";
if (std::filesystem::exists(metadata_path)) {
std::ifstream metadata_in(metadata_path);
try {
nlohmann::json metadata_json;
metadata_in >> metadata_json;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
metadata_json.dump().c_str());
if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
std::string etag = metadata_json.at("etag");
write_etag(path, etag);
if (!std::filesystem::remove(metadata_path)) {
LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
}
return etag;
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
return none;
}
#ifdef LLAMA_USE_CURL
//
// CURL utils
//
@@ -371,36 +429,15 @@ static bool common_download_head(CURL * curl,
static bool common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token) {
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
for (int i = 0; i < max_attempts; ++i) {
nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
std::string etag;
std::string last_modified;
// Check if the file already exists locally
const auto file_exists = std::filesystem::exists(path);
if (file_exists) {
-// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+etag = read_etag(path);
std::ifstream metadata_in(metadata_path);
if (metadata_in.good()) {
try {
metadata_in >> metadata;
LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
metadata.dump().c_str());
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
}
}
// if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
@@ -438,11 +475,6 @@ static bool common_download_file_single_online(const std::string & url,
headers.etag.c_str());
should_download = true;
should_download_from_scratch = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__,
last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
should_download_from_scratch = true;
}
}
@@ -473,15 +505,9 @@
}
}
}
if (head_request_ok) {
-// Write the updated JSON metadata file.
-metadata.update({
+write_etag(path, headers.etag);
+}
{ "url", url },
{ "etag", headers.etag },
{ "lastModified", headers.last_modified }
});
write_file(metadata_path, metadata.dump(4));
LOG_DBG("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
// start the download
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
@@ -568,21 +594,238 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
#else
-bool common_has_curl() {
-return false;
-}
+static void print_progress(size_t current, size_t total) {
+if (!is_output_a_tty()) {
+return;
static bool common_download_file_single_online(const std::string &, const std::string &, const std::string &) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params &) {
if (!url.empty()) {
throw std::runtime_error("error: built without CURL, cannot download model from the internet");
} }
return {}; if (!total) {
return;
}
size_t width = 50;
size_t pct = (100 * current) / total;
size_t pos = (width * current) / total;
std::cout << "["
<< std::string(pos, '=')
<< (pos < width ? ">" : "")
<< std::string(width - pos, ' ')
<< "] " << std::setw(3) << pct << "% ("
<< current / (1024 * 1024) << " MB / "
<< total / (1024 * 1024) << " MB)\r";
std::cout.flush();
}
static bool common_pull_file(httplib::Client & cli,
const std::string & resolve_path,
const std::string & path_tmp,
bool supports_ranges,
size_t existing_size,
size_t & total_size) {
std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
if (!ofs.is_open()) {
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
return false;
}
httplib::Headers headers;
if (supports_ranges && existing_size > 0) {
headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
}
std::atomic<size_t> downloaded{existing_size};
auto res = cli.Get(resolve_path, headers,
[&](const httplib::Response &response) {
if (existing_size > 0 && response.status != 206) {
LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
return false;
}
if (existing_size == 0 && response.status != 200) {
LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
return false;
}
if (total_size == 0 && response.has_header("Content-Length")) {
try {
size_t content_length = std::stoull(response.get_header_value("Content-Length"));
total_size = existing_size + content_length;
} catch (const std::exception &e) {
LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
}
}
return true;
},
[&](const char *data, size_t len) {
ofs.write(data, len);
if (!ofs) {
LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
return false;
}
downloaded += len;
print_progress(downloaded, total_size);
return true;
},
nullptr
);
std::cout << "\n";
if (!res) {
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
return false;
}
return true;
}
// download one single file from remote URL to local path
static bool common_download_file_single_online(const std::string & url,
const std::string & path,
const std::string & bearer_token) {
static const int max_attempts = 3;
static const int retry_delay_seconds = 2;
auto [cli, parts] = common_http_client(url);
httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
if (!bearer_token.empty()) {
default_headers.insert({"Authorization", "Bearer " + bearer_token});
}
cli.set_default_headers(default_headers);
const bool file_exists = std::filesystem::exists(path);
std::string last_etag;
if (file_exists) {
last_etag = read_etag(path);
} else {
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
for (int i = 0; i < max_attempts; ++i) {
auto head = cli.Head(parts.path);
bool head_ok = head && head->status >= 200 && head->status < 300;
if (!head_ok) {
LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
if (file_exists) {
LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
return true;
}
}
std::string etag;
if (head_ok && head->has_header("ETag")) {
etag = head->get_header_value("ETag");
}
size_t total_size = 0;
if (head_ok && head->has_header("Content-Length")) {
try {
total_size = std::stoull(head->get_header_value("Content-Length"));
} catch (const std::exception& e) {
LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
}
}
bool supports_ranges = false;
if (head_ok && head->has_header("Accept-Ranges")) {
supports_ranges = head->get_header_value("Accept-Ranges") != "none";
}
bool should_download_from_scratch = false;
if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
last_etag.c_str(), etag.c_str());
should_download_from_scratch = true;
}
if (file_exists) {
if (!should_download_from_scratch) {
LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
return true;
}
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
const std::string path_temporary = path + ".downloadInProgress";
size_t existing_size = 0;
if (std::filesystem::exists(path_temporary)) {
if (supports_ranges && !should_download_from_scratch) {
existing_size = std::filesystem::file_size(path_temporary);
} else if (remove(path_temporary.c_str()) != 0) {
LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
return false;
}
}
// start the download
LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
__func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
if (!was_pull_successful) {
if (i + 1 < max_attempts) {
const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
} else {
LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
}
continue;
}
if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
if (!etag.empty()) {
write_etag(path, etag);
}
break;
}
return true;
}
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
const common_remote_params & params) {
auto [cli, parts] = common_http_client(url);
httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
for (const auto & header : params.headers) {
size_t pos = header.find(':');
if (pos != std::string::npos) {
headers.emplace(header.substr(0, pos), header.substr(pos + 1));
} else {
headers.emplace(header, "");
}
}
if (params.timeout > 0) {
cli.set_read_timeout(params.timeout, 0);
cli.set_write_timeout(params.timeout, 0);
}
std::vector<char> buf;
auto res = cli.Get(parts.path, headers,
[&](const char *data, size_t len) {
buf.insert(buf.end(), data, data + len);
return params.max_size == 0 ||
buf.size() <= static_cast<size_t>(params.max_size);
},
nullptr
);
if (!res) {
throw std::runtime_error("error: cannot make GET request");
}
return { res->status, std::move(buf) };
} }
#endif // LLAMA_USE_CURL
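For orientation only (not part of the commit): a minimal sketch of how a caller might use the `common_remote_get_content()` shown above. The struct fields mirror what the diff exercises (`headers`, `timeout`, `max_size`); the local declarations, field types, example URL and header are assumptions, and error handling is reduced to one try/catch.

```cpp
// Hypothetical usage sketch for common_remote_get_content(); the declarations
// are repeated here only so the snippet is self-contained. In the real tree
// they are assumed to come from common/arg.h.
#include <cstdio>
#include <exception>
#include <string>
#include <utility>
#include <vector>

struct common_remote_params {
    std::vector<std::string> headers; // "Key: Value" strings, split on the first ':'
    long timeout  = 0;                // seconds; 0 keeps the library default (assumed type)
    long max_size = 0;                // stop reading the body past this many bytes; 0 = unlimited
};

std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
                                                             const common_remote_params & params);

int fetch_manifest_example() {
    common_remote_params params;
    params.headers.push_back("Accept: application/json");
    params.timeout  = 30;
    params.max_size = 1024 * 1024; // give up after 1 MiB

    try {
        // example.com URL is made up for illustration
        auto [status, body] = common_remote_get_content("https://example.com/manifest.json", params);
        std::printf("HTTP %ld, %zu bytes\n", status, body.size());
    } catch (const std::exception & e) {
        std::printf("request failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```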


@@ -78,7 +78,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
// function to be used by test-arg-parser
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
bool common_has_curl();
struct common_remote_params {
std::vector<std::string> headers;


@@ -1616,17 +1616,36 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
);
});
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\"<|start|>assistant\"? \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_role, " | ") + " )"
);
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
channel + " \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_channel, " | ") + " )"
);
if (data.grammar_lazy) {
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\"<|start|>assistant\"? \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_role, " | ") + " )"
);
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel); builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
} else {
auto not_end = builder.add_rule("not-end",
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
auto analysis = builder.add_rule("analysis",
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto commentary = builder.add_rule("commentary",
"\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
);
builder.add_rule("root",
"( " + analysis + " \"<|start|>assistant\" )? " +
"( " + commentary + " \"<|start|>assistant\" )? " +
"( " + recipient_in_role + " | " + recipient_in_channel + " )"
);
}
// Trigger on tool calls that appear in the commentary channel
data.grammar_triggers.push_back({


@@ -51,6 +51,11 @@
#include <unistd.h>
#endif
#if defined(__linux__)
#include <sys/types.h>
#include <pwd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
@@ -865,8 +870,20 @@ std::string fs_get_cache_directory() {
#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
if (std::getenv("XDG_CACHE_HOME")) {
cache_directory = std::getenv("XDG_CACHE_HOME");
-} else {
+} else if (std::getenv("HOME")) {
cache_directory = std::getenv("HOME") + std::string("/.cache/");
} else {
#if defined(__linux__)
/* no $HOME is defined, fallback to getpwuid */
struct passwd *pw = getpwuid(getuid());
if ((!pw) || (!pw->pw_dir)) {
throw std::runtime_error("Failed to find $HOME directory");
}
cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
#else /* defined(__linux__) */
throw std::runtime_error("Failed to find $HOME directory");
#endif /* defined(__linux__) */
}
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");

common/http.h (new file)

@@ -0,0 +1,73 @@
#pragma once
#include <cpp-httplib/httplib.h>
struct common_http_url {
std::string scheme;
std::string user;
std::string password;
std::string host;
std::string path;
};
static common_http_url common_http_parse_url(const std::string & url) {
common_http_url parts;
auto scheme_end = url.find("://");
if (scheme_end == std::string::npos) {
throw std::runtime_error("invalid URL: no scheme");
}
parts.scheme = url.substr(0, scheme_end);
if (parts.scheme != "http" && parts.scheme != "https") {
throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
}
auto rest = url.substr(scheme_end + 3);
auto at_pos = rest.find('@');
if (at_pos != std::string::npos) {
auto auth = rest.substr(0, at_pos);
auto colon_pos = auth.find(':');
if (colon_pos != std::string::npos) {
parts.user = auth.substr(0, colon_pos);
parts.password = auth.substr(colon_pos + 1);
} else {
parts.user = auth;
}
rest = rest.substr(at_pos + 1);
}
auto slash_pos = rest.find('/');
if (slash_pos != std::string::npos) {
parts.host = rest.substr(0, slash_pos);
parts.path = rest.substr(slash_pos);
} else {
parts.host = rest;
parts.path = "/";
}
return parts;
}
static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
common_http_url parts = common_http_parse_url(url);
if (parts.host.empty()) {
throw std::runtime_error("error: invalid URL format");
}
httplib::Client cli(parts.scheme + "://" + parts.host);
if (!parts.user.empty()) {
cli.set_basic_auth(parts.user, parts.password);
}
cli.set_follow_location(true);
return { std::move(cli), std::move(parts) };
}
static std::string common_http_show_masked_url(const common_http_url & parts) {
return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
}

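As a quick orientation for the new header above, here is a small usage sketch (not part of the commit) of `common_http_parse_url`, `common_http_client` and `common_http_show_masked_url`. It assumes cpp-httplib and `http.h` are on the include path; the URL and the `main` driver are made up, and error handling is omitted.

```cpp
// Hypothetical sketch showing the helpers declared in common/http.h.
#include <iostream>
#include <string>

#include "http.h" // assumed include path; pulls in <cpp-httplib/httplib.h>

int main() {
    // made-up URL with embedded credentials
    const std::string url = "https://user:secret@example.com/models/tiny.gguf";

    // split into scheme / user / password / host / path
    common_http_url parts = common_http_parse_url(url);
    std::cout << "host: " << parts.host << ", path: " << parts.path << "\n";

    // log-friendly form: credentials are masked as ****:****
    std::cout << common_http_show_masked_url(parts) << "\n";

    // client bound to scheme://host, with basic auth and redirects already set up
    auto [cli, p] = common_http_client(url);
    if (auto res = cli.Head(p.path)) {
        std::cout << "HEAD status: " << res->status << "\n";
    }
    return 0;
}
```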

@@ -0,0 +1,89 @@
> [!IMPORTANT]
> This build documentation is specific only to RISC-V SpacemiT SOCs.
## Build llama.cpp locally (for riscv64)
1. Prepare Toolchain For RISCV
~~~
wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
~~~
2. Build
Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
```bash
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_CPU_RISCV64_SPACEMIT=ON \
-DLLAMA_CURL=OFF \
-DGGML_RVV=ON \
-DGGML_RV_ZFH=ON \
-DGGML_RV_ZICBOP=ON \
-DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
-DCMAKE_INSTALL_PREFIX=build/installed
cmake --build build --parallel $(nproc) --config Release
pushd build
make install
popd
```
## Simulation
You can use QEMU to perform emulation on non-RISC-V architectures.
1. Download QEMU
~~~
wget https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
~~~
2. Run Simulation
After build your llama.cpp, you can run the executable file via QEMU for simulation, for example:
~~~
export QEMU_ROOT_PATH={your QEMU file path}
export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
~~~
## Performance
#### Quantization Support For Matrix
~~~
model name : Spacemit(R) X60
isa : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
mmu : sv39
uarch : spacemit,x60
mvendorid : 0x710
marchid : 0x8000000058000001
~~~
Q4_0
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | pp512|64.12 ± 0.26|
Qwen2.5 0.5B |403.20 MiB|630.17 M| cpu | 4 | tg128|10.03 ± 0.01|
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | pp512|24.16 ± 0.02|
Qwen2.5 1.5B |1011.16 MiB| 1.78 B | cpu | 4 | tg128|3.83 ± 0.06|
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | pp512|12.08 ± 0.02|
Qwen2.5 3B | 1.86 GiB | 3.40 B | cpu | 4 | tg128|2.23 ± 0.02|
Q4_1
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | pp512|62.07 ± 0.12|
Qwen2.5 0.5B |351.50 MiB|494.03 M| cpu | 4 | tg128|9.91 ± 0.01|
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | pp512|22.95 ± 0.25|
Qwen2.5 1.5B |964.06 MiB| 1.54 B | cpu | 4 | tg128|4.01 ± 0.15|
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | pp512|11.55 ± 0.16|
Qwen2.5 3B | 1.85 GiB | 3.09 B | cpu | 4 | tg128|2.25 ± 0.04|
Q4_K
| Model | Size | Params | backend | threads | test | t/s |
| -----------| -------- | ------ | ------- | ------- | ---- |------|
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | pp512|9.29 ± 0.05|
Qwen2.5 0.5B |462.96 MiB|630.17 M| cpu | 4 | tg128|5.67 ± 0.04|
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | pp512|10.38 ± 0.10|
Qwen2.5 1.5B | 1.04 GiB | 1.78 B | cpu | 4 | tg128|3.17 ± 0.08|
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | pp512|4.23 ± 0.04|
Qwen2.5 3B | 1.95 GiB | 3.40 B | cpu | 4 | tg128|1.73 ± 0.00|
View File
@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
The defaults are: The defaults are:
- `MUSA_VERSION` set to `rc4.2.0` - `MUSA_VERSION` set to `rc4.3.0`
The resulting images, are essentially the same as the non-MUSA images: The resulting images, are essentially the same as the non-MUSA images:
View File
@ -5,6 +5,11 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)
set(TEST_TARGET test-eval-callback) set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
else()
add_test(NAME ${TEST_TARGET}
COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
endif()
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
View File
@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
### GGML Version ### GGML Version
set(GGML_VERSION_MAJOR 0) set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9) set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 0) set(GGML_VERSION_PATCH 4)
set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH) find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@ -26,8 +25,8 @@ if(GIT_EXE)
) )
endif() endif()
# Build the version string with optional -dev suffix and dirty flag # Build the version string with optional dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}") set(GGML_VERSION "${GGML_VERSION_BASE}")
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0) if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
set(GGML_VERSION "${GGML_VERSION}-dirty") set(GGML_VERSION "${GGML_VERSION}-dirty")
endif() endif()
@ -177,7 +176,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (MINGW) if (MINGW)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
endif() endif()
# ggml core # ggml core
View File
@ -237,6 +237,8 @@
#define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
#define GGML_ROPE_TYPE_NORMAL 0
#define GGML_ROPE_TYPE_NEOX 2 #define GGML_ROPE_TYPE_NEOX 2
#define GGML_ROPE_TYPE_MROPE 8 #define GGML_ROPE_TYPE_MROPE 8
#define GGML_ROPE_TYPE_VISION 24 #define GGML_ROPE_TYPE_VISION 24
View File
@ -135,6 +135,10 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return p; return p;
} }
static const char * dl_error() {
return "";
}
#else #else
using dl_handle = void; using dl_handle = void;
@ -155,6 +159,11 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
return dlsym(handle, name); return dlsym(handle, name);
} }
static const char * dl_error() {
const char *rslt = dlerror();
return rslt != nullptr ? rslt : "";
}
#endif #endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>; using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@ -240,7 +249,7 @@ struct ggml_backend_registry {
dl_handle_ptr handle { dl_load_library(path) }; dl_handle_ptr handle { dl_load_library(path) };
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str()); GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
} }
return nullptr; return nullptr;
} }
@ -530,7 +539,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (filename.native().find(file_prefix) == 0 && ext == file_extension) { if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
dl_handle_ptr handle { dl_load_library(entry) }; dl_handle_ptr handle { dl_load_library(entry) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str()); GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
} }
if (handle) { if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
View File
@ -74,7 +74,7 @@ if (BLAS_FOUND)
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS}) target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel")) if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL) add_compile_definitions(GGML_BLAS_USE_MKL)
endif() endif()
View File
@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/arch/riscv/quants.c ggml-cpu/arch/riscv/quants.c
ggml-cpu/arch/riscv/repack.cpp ggml-cpu/arch/riscv/repack.cpp
) )
if (GGML_CPU_RISCV64_SPACEMIT)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
list(APPEND GGML_CPU_SOURCES
ggml-cpu/spacemit/ime.cpp
ggml-cpu/spacemit/ime.h
ggml-cpu/spacemit/ime1_kernels.cpp
ggml-cpu/spacemit/ime_kernels.h
)
endif()
set(MARCH_STR "rv64gc") set(MARCH_STR "rv64gc")
if (GGML_RV_ZFH) if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh") string(APPEND MARCH_STR "_zfh")
@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# Fetch KleidiAI sources: # Fetch KleidiAI sources:
include(FetchContent) include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.13.0") set(KLEIDIAI_COMMIT_TAG "v1.14.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz") set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1") set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
if (POLICY CMP0135) if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW) cmake_policy(SET CMP0135 NEW)
@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S) ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
View File
@ -160,7 +160,6 @@
#define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
#define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
#define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
// repack.cpp // repack.cpp
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
#define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
View File
@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
for (int j = 0; j < 8; j++) { for (int j = 0; j < 8; j++) {
const float32x4_t v = vec_mul(srcv[j], vec_splats(id)); const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v); /* Uses non-default rounding for vec_signed or vec_round */
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
y[i].qs[4*j + 0] = vec_extract(vi, 0); y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1); y[i].qs[4*j + 1] = vec_extract(vi, 1);
@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
for (int j = 0; j < 8; j++) { for (int j = 0; j < 8; j++) {
const float32x4_t v = vec_mul(srcv[j], vec_splats(id)); const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v); /* Uses non-default rounding for vec_signed or vec_round */
const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
y[i].qs[4*j + 0] = vec_extract(vi, 0); y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1); y[i].qs[4*j + 1] = vec_extract(vi, 1);
@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif #endif
} }
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
assert(n % QK_MXFP4 == 0);
static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
const int qk = QK_MXFP4;
const int nb = n / qk;
const block_mxfp4 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
int ib = 0;
float sumf = 0.0f;
#if defined(__VXE__) || defined(__VXE2__)
const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
float32x4_t v_acc = vec_splats(0.0f);
#pragma GCC unroll 8
for (; ib + 1 < nb; ib += 2) {
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
const uint8x16_t v_x0 = vec_xl(0, x0->qs);
const uint8x16_t v_x1 = vec_xl(0, x1->qs);
int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
const int8x16_t v_y0l = vec_xl(0, y0->qs);
const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
const int8x16_t v_y1l = vec_xl(0, y1->qs);
const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
const float32x4_t v_xy0f = vec_float(v_xy0);
const float32x4_t v_xy1f = vec_float(v_xy1);
const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
v_acc = vec_madd(v_xy0f, v_d0, v_acc);
v_acc = vec_madd(v_xy1f, v_d1, v_acc);
}
for (; ib < nb; ++ib) {
const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
const uint8x16_t v_x = vec_xl(0, x0->qs);
int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
const int8x16_t v_yl = vec_xl(0, y0->qs);
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
const float32x4_t v_xyf = vec_float(v_xy);
const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
v_acc = vec_madd(v_xyf, v_d, v_acc);
}
sumf = vec_hsum_f32x4(v_acc);
*s = sumf;
#else
UNUSED(x);
UNUSED(y);
UNUSED(ib);
UNUSED(sumf);
ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
const int qk = QK8_0; const int qk = QK8_0;
const int nb = n / qk; const int nb = n / qk;
@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
uint8x16_t q3h[4]; uint8x16_t q3h[4];
uint8x16_t q3b[2]; uint8x16_t q3b[2];
int8x16_t q3bytes[4]; int8x16_t q3bytes[4];
int8x16_t q8bytes[4]; int8x16_t q8bytes[8];
uint8x16_t qhbits[2]; uint8x16_t qhbits[2];
float sum = 0; float sum = 0;

View File
# include "kleidiai/kleidiai.h" # include "kleidiai/kleidiai.h"
#endif #endif
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
# include "spacemit/ime.h"
#endif
#if defined(_WIN32) #if defined(_WIN32)
# define WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX # ifndef NOMINMAX
@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
} }
#endif #endif
#ifdef GGML_USE_CPU_RISCV64_SPACEMIT
if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
}
#endif
#ifdef GGML_USE_CPU_KLEIDIAI #ifdef GGML_USE_CPU_KLEIDIAI
if (ggml_backend_cpu_kleidiai_buffer_type()) { if (ggml_backend_cpu_kleidiai_buffer_type()) {
bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type()); bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
View File
@ -87,15 +87,38 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
return tensor->ne[dim]; return tensor->ne[dim];
} }
template <typename Variant, typename Ret, typename... Args, std::size_t... Is>
constexpr bool variant_any_invocable_impl(std::index_sequence<Is...>) {
using V = std::remove_reference_t<Variant>;
return (std::is_invocable_r_v<
Ret,
std::variant_alternative_t<Is, V>,
Args...> || ...);
}
template <typename Variant, typename Ret, typename... Args>
constexpr bool variant_any_invocable_v =
variant_any_invocable_impl<Variant, Ret, Args...>(
std::make_index_sequence<
std::variant_size_v<std::remove_reference_t<Variant>>>{});
template<typename Ret, typename Variant, typename... Args> template<typename Ret, typename Variant, typename... Args>
static Ret variant_call(const Variant & var, Args&&... args) { static inline Ret variant_call(Variant && var, Args&&... args) {
return std::visit([&](auto&& func) -> Ret { static_assert(variant_any_invocable_v<std::remove_reference_t<Variant>, Ret, Args...>,
if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) { "No alternative in Variant is invocable with the provided arguments and return type.");
return func(std::forward<Args>(args)...);
return std::visit(
[&](auto && f) -> Ret {
using F = std::decay_t<decltype(f)>;
if constexpr (std::is_invocable_r_v<Ret, F, Args...>) {
return std::invoke(std::forward<decltype(f)>(f), std::forward<Args>(args)...);
} else { } else {
throw std::runtime_error("Invalid function type in variant_call"); GGML_ABORT("Invalid function type in variant_call");
GGML_UNREACHABLE();
} }
}, var); },
std::forward<Variant>(var)
);
} }
namespace ggml::cpu::kleidiai { namespace ggml::cpu::kleidiai {
@ -138,7 +161,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
if (kernels->rhs_type == GGML_TYPE_Q4_0) { if (kernels->rhs_type == GGML_TYPE_Q4_0) {
size = variant_call<size_t>(lhs_info->packed_size, m, k, QK4_0, mr, kr, sr); size = variant_call<size_t>(lhs_info->packed_size, m, k, QK4_0, mr, kr, sr);
} else if (kernels->rhs_type == GGML_TYPE_F16) { } else if (kernels->rhs_type == GGML_TYPE_F16) {
size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr) + const int64_t lhs_batch_size0 = op->src[1]->ne[2];
const int64_t rhs_batch_size0 = op->src[0]->ne[2];
const int64_t r = lhs_batch_size0 / rhs_batch_size0;
size = variant_call<size_t>(lhs_info->packed_size, m * r, k, mr, kr, sr) +
variant_call<size_t>(kernels->rhs_info.packed_size, n, k) + variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
k * n * sizeof(float) + n * sizeof(float); k * n * sizeof(float) + n * sizeof(float);
} else { } else {
@ -148,7 +174,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
return true; return true;
} }
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override { bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
if (dst->op == GGML_OP_MUL_MAT) { if (dst->op == GGML_OP_MUL_MAT) {
if (dst->src[0]->type == GGML_TYPE_Q4_0) { if (dst->src[0]->type == GGML_TYPE_Q4_0) {
@ -165,8 +190,6 @@ class tensor_traits : public ggml::cpu::tensor_traits {
} }
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) { bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
@ -175,7 +198,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst); ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
GGML_ASSERT(kernels); GGML_ASSERT(kernels);
bool is_gemv = src1->ne[1] == 1; const bool is_gemv = src1->ne[1] == 1;
kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm; kernel_info * kernel = is_gemv ? &kernels->gemv : &kernels->gemm;
lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info; lhs_packing_info * lhs_info = is_gemv ? &kernels->gemv_lhs_info : &kernels->gemm_lhs_info;
GGML_ASSERT(kernel); GGML_ASSERT(kernel);
@ -185,11 +208,14 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const int64_t lhs_batch_size0 = ne12; const int64_t lhs_batch_size0 = ne12;
const int64_t rhs_batch_size0 = ne02; const int64_t rhs_batch_size0 = ne02;
const int64_t batch_size = rhs_batch_size0; const int64_t batch_size = lhs_batch_size0;
GGML_ASSERT(rhs_batch_size0 > 0);
GGML_ASSERT(lhs_batch_size0 % rhs_batch_size0 == 0);
const int64_t r = lhs_batch_size0 / rhs_batch_size0; const int64_t r = lhs_batch_size0 / rhs_batch_size0;
const int64_t m = ne11 * r; const int64_t m_group = ne11;
const int64_t m = m_group;
const int64_t n = ne01; const int64_t n = ne01;
const int64_t k = ne00; const int64_t k = ne00;
@ -197,15 +223,15 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const size_t rhs_stride = src0->nb[1]; const size_t rhs_stride = src0->nb[1];
const size_t dst_stride = dst->nb[1]; const size_t dst_stride = dst->nb[1];
const int64_t mr = static_cast<int64_t>(kernel->get_mr()); const int64_t mr = (int64_t) kernel->get_mr();
const int64_t nr = static_cast<int64_t>(kernel->get_nr()); const int64_t nr = (int64_t) kernel->get_nr();
const int64_t kr = static_cast<int64_t>(kernel->get_kr()); const int64_t kr = (int64_t) kernel->get_kr();
const int64_t sr = static_cast<int64_t>(kernel->get_sr()); const int64_t sr = (int64_t) kernel->get_sr();
const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, m, k, mr, kr, sr); const size_t lhs_packed_size = variant_call<size_t>(lhs_info->packed_size, (size_t)m, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k); const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, (size_t)n, (size_t)k);
const size_t kxn_size = k * n * sizeof(float); const size_t kxn_size = (size_t)k * (size_t)n * sizeof(float);
const size_t bias_size = n * sizeof(float); const size_t bias_size = (size_t)n * sizeof(float);
const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size; const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
GGML_ASSERT(wsize_required <= params->wsize); GGML_ASSERT(wsize_required <= params->wsize);
@ -216,82 +242,102 @@ class tensor_traits : public ggml::cpu::tensor_traits {
uint8_t * bias = rhs_kxn + kxn_size; uint8_t * bias = rhs_kxn + kxn_size;
for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride; const int64_t rhs_batch_idx = batch_idx / r;
const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride; const uint8_t * rhs_batch_base = static_cast<const uint8_t *>(src0->data) + rhs_batch_idx * src0->nb[2];
uint8_t * dst_batch = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride; uint8_t * dst_batch_base = static_cast<uint8_t *>(dst->data) + batch_idx * dst->nb[2];
// LHS packing // LHS packing (threaded over m, honoring mr alignment and KV groups)
{ {
const int64_t m_roundup_mr = kai_roundup(m, mr); const int64_t m_roundup_mr = kai_roundup(m, mr);
const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth); const int64_t num_threads = KAI_MIN(m_roundup_mr / mr, nth);
if (ith < num_threads) { if (ith < num_threads) {
const int64_t num_m_per_thread0 = round_down(m_roundup_mr / num_threads, mr); const int64_t num_m_per_thread0 = round_down((size_t)(m_roundup_mr / num_threads), (size_t)mr);
const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0; const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
const int64_t m_start = ith * num_m_per_thread0; const int64_t m_start = ith * num_m_per_thread0;
const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0; const int64_t m_count = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
const size_t lhs_offset = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride); // Base packed offset (aligned) and per-row stride in bytes
const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, mr, kr, sr); const size_t base_packed_off = variant_call<size_t>(
lhs_info->get_packed_offset, (size_t)m_start, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
const size_t next_block_off = variant_call<size_t>(
lhs_info->get_packed_offset, (size_t)(m_start + mr), (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
const size_t row_stride_bytes = (next_block_off - base_packed_off) / (size_t)mr;
const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset; int64_t remaining = m_count;
void * dst_ptr = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset; int64_t cur = m_start;
variant_call<void>(lhs_info->pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr); while (remaining > 0) {
const int64_t row_in_group = cur;
const int64_t avail = m_group - row_in_group;
const int64_t take = std::min(avail, remaining);
const uint8_t * lhs_batch_base = static_cast<const uint8_t *>(src1->data) + batch_idx * src1->nb[2];
const void * src_ptr = lhs_batch_base + (size_t)row_in_group * lhs_stride;
const size_t dst_off = base_packed_off + (size_t)(cur - m_start) * row_stride_bytes;
void * dst_ptr = lhs_packed + dst_off;
variant_call<void>(lhs_info->pack_func,
(size_t)take, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr,
/*m_idx_start*/ 0, src_ptr, lhs_stride, dst_ptr);
cur += take;
remaining -= take;
}
} }
} }
// RHS packing // RHS packing (single thread), then synchronize
if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) { if (ith == 0) {
// First thread to reach this point handles RHS packing memset(bias, 0, (size_t)n * sizeof(float));
memset(bias, 0, n * sizeof(float)); transpose_f32kxn_f16nxk((size_t)n, (size_t)k,
transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn), reinterpret_cast<float *>(rhs_kxn),
reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride); reinterpret_cast<const uint16_t *>(rhs_batch_base),
rhs_stride);
variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float), variant_call<void>(kernels->rhs_info.pack_func,
rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr); /*num_groups*/ 1, (size_t)n, (size_t)k, (size_t)nr, (size_t)kr, (size_t)sr,
/*rhs_stride (bytes)*/ (size_t)(n * sizeof(float)),
rhs_kxn, bias, nullptr, rhs_packed, /*extra_bytes*/ 0, /*params*/ nullptr);
} }
ggml_barrier(params->threadpool); ggml_barrier(params->threadpool);
first_to_arrive.clear(std::memory_order_release); // Matmul (threaded over n)
// Perform the matmul
{ {
const int64_t m_to_process = m; const int64_t n_step = (int64_t) kernel->get_n_step();
const int64_t m_start = 0; int64_t num_threads_n = KAI_MIN(n / n_step, nth);
if (num_threads_n <= 0) {
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step()); num_threads_n = 1;
int64_t num_threads = KAI_MIN(n / n_step, nth);
if (num_threads <= 0) {
num_threads = 1;
} }
if (ith < num_threads) { if (ith < num_threads_n) {
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step); const int64_t num_n_per_thread0 = round_down((size_t)(n / num_threads_n), (size_t)n_step);
const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0; const int64_t num_n_per_threadN_1 = n - (num_threads_n - 1) * num_n_per_thread0;
const int64_t n_start = ith * num_n_per_thread0; const int64_t n_start = ith * num_n_per_thread0;
const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0; const int64_t n_to_process = (ith == num_threads_n - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k); // LHS packed base at row 0 (consistent with packing above)
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k); const size_t lhs_packed_offset0 = variant_call<size_t>(
const size_t dst_offset = kernel->get_dst_offset(m_start, n_start, dst_stride); lhs_info->get_packed_offset, (size_t)0, (size_t)k, (size_t)mr, (size_t)kr, (size_t)sr);
const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, (size_t)n_start, (size_t)k);
const size_t dst_offset = kernel->get_dst_offset((size_t)0, (size_t)n_start, dst_stride);
const void * lhs_ptr = lhs_packed + lhs_packed_offset; const void * lhs_ptr = lhs_packed + lhs_packed_offset0;
const void * rhs_ptr = rhs_packed + rhs_packed_offset; const void * rhs_ptr = rhs_packed + rhs_packed_offset;
float * dst_ptr = reinterpret_cast<float *>(dst_batch + dst_offset); float * dst_ptr = reinterpret_cast<float *>(dst_batch_base + dst_offset);
variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX); variant_call<void>(kernel->run_kernel,
(size_t)m, (size_t)n_to_process, (size_t)k,
lhs_ptr, rhs_ptr,
dst_ptr, dst_stride, sizeof(float),
-FLT_MAX, FLT_MAX);
} }
} }
if (batch_idx != batch_size - 1) { if (batch_idx != batch_size - 1) {
// This barrier is necessary when the batch size is larger than 1. While processing a batch,
// the work data buffer (params->wdata) is used as temporary storage which means that only
// a single batch can be processed at any given time. No barrier is needed for the last
// batch since GGML inserts a barrier between the execution of every operator.
ggml_barrier(params->threadpool); ggml_barrier(params->threadpool);
} }
} }
File diff suppressed because it is too large
View File
@ -0,0 +1,13 @@
#pragma once
#include "ggml-alloc.h"
#ifdef __cplusplus
extern "C" {
#endif
ggml_backend_buffer_type_t ggml_backend_cpu_riscv64_spacemit_buffer_type(void);
#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large
View File
@ -0,0 +1,26 @@
#pragma once
#include <cstddef>
namespace sqnbitgemm_spacemit_ime {
namespace ime1 {
size_t gemm_kernel_i8i4(size_t blk_len,
const std::byte * quant_a_ptr,
const std::byte * quant_b_data,
const float * quant_b_scale,
const std::byte * quant_b_zp,
float * c_ptr,
size_t count_m,
size_t count_n,
size_t count_k,
size_t block_count_k,
size_t ldc,
const float * bias,
const size_t scale_stride);
void quantize_a_row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
void quantize_a_4row_i8(size_t blk_len, const float * a_ptr, size_t count_k, std::byte * quant_a_ptr);
} // namespace ime1
} // namespace sqnbitgemm_spacemit_ime
View File
@ -610,7 +610,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
for (int i = 0; i < np; i += GGML_F32_STEP) { for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) { for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb); ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
} }
View File
@ -54,7 +54,7 @@ static __global__ void k_bin_bcast(const src0_t * src0,
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3); const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z); const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) { if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) {
return; return;
} }
View File
@ -586,18 +586,43 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA)) #endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
} }
static __device__ __forceinline__ void ggml_cuda_mad(half2 & acc, const half2 v, const half2 u) {
#ifdef FAST_FP16_AVAILABLE
acc += v*u;
#else
const float2 tmpv = __half22float2(v);
const float2 tmpu = __half22float2(u);
float2 tmpacc = __half22float2(acc);
tmpacc.x += tmpv.x * tmpu.x;
tmpacc.y += tmpv.y * tmpu.y;
acc = make_half2(tmpacc.x, tmpacc.y);
#endif // FAST_FP16_AVAILABLE
}
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD. // Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
template <int nbytes> template <int nbytes, int alignment = 0>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) { static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
if constexpr (nbytes == 4) { if constexpr (alignment != 0) {
*(int *) dst = *(const int *) src; static_assert(nbytes % alignment == 0, "bad alignment");
} else if constexpr (nbytes == 8) { }
*(int2 *) dst = *(const int2 *) src; constexpr int nb_per_cpy = alignment == 0 ? nbytes : alignment;
} else if constexpr (nbytes == 16) {
*(int4 *) dst = *(const int4 *) src; #pragma unroll
for (int i = 0; i < nbytes/nb_per_cpy; ++i) {
if constexpr (nb_per_cpy == 1) {
((char *) dst)[i] = ((const char *) src)[i];
} else if constexpr (nb_per_cpy == 2) {
((short *) dst)[i] = ((const short *) src)[i];
} else if constexpr (nb_per_cpy == 4) {
((int *) dst)[i] = ((const int *) src)[i];
} else if constexpr (nb_per_cpy == 8) {
((int2 *) dst)[i] = ((const int2 *) src)[i];
} else if constexpr (nb_per_cpy == 16) {
((int4 *) dst)[i] = ((const int4 *) src)[i];
} else { } else {
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes"); static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
} }
}
} }
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) { static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
View File
@ -329,8 +329,12 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else } else
#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY #endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
{ {
if (src0->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else {
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
} }
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index); ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
@ -400,7 +404,13 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
// Prioritize CUDA graph compatibility over direct memory copy optimization.
// Using copy kernels here maintains graph indirection support, preventing performance regression from disabled CUDA graphs.
if (src0->type == GGML_TYPE_F32) {
return (void*) cpy_flt<cpy_1_flt<float, float>>;
} else {
return nullptr; return nullptr;
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_flt<cpy_1_flt<float, float>>; return (void*) cpy_flt<cpy_1_flt<float, float>>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
View File
@ -33,276 +33,230 @@ typedef void (* fattn_kernel_t)(
const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33); const int32_t nb31, const int32_t nb32, const int64_t nb33);
typedef half (*vec_dot_KQ_f16_t)( typedef float (*vec_dot_KQ_t)(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
typedef float (*vec_dot_KQ_f32_t)(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds); const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
template<typename T, int D, int warp_size> template <int D, int nthreads>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0( static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_f16(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_0;
const int shift = k_KQ & (QI8_1/2);
const int v = (get_int_b2(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 sum2 = __half2half2(K_q4_0[ib].d) * Q_ds[k_KQ_0/warp_size];
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2) /* *8/QI8_1 == 1 */);
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
sum += (T) (__half2float(K_q4_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (8/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_1;
const int shift = k_KQ & (QI8_1/2);
const int v = (get_int_b4(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 d4d8_m4s8 = K_q4_1[ib].dm * Q_ds[k_KQ_0/warp_size];
const half2 sumid4d8_m4s8scaled = d4d8_m4s8 * make_half2(sumi, 1.0f/QI8_1);
sum += (T) (__low2half(sumid4d8_m4s8scaled) + __high2half(sumid4d8_m4s8scaled));
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float sumid4d8 = __low2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
const float m4s8scaled = __high2float(K_q4_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
sum += (T) (sumid4d8 + m4s8scaled);
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_0;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v = (get_int_b2(K_q5_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int vh = get_int_b2(K_q5_0[ib].qh, 0) >> (iqs8 * QI5_0);
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 sum2 = __half2half2(K_q5_0[ib].d) * Q_ds[k_KQ_0/warp_size];
sum += (T) (((half) sumi)*__low2half(sum2) - __high2half(sum2)*__float2half(2.0f)) /* *16/QI8_1 == 2 */;
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
sum += (T) (__half2float(K_q5_0[ib].d) * (sumi*Q_ds[k_KQ_0/warp_size].x - (16/QI8_1)*Q_ds[k_KQ_0/warp_size].y));
}
}
return sum;
}
template<typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_1;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v = (get_int_b2(K_q5_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
const int vh = get_int_b2(K_q5_1[ib].qh, 0) >> (iqs8 * QI5_1);
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
const int u = Q_q8[k_KQ_0/warp_size];
const int sumi = ggml_cuda_dp4a(v, u, 0);
#ifdef FP16_AVAILABLE
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
const half2 d5d8_m5s8 = K_q5_1[ib].dm * Q_ds[k_KQ_0/warp_size];
const half2 sumid5d8_m5s8scaled = d5d8_m5s8 * make_half2(sumi, 1.0f/QI8_1);
sum += (T) (__low2half(sumid5d8_m5s8scaled) + __high2half(sumid5d8_m5s8scaled));
} else
#endif // FP16_AVAILABLE
{
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float sumid5d8 = __low2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].x * sumi;
const float m5s8scaled = __high2float(K_q5_1[ib].dm)*Q_ds[k_KQ_0/warp_size].y / QI8_1;
sum += (T) (sumid5d8 + m5s8scaled);
}
}
return sum;
}
template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
GGML_UNUSED(Q_v);
T sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const int ib = k_KQ / QI8_0;
const int iqs = k_KQ % QI8_0;
const int v = get_int_b2(K_q8_0[ib].qs, iqs);
T Q_d;
if (std::is_same<T, half>::value) {
const half2 * Q_ds = (const half2 *) Q_ds_v;
Q_d = __low2half(Q_ds[k_KQ_0/warp_size]);
} else {
const float2 * Q_ds = (const float2 *) Q_ds_v;
Q_d = Q_ds[k_KQ_0/warp_size].x;
}
sum += vec_dot_q8_0_q8_1_impl<T, 1>(&v, &Q_q8[k_KQ_0/warp_size], K_q8_0[ib].d, Q_d);
}
return sum;
}
template <typename T, int D, int warp_size>
static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_f16(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) { const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds_v) {
const half2 * K_h2 = (const half2 *) K_c; const half2 * K_h2 = (const half2 *) K_c;
GGML_UNUSED(Q_q8); GGML_UNUSED(Q_q8);
GGML_UNUSED(Q_ds_v); GGML_UNUSED(Q_ds_v);
#ifdef FP16_AVAILABLE constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
if (std::is_same<T, half>::value) { constexpr int cpy_ne = cpy_nb / 4;
const half2 * Q_h2 = (const half2 *) Q_v;
half2 sum2 = make_half2(0.0f, 0.0f);
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) {
const int k_KQ = k_KQ_0 + threadIdx.x;
const half2 K_ik = K_h2[k_KQ];
sum2 += K_ik * Q_h2[k_KQ_0/warp_size];
}
return __low2half(sum2) + __high2half(sum2);
}
#endif // FP16_AVAILABLE
const float2 * Q_f2 = (const float2 *) Q_v;
float sum = 0.0f; float sum = 0.0f;
#pragma unroll #pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += warp_size) { for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += nthreads*cpy_ne) {
const int k_KQ = k_KQ_0 + threadIdx.x; half2 tmp[cpy_ne];
ggml_cuda_memcpy_1<sizeof(tmp)>(tmp, K_h2 + k_KQ_0 + (threadIdx.x % nthreads)*cpy_ne);
const half2 K_ik = K_h2[k_KQ]; #pragma unroll
sum += __low2float(K_ik) * Q_f2[k_KQ_0/warp_size].x; for (int k_KQ_1 = 0; k_KQ_1 < cpy_ne; ++k_KQ_1) {
sum += __high2float(K_ik) * Q_f2[k_KQ_0/warp_size].y; #ifdef FAST_FP16_AVAILABLE
ggml_cuda_mad(sum, tmp[k_KQ_1] , ((const half2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#else
ggml_cuda_mad(sum, __half22float2(tmp[k_KQ_1]), ((const float2 *) Q_v)[k_KQ_0/nthreads + k_KQ_1]);
#endif // FP16_AVAILABLE
}
} }
return sum; return sum;
} }
template <typename Tds> template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_0;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q4_0[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += __half2float(K_q4_0[ib].d) * (sumi*Q_ds.x - (8/QI8_1)*Q_ds.y);
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q4_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI4_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q4_1[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 K_dm = __half22float2(K_q4_1[ib].dm);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_0;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int), 2>(&v, K_q5_0[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
{
int vh;
ggml_cuda_memcpy_1<sizeof(int), 2>(&vh, K_q5_0[ib].qh);
vh >>= iqs8 * QI5_0;
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
}
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += __half2float(K_q5_0[ib].d) * (sumi*Q_ds.x - (16/QI8_1)*Q_ds.y);
}
return sum;
}
template<int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q5_1(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_1;
const int iqs4 = k_KQ % QI5_1;
const int iqs8 = k_KQ % QI8_1;
const int shift = k_KQ & (QI8_1/2);
int v;
ggml_cuda_memcpy_1<sizeof(int)>(&v, K_q5_1[ib].qs + sizeof(int)*iqs4);
v = (v >> shift) & 0x0F0F0F0F;
{
int vh;
ggml_cuda_memcpy_1<sizeof(int)>(&vh, K_q5_1[ib].qh);
vh >>= iqs8 * QI5_0;
v |= (vh << 4) & 0x00000010; // 0 -> 4
v |= (vh << 11) & 0x00001000; // 1 -> 12
v |= (vh << 18) & 0x00100000; // 2 -> 20
v |= (vh << 25) & 0x10000000; // 3 -> 28
}
const int u = Q_q8[k_KQ_0/nthreads];
const int sumi = ggml_cuda_dp4a(v, u, 0);
const float2 K_dm = __half22float2(K_q5_1[ib].dm);
const float2 Q_ds = ((const float2 *) Q_ds_v)[k_KQ_0/nthreads];
sum += K_dm.x*Q_ds.x*sumi + K_dm.y*Q_ds.y/QI8_1;
}
return sum;
}
template <int D, int nthreads>
static __device__ __forceinline__ float vec_dot_fattn_vec_KQ_q8_0(
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
GGML_UNUSED(Q_v);
float sum = 0.0f;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < int(D/sizeof(int)); k_KQ_0 += nthreads) {
const int k_KQ = k_KQ_0 + (nthreads == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads);
const int ib = k_KQ / QI8_0;
const int iqs = k_KQ % QI8_0;
int v;
ggml_cuda_memcpy_1<sizeof(v), 2>(&v, K_q8_0[ib].qs + 4*iqs);
const float2 * Q_ds = (const float2 *) Q_ds_v;
const float Q_d = Q_ds[k_KQ_0/nthreads].x;
sum += vec_dot_q8_0_q8_1_impl<float, 1>(&v, &Q_q8[k_KQ_0/nthreads], K_q8_0[ib].d, Q_d);
}
return sum;
}
template <typename Tds, int ni>
static __device__ __forceinline__ void quantize_q8_1_to_shared( static __device__ __forceinline__ void quantize_q8_1_to_shared(
const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) { const float * __restrict__ x, const float scale, int * __restrict__ yq32, void * __restrict__ yds) {
float vals[sizeof(int)] = {0.0f}; float vals[sizeof(int)] = {0.0f};
#pragma unroll #pragma unroll
for (int l = 0; l < int(sizeof(int)); ++l) { for (int l = 0; l < int(sizeof(int)); ++l) {
vals[l] = scale * x[4*threadIdx.x + l]; vals[l] = (ni == WARP_SIZE || threadIdx.x < ni) ? scale * x[4*threadIdx.x + l] : 0.0f;
} }
float amax = fabsf(vals[0]); float amax = fabsf(vals[0]);
@ -330,7 +284,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
} }
yq32[threadIdx.x] = q32; yq32[threadIdx.x] = q32;
if (threadIdx.x % QI8_1 == 0) { if (threadIdx.x % QI8_1 == 0 && (ni == WARP_SIZE || threadIdx.x < ni)) {
if (std::is_same<Tds, half2>::value) { if (std::is_same<Tds, half2>::value) {
((half2 *) yds)[threadIdx.x/QI8_1] = make_half2(d, sum); ((half2 *) yds)[threadIdx.x/QI8_1] = make_half2(d, sum);
} else { } else {
@ -339,167 +293,276 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
} }
} }
typedef half (*dequantize_1_f16_t)(const void *, const int64_t); typedef void (*dequantize_V_t)(const void *, void *, const int64_t);
typedef float (*dequantize_1_f32_t)(const void *, const int64_t);
template <typename T> template <typename T, int ne>
static __device__ __forceinline__ T dequantize_1_q4_0(const void * __restrict__ vx, const int64_t i) { static __device__ __forceinline__ void dequantize_V_f16(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
if constexpr (std::is_same_v<T, half>) {
ggml_cuda_memcpy_1<ne*sizeof(half)>(dst, (const half *) vx + i0);
} else if constexpr (std::is_same_v<T, float>) {
static_assert(ne % 2 == 0, "bad ne");
half2 tmp[ne/2];
ggml_cuda_memcpy_1<ne*sizeof(half)>(tmp, (const half *) vx + i0);
float2 * dst_f2 = (float2 *) dst;
#pragma unroll
for (int l = 0; l < ne/2; ++l) {
dst_f2[l] = __half22float2(tmp[l]);
}
} else {
static_assert(std::is_same_v<T, void>, "unsupported type");
}
}
template <typename T, int ne>
static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
const block_q4_0 * x = (const block_q4_0 *) vx; const block_q4_0 * x = (const block_q4_0 *) vx;
const int64_t ib = i / QK4_0; const int64_t ib = i0 / QK4_0;
const int iqs = i % (QK4_0/2); const int iqs = i0 % (QK4_0/2);
const int shift = (i % QK4_0) / (QK4_0/2); const int shift = (i0 % QK4_0) / (QK4_0/2);
const T d = x[ib].d; int q;
const int q0 = x[ib].qs[iqs]; static_assert(ne == 2 || ne == 4, "bad ne");
const int q = ((q0 >> (4*shift)) & 0x0F) - 8; ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
q >>= 4*shift;
q &= 0x0F0F0F0F;
q = __vsubss4(q, 0x08080808);
const int8_t * q8 = (const int8_t *) &q;
 #ifdef FP16_AVAILABLE
-    if (std::is_same<T, half>::value) {
-        return ((half) d)*((half) q);
-    }
-#endif // FP16_AVAILABLE
-
-    return ((float) d)*((float) q);
+    if constexpr (std::is_same_v<T, half>) {
+        const half2 d = __half2half2(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
+        }
+    } else
+#endif // FP16_AVAILABLE
+    if constexpr (std::is_same_v<T, float>) {
+        const float d = x[ib].d;
+
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            ((float *) dst)[l] = d * q8[l];
+        }
+    } else {
+        static_assert(std::is_same_v<T, void>, "bad type");
+    }
 }

-template <typename T>
-static __device__ __forceinline__ T dequantize_1_q4_1(const void * __restrict__ vx, const int64_t i) {
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q4_1 * x = (const block_q4_1 *) vx;

-    const int64_t ib = i / QK4_1;
-    const int iqs = i % (QK4_1/2);
-    const int shift = (i % QK4_1) / (QK4_1/2);
-
-    const half2 dm = x[ib].dm;
-    const int q0 = x[ib].qs[iqs];
-    const int q = ((q0 >> (4*shift)) & 0x0F);
+    const int64_t ib = i0 / QK4_1;
+    const int iqs = i0 % (QK4_1/2);
+    const int shift = (i0 % QK4_1) / (QK4_1/2);
+
+    int q;
+    static_assert(ne == 2 || ne == 4, "bad ne");
+    ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
+    q >>= 4*shift;
+    q &= 0x0F0F0F0F;
+    const int8_t * q8 = (const int8_t *) &q;

 #ifdef FP16_AVAILABLE
-    if (std::is_same<T, half>::value) {
-        return __low2half(dm)*((half) q) + __high2half(dm);
-    }
-#endif // FP16_AVAILABLE
-
-    return __low2float(dm)*((float) q) + __high2float(dm);
+    if constexpr (std::is_same_v<T, half>) {
+        const half2 dm = x[ib].dm;
+        const half2 d = __half2half2( __low2half(dm));
+        const half2 m = __half2half2(__high2half(dm));
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
+        }
+    } else
+#endif // FP16_AVAILABLE
+    if constexpr (std::is_same_v<T, float>) {
+        const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            ((float *) dst)[l] = dm.x * q8[l] + dm.y;
+        }
+    } else {
+        static_assert(std::is_same_v<T, void>, "bad type");
+    }
 }

-template <typename T>
-static __device__ __forceinline__ T dequantize_1_q5_0(const void * __restrict__ vx, const int64_t i) {
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q5_0 * x = (const block_q5_0 *) vx;

-    const int64_t ib = i / QK5_0;
-    const int idq = i % QK5_0;
-    const int iqs = i % (QK5_0/2);
-    const int shift = (i % QK5_0) / (QK5_0/2);
-
-    const T d = x[ib].d;
-    const int ql0 = x[ib].qs[iqs];
-    const int qh0 = get_int_b2(x[ib].qh, 0);
-    const int ql = ((ql0 >> (4*shift)) & 0x0F);
-    const int qh = ((qh0 >> idq) << 4) & 0x10;
-    const int q = (ql | qh) - 16;
+    const int64_t ib = i0 / QK5_0;
+    const int idq = i0 % QK5_0;
+    const int iqs = i0 % (QK5_0/2);
+    const int shift = (i0 % QK5_0) / (QK5_0/2);
+
+    int q;
+    static_assert(ne == 2 || ne == 4, "bad ne");
+    ggml_cuda_memcpy_1<ne, 2>(&q, x[ib].qs + iqs);
+    q >>= 4*shift;
+    q &= 0x0F0F0F0F;
+
+    {
+        int qh;
+        ggml_cuda_memcpy_1<ne, 2>(&qh, x[ib].qh);
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
+        }
+    }
+
+    q = __vsubss4(q, 0x10101010);
+    const int8_t * q8 = (const int8_t *) &q;

 #ifdef FP16_AVAILABLE
-    if (std::is_same<T, half>::value) {
-        return ((half) d)*((half) q);
-    }
-#endif // FP16_AVAILABLE
-
-    return ((float) d)*((float) q);
+    if constexpr (std::is_same_v<T, half>) {
+        const half2 d = __half2half2(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
+        }
+    } else
+#endif // FP16_AVAILABLE
+    if constexpr (std::is_same_v<T, float>) {
+        const float d = x[ib].d;
+
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            ((float *) dst)[l] = d * q8[l];
+        }
+    } else {
+        static_assert(std::is_same_v<T, void>, "bad type");
+    }
 }

-template <typename T>
-static __device__ __forceinline__ T dequantize_1_q5_1(const void * __restrict__ vx, const int64_t i) {
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q5_1 * x = (const block_q5_1 *) vx;

-    const int64_t ib = i / QK5_1;
-    const int idq = i % QK5_1;
-    const int iqs = i % (QK5_1/2);
-    const int shift = (i % QK5_1) / (QK5_1/2);
-
-    const half2 dm = x[ib].dm;
-    const int ql0 = x[ib].qs[iqs];
-    const int qh0 = get_int_b4(x[ib].qh, 0);
-    const int ql = ((ql0 >> (4*shift)) & 0x0F);
-    const int qh = ((qh0 >> idq) << 4) & 0x10;
-    const int q = (ql | qh);
+    const int64_t ib = i0 / QK5_1;
+    const int idq = i0 % QK5_1;
+    const int iqs = i0 % (QK5_1/2);
+    const int shift = (i0 % QK5_1) / (QK5_1/2);
+
+    int q;
+    static_assert(ne == 2 || ne == 4, "bad ne");
+    ggml_cuda_memcpy_1<ne>(&q, x[ib].qs + iqs);
+    q >>= 4*shift;
+    q &= 0x0F0F0F0F;
+
+    {
+        int qh;
+        ggml_cuda_memcpy_1<ne>(&qh, x[ib].qh);
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            q |= ((qh >> (idq + l)) & 0x00000001) << (8*l + 4);
+        }
+    }
+
+    const int8_t * q8 = (const int8_t *) &q;

 #ifdef FP16_AVAILABLE
-    if (std::is_same<T, half>::value) {
-        return __low2half(dm)*((half) q) + __high2half(dm);
-    }
-#endif // FP16_AVAILABLE
-
-    return __low2float(dm)*((float) q) + __high2float(dm);
+    if constexpr (std::is_same_v<T, half>) {
+        const half2 dm = x[ib].dm;
+        const half2 d = __half2half2( __low2half(dm));
+        const half2 m = __half2half2(__high2half(dm));
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
+        }
+    } else
+#endif // FP16_AVAILABLE
+    if constexpr (std::is_same_v<T, float>) {
+        const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            ((float *) dst)[l] = dm.x * q8[l] + dm.y;
+        }
+    } else {
+        static_assert(std::is_same_v<T, void>, "bad type");
+    }
 }

-template <typename T>
-static __device__ __forceinline__ T dequantize_1_q8_0(const void * __restrict__ vx, const int64_t i) {
+template <typename T, int ne>
+static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict__ vx, void * __restrict__ dst, const int64_t i0) {
     const block_q8_0 * x = (const block_q8_0 *) vx;

-    const int64_t ib = i / QK8_0;
-    const int iqs = i % QK8_0;
-
-    const T d = x[ib].d;
-    const int q = x[ib].qs[iqs];
+    const int64_t ib = i0 / QK8_0;
+    const int iqs = i0 % QK8_0;
+
+    static_assert(ne % 2 == 0, "bad ne");
+    int8_t qs[ne];
+    ggml_cuda_memcpy_1<ne, 2>(qs, x[ib].qs + iqs);

 #ifdef FP16_AVAILABLE
-    if (std::is_same<T, half>::value) {
-        return ((half) d)*((half) q);
-    }
-#endif // FP16_AVAILABLE
-
-    return ((float) d)*((float) q);
+    if constexpr (std::is_same<T, half>::value) {
+        const half2 d = __half2half2(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]);
+        }
+    } else
+#endif // FP16_AVAILABLE
+    if constexpr (std::is_same<T, float>::value) {
+        const float d = x[ib].d;
+
+#pragma unroll
+        for (int l = 0; l < ne; ++l) {
+            ((float *) dst)[l] = d * qs[l];
+        }
+    } else {
+        static_assert(std::is_same_v<T, void>, "unsupported type");
+    }
 }

-template <typename T>
-static __device__ __forceinline__ T dequantize_1_f16(const void * __restrict__ vx, const int64_t i) {
-    const half * x = (const half *) vx;
-
-    return x[i];
+template <ggml_type type_K, int D, int nthreads>
+constexpr __device__ vec_dot_KQ_t get_vec_dot_KQ() {
+    if constexpr (type_K == GGML_TYPE_F16) {
+        return vec_dot_fattn_vec_KQ_f16<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_Q4_0) {
+        return vec_dot_fattn_vec_KQ_q4_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_Q4_1) {
+        return vec_dot_fattn_vec_KQ_q4_1<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_Q5_0) {
+        return vec_dot_fattn_vec_KQ_q5_0<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_Q5_1) {
+        return vec_dot_fattn_vec_KQ_q5_1<D, nthreads>;
+    } else if constexpr (type_K == GGML_TYPE_Q8_0) {
+        return vec_dot_fattn_vec_KQ_q8_0<D, nthreads>;
+    } else {
+        static_assert(type_K == -1, "bad type");
+        return nullptr;
+    }
 }

-template <int D, int warp_size = WARP_SIZE>
-constexpr __device__ vec_dot_KQ_f16_t get_vec_dot_KQ_f16(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<half, D, warp_size> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<half, D, warp_size> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<half, D, warp_size> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<half, D, warp_size> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<half, D, warp_size> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<half, D, warp_size> :
-        nullptr;
-}
-
-template <int D, int warp_size = WARP_SIZE>
-constexpr __device__ vec_dot_KQ_f32_t get_vec_dot_KQ_f32(ggml_type type_K) {
-    return type_K == GGML_TYPE_Q4_0 ? vec_dot_fattn_vec_KQ_q4_0<float, D, warp_size> :
-        type_K == GGML_TYPE_Q4_1 ? vec_dot_fattn_vec_KQ_q4_1<float, D, warp_size> :
-        type_K == GGML_TYPE_Q5_0 ? vec_dot_fattn_vec_KQ_q5_0<float, D, warp_size> :
-        type_K == GGML_TYPE_Q5_1 ? vec_dot_fattn_vec_KQ_q5_1<float, D, warp_size> :
-        type_K == GGML_TYPE_Q8_0 ? vec_dot_fattn_vec_KQ_q8_0<float, D, warp_size> :
-        type_K == GGML_TYPE_F16 ? vec_dot_fattn_vec_KQ_f16<float, D, warp_size> :
-        nullptr;
-}
-
-constexpr __device__ dequantize_1_f16_t get_dequantize_1_f16(ggml_type type_V) {
-    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<half> :
-        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<half> :
-        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<half> :
-        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<half> :
-        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<half> :
-        type_V == GGML_TYPE_F16 ? dequantize_1_f16<half> :
-        nullptr;
-}
-
-constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
-    return type_V == GGML_TYPE_Q4_0 ? dequantize_1_q4_0<float> :
-        type_V == GGML_TYPE_Q4_1 ? dequantize_1_q4_1<float> :
-        type_V == GGML_TYPE_Q5_0 ? dequantize_1_q5_0<float> :
-        type_V == GGML_TYPE_Q5_1 ? dequantize_1_q5_1<float> :
-        type_V == GGML_TYPE_Q8_0 ? dequantize_1_q8_0<float> :
-        type_V == GGML_TYPE_F16 ? dequantize_1_f16<float> :
-        nullptr;
+template <ggml_type type_V, typename T, int ne>
+constexpr __device__ dequantize_V_t get_dequantize_V() {
+    if constexpr (type_V == GGML_TYPE_F16) {
+        return dequantize_V_f16<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_Q4_0) {
+        return dequantize_V_q4_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_Q4_1) {
+        return dequantize_V_q4_1<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_Q5_0) {
+        return dequantize_V_q5_0<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_Q5_1) {
+        return dequantize_V_q5_1<T, ne>;
+    } else if constexpr (type_V == GGML_TYPE_Q8_0) {
+        return dequantize_V_q8_0<T, ne>;
+    } else {
+        static_assert(type_V == -1, "bad type");
+        return nullptr;
+    }
 }

 template <int ncols1>
@@ -870,7 +933,7 @@ void launch_fattn(
         const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);

         // Stop trying configurations with more waves if we already have good efficiency to avoid excessive overhead.
-        if (efficiency_percent_best >= 90 && nwaves > nwaves_best) {
+        if (efficiency_percent_best >= 95 && nwaves > nwaves_best) {
             break;
         }
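The second hunk above only raises the early-exit threshold of the wave-efficiency search in launch_fattn from 90 % to 95 %. As a rough, self-contained illustration of what that heuristic trades off, the host-side sketch below runs the same arithmetic on made-up numbers; nsm, nblocks_total and the parallel_blocks candidates are assumptions for the example, not values taken from the actual implementation.

#include <cstdio>

int main() {
    const int nsm           = 108; // hypothetical number of streaming multiprocessors
    const int nblocks_total = 900; // hypothetical number of CUDA blocks to schedule

    int parallel_blocks_best    = 1;
    int efficiency_percent_best = 0;
    int nwaves_best             = 1 << 30;

    for (int parallel_blocks = 1; parallel_blocks <= 4; ++parallel_blocks) {
        const int nblocks            = nblocks_total * parallel_blocks;
        const int blocks_per_wave    = nsm; // simplification: one block per SM per wave
        const int nwaves             = (nblocks + blocks_per_wave - 1) / blocks_per_wave;
        const int efficiency_percent = 100 * nblocks / (nwaves*blocks_per_wave);

        // Stop trying configurations with more waves once efficiency is already good:
        if (efficiency_percent_best >= 95 && nwaves > nwaves_best) {
            break;
        }
        if (efficiency_percent > efficiency_percent_best) {
            efficiency_percent_best = efficiency_percent;
            parallel_blocks_best    = parallel_blocks;
            nwaves_best             = nwaves;
        }
        printf("parallel_blocks=%d nwaves=%d efficiency=%d%%\n", parallel_blocks, nwaves, efficiency_percent);
    }
    printf("chosen: parallel_blocks=%d (%d%%)\n", parallel_blocks_best, efficiency_percent_best);
    return 0;
}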


@@ -1,495 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that cannot be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
half2 * KQ2 = (half2 *) KQ;
half kqmax[ncols];
half kqsum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
kqsum[j] = 0.0f;
}
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ half maskh_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to half2 (f16 K) or q8_1 (quantized K) and store in registers:
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
half2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
if (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 2 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
tmp_q_i32[i] = 0;
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_half2(0.0f, 0.0f);
}
continue;
}
const float * Q_f = (const float *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
quantize_q8_1_to_shared<half2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
half2 * tmp_q_ds = (half2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < D/sizeof(int); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
__syncthreads();
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*D * nb11;
V += blockIdx.y*D * nb21;
maskh += blockIdx.y*D;
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
// Increment pointers after each loop:
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskh_shared[j*D + tid] = slopeh*maskh[j*ne11 + tid];
}
__syncthreads();
}
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half sum = vec_dot_KQ(K + i_KQ*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum((float)sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += maskh_shared[j*D + i_KQ];
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = dequantize_1_v(V + (k0 + 0)*nb21, tid);
reinterpret_cast<half&>(V_k.y) = dequantize_1_v(V + (k0 + 1)*nb21, tid);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
if (sinksf && blockIdx.y == 0) {
const half sink = __float2half(sinksf[head]);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(sink - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale;
if (tid == 0) {
kqsum[j] += val;
}
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum((float)kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (gridDim.y == 1) {
dst_val /= kqsum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
}
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \
template void ggml_cuda_flash_attn_ext_vec_f16_case \
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
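The extern DECL_FATTN_VEC_F16_CASE block above (and its F32 twin below) relies on explicit template instantiation: each extern line promises that the matching instantiation is compiled in a separate translation unit, which keeps the many K/V type combinations out of a single huge object file. A toy, self-contained version of the same pattern is sketched here; kernel_case and DECL_KERNEL_CASE are invented names for the illustration, not identifiers from the repository.

#include <cstdio>

// Primary template; in the real code the definition lives in a header included by many .cu files.
template <int D, typename T>
void kernel_case(const T * x) {
    printf("kernel_case<D=%d>: %f\n", D, (double) x[0]);
}

// Same macro trick as DECL_FATTN_VEC_F16_CASE: the "extern" form declares that the
// instantiation exists in some other translation unit, the plain form actually emits it.
#define DECL_KERNEL_CASE(D, T) \
    template void kernel_case<D, T>(const T * x)

extern DECL_KERNEL_CASE(64, float); // normally satisfied by a dedicated source file
DECL_KERNEL_CASE(64, float);        // emitted here so this sketch links on its own

int main() {
    const float x = 1.5f;
    kernel_case<64, float>(&x);
    return 0;
}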


@@ -1,486 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that cannot be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#ifndef GGML_USE_HIP
__launch_bounds__(D, 1)
#endif // GGML_USE_HIP
static __global__ void flash_attn_vec_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
if (ncols > 1) {
NO_DEVICE_CODE;
return;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ float KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -FLT_MAX/2.0f;
}
float kqmax[ncols];
float kqsum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -FLT_MAX/2.0f;
kqsum[j] = 0.0f;
}
__shared__ float kqmax_shared[ncols][WARP_SIZE];
__shared__ float kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__shared__ float maskf_shared[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = 0.0f;
}
__syncthreads();
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
float2 Q_f2[ncols][D/(2*WARP_SIZE)];
int Q_i32[ncols][D/(sizeof(int)*QK8_1) == 0 ? 1 : D/(sizeof(int)*QK8_1)];
float2 Q_ds[ncols][D/QK8_1 == 0 ? 1 : D/QK8_1];
if (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 2 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
tmp_q_i32[i] = 0;
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
}
continue;
}
const float * Q_f = (const float *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
quantize_q8_1_to_shared<float2>(Q_f + 4*i0, scale, tmp_q_i32, tmp_q_ds);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_i32[j][i0/WARP_SIZE] = tmp_q_i32[i];
Q_ds[j][i0/WARP_SIZE] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_f2_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
Q_f2[j][i0/WARP_SIZE].x *= scale;
Q_f2[j][i0/WARP_SIZE].y *= scale;
}
}
}
float VKQ[ncols] = {0.0f};
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*D * nb11;
V += blockIdx.y*D * nb21;
maskh += blockIdx.y*D;
for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*D,
// Increment pointers after each loop:
K += gridDim.y*D*nb11, V += gridDim.y*D*nb21, maskh += gridDim.y*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
if (mask) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
maskf_shared[j*D + tid] = slope*__half2float(maskh[j*ne11 + tid]);
}
__syncthreads();
}
float kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
break;
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += maskf_shared[j*D + i_KQ];
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_new_arr[j];
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
#pragma unroll
for (int k = 0; k < D; ++k) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
break;
}
const float V_ki = dequantize_1_v(V + k*nb21, tid);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_ki*KQ[j*D + k];
}
}
__syncthreads();
}
if (sinksf && blockIdx.y == 0) {
const float sink = sinksf[head];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = fmaxf(kqmax[j], sink);
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const float val = expf(sink - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale;
if (tid == 0) {
kqsum[j] += val;
}
VKQ[j] *= KQ_max_scale;
}
__syncthreads();
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 2 && ic0 + j_VKQ >= ne01) {
break;
}
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
float dst_val = VKQ[j_VKQ];
if (gridDim.y == 1) {
dst_val /= kqsum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + tid] = dst_val;
}
if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1 || GGML_CUDA_CC_IS_NVIDIA(cc)) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \
template void ggml_cuda_flash_attn_ext_vec_f32_case \
<D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
extern DECL_FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);
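Both deleted kernels above, and the merged kernel that follows, use the same online-softmax bookkeeping: whenever a larger KQ value appears, the running exponential sum and the V accumulator are rescaled by exp(old_max - new_max), and the normalization by the sum happens only once at the end (dst_val /= kqsum). The scalar sketch below replays that update rule on made-up scores and values to show why the single final division is enough.

#include <cmath>
#include <cstdio>

int main() {
    // Hypothetical attention scores (KQ) and values (V) for a single query/head:
    const float kq[5] = {0.5f, 2.0f, -1.0f, 3.5f, 1.0f};
    const float v [5] = {1.0f, 0.5f,  2.0f, -1.0f, 0.0f};

    float kq_max = -INFINITY; // running maximum of the scores seen so far
    float kq_sum = 0.0f;      // running sum of exp(kq - kq_max)
    float vkq    = 0.0f;      // running sum of exp(kq - kq_max) * v

    for (int i = 0; i < 5; ++i) {
        const float kq_max_new = fmaxf(kq_max, kq[i]);
        const float scale      = expf(kq_max - kq_max_new); // rescale the previous partial results
        const float val        = expf(kq[i] - kq_max_new);

        kq_sum = kq_sum*scale + val;
        vkq    = vkq   *scale + val*v[i];
        kq_max = kq_max_new;
    }

    // Normalize once at the end, exactly like dst_val /= kqsum in the kernels:
    printf("softmax-weighted value = %f\n", vkq / kq_sum);
    return 0;
}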


@@ -0,0 +1,593 @@
#include "common.cuh"
#include "fattn-common.cuh"
static int ggml_cuda_fattn_vec_get_nthreads_host(const int cc) {
return 128;
GGML_UNUSED(cc);
}
static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
return 128;
}
// Currently llvm with the amdgcn target does not support unrolling loops
// that contain a break that cannot be resolved at compile time.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
__launch_bounds__(ggml_cuda_fattn_vec_get_nthreads_device(), 1)
static __global__ void flash_attn_ext_vec(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
constexpr int cpy_ne = cpy_nb / 4;
#ifdef GGML_USE_HIP
#ifdef RDNA
constexpr int nthreads_KQ_q = 2;
#else
constexpr int nthreads_KQ_q = 4;
#endif // RDNA
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
#else
constexpr int nthreads_KQ_q = (D/4 < 32 ? D/4 : 32);
constexpr int nthreads_V_q = (D/4 < 32 ? D/4 : 32);
#endif // GGML_USE_HIP
constexpr int nthreads = ggml_cuda_fattn_vec_get_nthreads_device();
constexpr int nthreads_KQ = type_K == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_KQ_q;
constexpr int nthreads_V = type_V == GGML_TYPE_F16 ? 128 / cpy_nb : nthreads_V_q;
static_assert(WARP_SIZE % nthreads_KQ == 0, "bad nthreads_K");
static_assert(WARP_SIZE % nthreads_V == 0, "bad nthreads_V");
constexpr int V_rows_per_thread = type_V == GGML_TYPE_F16 ? 2*cpy_ne : 4;
constexpr int V_cols_per_iter = WARP_SIZE / nthreads_V;
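// Work partitioning within a warp: the dot product of Q with one K row is split across
// nthreads_KQ lanes and combined with warp_reduce_sum<nthreads_KQ>, so each warp covers
// WARP_SIZE K rows per KV tile. For the V accumulation each KV position is split across
// nthreads_V lanes, every lane dequantizing V_rows_per_thread consecutive V values per
// dequantize_V call, so a warp advances V_cols_per_iter = WARP_SIZE/nthreads_V positions
// per iteration of the inner loop.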
constexpr vec_dot_KQ_t vec_dot_KQ = get_vec_dot_KQ<type_K, D, nthreads_KQ>();
constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
#ifdef FAST_FP16_AVAILABLE
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, half, V_rows_per_thread>();
#else
constexpr dequantize_V_t dequantize_V = get_dequantize_V<type_V, float, V_rows_per_thread>();
#endif // FAST_FP16_AVAILABLE
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
Q += nb03*sequence + nb02* head + nb01*ic0;
K += nb13*sequence + nb12*(head / gqa_ratio);
V += nb23*sequence + nb22*(head / gqa_ratio);
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = nthreads / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < nthreads);
constexpr int ne_KQ = ncols*D;
constexpr int ne_combine = nwarps*V_cols_per_iter*D;
#ifdef FAST_FP16_AVAILABLE
half2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ half KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#else
float2 VKQ[ncols][(D/2)/nthreads_V] = {{{0.0f, 0.0f}}};
__shared__ float KQ[ne_KQ > ne_combine ? ne_KQ : ne_combine];
#endif // FAST_FP16_AVAILABLE
float KQ_max[ncols];
float KQ_sum[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_max[j] = -FLT_MAX/2.0f;
KQ_sum[j] = 0.0f;
}
// Convert Q to float2 (f16 K) or q8_1 (quantized K) and store in registers:
#ifdef FAST_FP16_AVAILABLE
half2 Q_reg[ncols][(D/2)/nthreads_KQ]; // Will be initialized completely.
#else
float2 Q_reg[ncols][(D/2)/nthreads_KQ] = {{{0.0f, 0.0f}}}; // May be only partially initialized.
#endif // FAST_FP16_AVAILABLE
int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)];
if constexpr (Q_q8_1) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
// Reuse KQ as temporary storage for converting Q to q8_1:
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
// Set memory to zero if out of bounds:
if (ncols > 1 && ic0 + j >= ne01) {
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE <= D/sizeof(int) || i < D/sizeof(int)) {
tmp_q_i32[i] = 0;
}
}
if (threadIdx.x < D/QK8_1) {
tmp_q_ds[threadIdx.x] = make_float2(0.0f, 0.0f);
}
} else {
const float * Q_f = (const float *) (Q + j*nb01);
constexpr int nthreads_quantize = D/sizeof(int) < WARP_SIZE ? D/sizeof(int) : WARP_SIZE;
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_quantize) {
quantize_q8_1_to_shared<float2, nthreads_quantize>
(Q_f + i0*sizeof(int), scale, tmp_q_i32 + i0, tmp_q_ds + i0/QI8_1);
}
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
int * tmp_q_i32 = (int *) &KQ[j*D];
float2 * tmp_q_ds = (float2 *) (tmp_q_i32 + D/sizeof(int));
#pragma unroll
for (int i0 = 0; i0 < int(D/sizeof(int)); i0 += nthreads_KQ) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ);
Q_i32[j][i0/nthreads_KQ] = tmp_q_i32[i];
Q_ds[j][i0/nthreads_KQ] = tmp_q_ds[i/QI8_1];
}
}
__syncthreads();
} else {
#ifdef FAST_FP16_AVAILABLE
const half2 scale_h2 = make_half2(scale, scale);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
float2 tmp[cpy_ne] = {{0.0f, 0.0f}};
if (ncols == 1 || ic0 + j < ne01) {
ggml_cuda_memcpy_1<cpy_nb>(tmp, &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(tmp + cpy_ne/2, &Q_j[i + cpy_ne/2]);
}
#pragma unroll
for (int i1 = 0; i1 < cpy_ne; ++i1) {
Q_reg[j][i0/nthreads_KQ + i1] = make_half2(tmp[i1].x, tmp[i1].y);
}
}
#pragma unroll
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
Q_reg[j][k] *= scale_h2;
}
}
#else
#pragma unroll
for (int j = 0; j < ncols; ++j) {
const float2 * Q_j = (const float2 *) (Q + j*nb01);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += nthreads_KQ*cpy_ne) {
const int i = i0 + (nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ)*cpy_ne;
if (ncols == 1 || ic0 + j < ne01) {
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ], &Q_j[i]);
ggml_cuda_memcpy_1<cpy_nb>(&Q_reg[j][i0/nthreads_KQ + cpy_ne/2], &Q_j[i + cpy_ne/2]);
}
}
#pragma unroll
for (int k = 0; k < (D/2)/nthreads_KQ; ++k) {
Q_reg[j][k].x *= scale;
Q_reg[j][k].y *= scale;
}
}
#endif // FAST_FP16_AVAILABLE
}
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
K += blockIdx.y*nthreads * nb11;
V += blockIdx.y*nthreads * nb21;
maskh += blockIdx.y*nthreads;
for (int k_VKQ_0 = blockIdx.y*nthreads; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*nthreads,
// Increment pointers after each loop:
K += gridDim.y*nthreads*nb11, V += gridDim.y*nthreads*nb21, maskh += gridDim.y*nthreads) {
// Calculate KQ tile and keep track of new maximum KQ values:
float KQ_reg[ncols]; // KQ in registers.
float KQ_max_new[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_max_new[j] = KQ_max[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < nthreads_KQ; ++i_KQ_0) {
const int i_KQ = threadIdx.y*WARP_SIZE + (nthreads_KQ == WARP_SIZE ? 0 : (threadIdx.x & ~(nthreads_KQ-1))) + i_KQ_0;
#pragma unroll
for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum<nthreads_KQ>(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
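// Softcapping keeps each attention logit inside (-logit_softcap, +logit_softcap)
// before the mask is added and the running maximum is updated.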
if (mask) {
sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
}
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == i_KQ_0) {
KQ_reg[j] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int offset = nthreads_KQ; offset < WARP_SIZE; offset <<= 1) {
KQ_max_new[j] = fmaxf(KQ_max_new[j], __shfl_xor_sync(0xFFFFFFFF, KQ_max_new[j], offset, WARP_SIZE));
}
const float KQ_max_scale = expf(KQ_max[j] - KQ_max_new[j]);
KQ_max[j] = KQ_max_new[j];
KQ_reg[j] = expf(KQ_reg[j] - KQ_max[j]);
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + KQ_reg[j];
KQ[j*nthreads + tid] = KQ_reg[j];
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
#ifndef GGML_USE_HIP
__syncwarp();
#endif // GGML_USE_HIP
#pragma unroll
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
#ifdef FAST_FP16_AVAILABLE
half2 KQ_k[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_k[j] = __half2half2(KQ[j*nthreads + k]);
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
half2 tmp[V_rows_per_thread/2];
dequantize_V(V + k*nb21, tmp,
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
#pragma unroll
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1] += tmp[i_VKQ_1]*KQ_k[j];
}
}
}
#else
float KQ_k[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ_k[j] = KQ[j*nthreads + k];
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
float2 tmp[V_rows_per_thread/2];
dequantize_V(V + k*nb21, tmp,
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
#pragma unroll
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].x += tmp[i_VKQ_1].x*KQ_k[j];
VKQ[j][i_VKQ_0/nthreads_V + i_VKQ_1].y += tmp[i_VKQ_1].y*KQ_k[j];
}
}
}
#endif // FAST_FP16_AVAILABLE
}
}
if (sinks && blockIdx.y == 0) {
const float sink = ((const float *) sinks)[head];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > ncols && j >= ncols) {
break;
}
const float kqmax_new_j = fmaxf(sink, KQ_max[j]);
const float KQ_max_scale = expf(KQ_max[j] - kqmax_new_j);
KQ_max[j] = kqmax_new_j;
KQ_sum[j] = KQ_sum[j]*KQ_max_scale + (threadIdx.x == 0 ? expf(sink - KQ_max[j]) : 0.0f);
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j][i_VKQ_0/nthreads_V].x *= KQ_max_scale;
VKQ[j][i_VKQ_0/nthreads_V].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
}
__shared__ float KQ_max_shared[ncols][WARP_SIZE];
__shared__ float KQ_sum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
KQ_max_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
KQ_sum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.x == 0) {
KQ_max_shared[j][threadIdx.y] = KQ_max[j];
}
}
__syncthreads();
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
if (ncols > 1 && ic0 + j_VKQ >= ne01) {
break;
}
float kqmax_new = KQ_max_shared[j_VKQ][threadIdx.x];
kqmax_new = warp_reduce_max(kqmax_new);
const float kqmax_scale = expf(KQ_max[j_VKQ] - kqmax_new);
KQ_max[j_VKQ] = kqmax_new;
#ifdef FAST_FP16_AVAILABLE
half2 * VKQ_tmp = (half2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
const half2 kqmax_scale_h2 = make_half2(kqmax_scale, kqmax_scale);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j_VKQ][i_VKQ_0/nthreads_V] *= kqmax_scale_h2;
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
ggml_cuda_memcpy_1<V_rows_per_thread*sizeof(half)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
}
#else
float2 * VKQ_tmp = (float2 *) KQ + threadIdx.y*(V_cols_per_iter*D/2)
+ (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V)*(D/2);
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V) {
VKQ[j_VKQ][i_VKQ_0/nthreads_V].x *= kqmax_scale;
VKQ[j_VKQ][i_VKQ_0/nthreads_V].y *= kqmax_scale;
}
#pragma unroll
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
const int i_VKQ = i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*(V_rows_per_thread/2);
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ, &VKQ[j_VKQ][i_VKQ_0/nthreads_V]);
ggml_cuda_memcpy_1<V_rows_per_thread/2*sizeof(float)>(VKQ_tmp + i_VKQ + V_rows_per_thread/4, &VKQ[j_VKQ][i_VKQ_0/nthreads_V + V_rows_per_thread/4]);
}
#endif // FAST_FP16_AVAILABLE
KQ_sum[j_VKQ] *= kqmax_scale;
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
if (threadIdx.x == 0) {
KQ_sum_shared[j_VKQ][threadIdx.y] = KQ_sum[j_VKQ];
}
__syncthreads();
if (nthreads <= D || tid < D) {
KQ_sum[j_VKQ] = KQ_sum_shared[j_VKQ][threadIdx.x];
KQ_sum[j_VKQ] = warp_reduce_sum(KQ_sum[j_VKQ]);
#pragma unroll
for (int i0 = 0; i0 < D; i0 += nthreads) {
float dst_val = 0;
#pragma unroll
for (int w = 0; w < nwarps; ++w) {
#pragma unroll
for (int v = 0; v < V_cols_per_iter; ++v) {
dst_val += float(KQ[w*V_cols_per_iter*D + v*D + i0 + tid]);
}
}
if (gridDim.y == 1) {
dst_val /= KQ_sum[j_VKQ];
}
dst[(((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y)*D + i0 + tid] = dst_val;
}
}
if (j_VKQ < ncols-1) {
__syncthreads();
}
}
if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < ne01)) {
dst_meta[((sequence*ne01 + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]);
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int nthreads = ggml_cuda_fattn_vec_get_nthreads_host(cc);
const int nwarps = nthreads / WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_ext_vec<D, cols_per_block, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = false;
constexpr bool need_f16_V = false;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 2;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_CASE(D, type_K, type_V) \
    template void ggml_cuda_flash_attn_ext_vec_case \
    <D, type_K, type_V>(ggml_backend_cuda_context & ctx, ggml_tensor * dst) \

#define EXTERN_DECL_FATTN_VEC_CASES(D, type_K) \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_F16);  \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_0); \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q4_1); \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_0); \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q5_1); \
    extern DECL_FATTN_VEC_CASE(D, type_K, GGML_TYPE_Q8_0); \

EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES( 64, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(128, GGML_TYPE_Q8_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_F16)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q4_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_0)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q5_1)
EXTERN_DECL_FATTN_VEC_CASES(256, GGML_TYPE_Q8_0)
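
For reference, one expansion of the declaration macros above (illustrative; this is just what the preprocessor produces — the matching definitions are presumably emitted by DECL_FATTN_VEC_CASE in the regenerated per-type .cu files):

// EXTERN_DECL_FATTN_VEC_CASES(64, GGML_TYPE_F16) expands to six extern explicit-instantiation
// declarations, one per supported V type, e.g.:
extern template void ggml_cuda_flash_attn_ext_vec_case< 64, GGML_TYPE_F16, GGML_TYPE_F16>(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
extern template void ggml_cuda_flash_attn_ext_vec_case< 64, GGML_TYPE_F16, GGML_TYPE_Q4_0>(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
// ... and likewise for GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1 and GGML_TYPE_Q8_0.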


@ -2,8 +2,7 @@
 #include "fattn-common.cuh"
 #include "fattn-mma-f16.cuh"
 #include "fattn-tile.cuh"
-#include "fattn-vec-f16.cuh"
-#include "fattn-vec-f32.cuh"
+#include "fattn-vec.cuh"
 #include "fattn-wmma-f16.cuh"
 #include "fattn.cuh"
@ -117,151 +116,68 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
     }
 }
-#define FATTN_VEC_F16_CASE(D, type_K, type_V) \
+#define FATTN_VEC_CASE(D, type_K, type_V) \
     if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
-        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
+        ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
         return; \
     } \
-static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+#define FATTN_VEC_CASES_ALL_D(type_K, type_V) \
+    FATTN_VEC_CASE( 64, type_K, type_V) \
+    FATTN_VEC_CASE(128, type_K, type_V) \
+    FATTN_VEC_CASE(256, type_K, type_V) \
+
+static void ggml_cuda_flash_attn_ext_vec(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 #ifdef GGML_CUDA_FA_ALL_QUANTS
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16 )
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
 #else
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
-#endif // GGML_CUDA_FA_ALL_QUANTS
-    GGML_ABORT("fatal error");
-}
-#define FATTN_VEC_F32_CASE(D, type_K, type_V) \
-    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
-        ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
-        return; \
-    } \
-static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[0];
-    ggml_tensor * K = dst->src[1];
-    ggml_tensor * V = dst->src[2];
-#ifdef GGML_CUDA_FA_ALL_QUANTS
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
-#else
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
-    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
-    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_F16, GGML_TYPE_F16)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
+    FATTN_VEC_CASES_ALL_D(GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
 #endif // GGML_CUDA_FA_ALL_QUANTS
     GGML_ABORT("fatal error");
@ -271,8 +187,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
 enum best_fattn_kernel {
     BEST_FATTN_KERNEL_NONE = 0,
     BEST_FATTN_KERNEL_TILE = 200,
-    BEST_FATTN_KERNEL_VEC_F32 = 100,
-    BEST_FATTN_KERNEL_VEC_F16 = 110,
+    BEST_FATTN_KERNEL_VEC = 100,
     BEST_FATTN_KERNEL_WMMA_F16 = 300,
     BEST_FATTN_KERNEL_MMA_F16 = 400,
 };
@ -283,7 +198,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     return BEST_FATTN_KERNEL_NONE;
 #endif// FLASH_ATTN_AVAILABLE
-    const ggml_tensor * KQV = dst;
     const ggml_tensor * Q = dst->src[0];
     const ggml_tensor * K = dst->src[1];
     const ggml_tensor * V = dst->src[2];
@ -293,8 +207,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
     const int cc = ggml_cuda_info().devices[device].cc;
-    const int warp_size = ggml_cuda_info().devices[device].warp_size;
-    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
     switch (K->ne[0]) {
         case 64:
@ -343,31 +255,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
 #endif // GGML_CUDA_FA_ALL_QUANTS
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
-#ifdef GGML_CUDA_FA_ALL_QUANTS
-            if (K->ne[0] != 128 && K->ne[0] != 64) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
-#else
-            if (K->ne[0] != 128) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
-#endif // GGML_CUDA_FA_ALL_QUANTS
-            break;
-        default:
-            return BEST_FATTN_KERNEL_NONE;
-    }
-    switch (V->type) {
-        case GGML_TYPE_F16:
-            break;
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q8_0:
-            if (K->ne[0] != 128) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
             break;
         default:
             return BEST_FATTN_KERNEL_NONE;
@ -377,30 +264,39 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         return BEST_FATTN_KERNEL_NONE;
     }
-    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
+    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0;
     // If Turing tensor cores available, use them except for some cases with batch size 1:
     if (turing_mma_available(cc)) {
-        const bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask; // The mma-based kernels have GQA-specific optimizations
-        const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
-        const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (gqa_ratio > 4 && K->ne[1] >= 8192);
-        const bool mma_faster_for_bs1 = gqa_opt_applies && !mma_needs_data_conversion &&
-            (cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
-        if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
-            if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
-                return BEST_FATTN_KERNEL_VEC_F16;
-            }
-            return BEST_FATTN_KERNEL_VEC_F32;
-        }
-        return BEST_FATTN_KERNEL_MMA_F16;
+        best_fattn_kernel best = BEST_FATTN_KERNEL_MMA_F16;
+        if (can_use_vector_kernel) {
+            if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
+                if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
+                    best = BEST_FATTN_KERNEL_VEC;
+                }
+            } else {
+                if (cc >= GGML_CUDA_CC_ADA_LOVELACE) {
+                    if (Q->ne[1] <= 2) {
+                        best = BEST_FATTN_KERNEL_VEC;
+                    }
+                } else {
+                    if (Q->ne[1] == 1) {
+                        best = BEST_FATTN_KERNEL_VEC;
+                    }
+                }
+            }
+            if ((gqa_ratio % 2 != 0 || !mask) && Q->ne[1] == 1) {
+                best = BEST_FATTN_KERNEL_VEC; // GQA-specific optimizations in the mma kernel do not apply.
+            }
+        }
+        return best;
     }
-    // Use kernels specializes for small batch sizes if possible:
+    // Use kernels specialized for small batch sizes if possible:
     if (Q->ne[1] <= 8 && can_use_vector_kernel) {
-        if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
-            return BEST_FATTN_KERNEL_VEC_F16;
-        }
-        return BEST_FATTN_KERNEL_VEC_F32;
+        return BEST_FATTN_KERNEL_VEC;
     }
     // For large batch sizes, use the WMMA kernel if possible:
@ -420,11 +316,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
         case BEST_FATTN_KERNEL_TILE:
             ggml_cuda_flash_attn_ext_tile(ctx, dst);
             break;
-        case BEST_FATTN_KERNEL_VEC_F32:
-            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
-            break;
-        case BEST_FATTN_KERNEL_VEC_F16:
-            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
+        case BEST_FATTN_KERNEL_VEC:
+            ggml_cuda_flash_attn_ext_vec(ctx, dst);
             break;
         case BEST_FATTN_KERNEL_WMMA_F16:
             ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
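
Taken together, the hunks above collapse the previous two-way f16/f32 vector dispatch into a single path. A condensed, hedged sketch of the resulting selection order follows; the real function additionally gates on head size, K/V types, GGML_CUDA_FA_ALL_QUANTS, GQA ratio and falls back to TILE/NONE, and the helper itself is hypothetical (only the enum is taken from this file):

// Rough shape of the decision after this change, assuming the checks that can
// return BEST_FATTN_KERNEL_NONE have already passed.
static best_fattn_kernel pick_fattn_kernel_sketch(bool turing_mma, bool ada_or_newer,
                                                  bool kv_is_f16, int n_q_cols, bool can_use_vec) {
    if (turing_mma) {
        best_fattn_kernel best = BEST_FATTN_KERNEL_MMA_F16;
        if (can_use_vec) {
            if (kv_is_f16) {
                if (ada_or_newer && n_q_cols == 1) {
                    best = BEST_FATTN_KERNEL_VEC; // bs=1 with F16 KV: the vector kernel wins on Ada+
                }
            } else if (n_q_cols <= (ada_or_newer ? 2 : 1)) {
                best = BEST_FATTN_KERNEL_VEC;     // quantized KV would need conversion for the MMA path
            }
        }
        return best;
    }
    if (can_use_vec && n_q_cols <= 8) {
        return BEST_FATTN_KERNEL_VEC;             // small batches without tensor cores
    }
    return BEST_FATTN_KERNEL_WMMA_F16;            // larger batches: WMMA (or the tile kernel as fallback)
}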


@ -2031,7 +2031,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         const int cc = ggml_cuda_info().devices[id].cc;
         const int warp_size = ggml_cuda_info().devices[id].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
+        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
@ -2039,7 +2039,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1]);
+        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
         use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
@ -2111,7 +2111,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }
-    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
+    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
         ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
         return;
     }
@ -2641,6 +2641,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     const std::string ffn_moe_gate_bias_prefix = "ffn_moe_gate_biased";
     const std::string ffn_moe_up_bias_prefix = "ffn_moe_up_biased";
     const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
+    const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
+    const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
@ -2669,7 +2671,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
             (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true) &&
             strncmp(node->name, ffn_moe_gate_bias_prefix.c_str(), ffn_moe_gate_bias_prefix.size()) != 0 &&
             strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
-            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0) {
+            strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
+            strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
             // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
             // by means of matching node names. See
             // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@ -3639,9 +3643,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CONV_TRANSPOSE_2D:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
-        case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
             return true;
+        case GGML_OP_ARGSORT:
+            // TODO: Support arbitrary column width
+            return op->src[0]->ne[0] <= 1024;
         case GGML_OP_SUM_ROWS:
         case GGML_OP_MEAN:
         case GGML_OP_GROUP_NORM:


@ -84,7 +84,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
     }
 }
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
     if (ggml_is_quantized(type)) {
         return false;
@ -96,9 +96,19 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
     if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
         return false;
     }
+    if (mul_mat_id) {
+        if (type == GGML_TYPE_F32 && src1_ncols > 32) {
+            return false;
+        }
+        if ((type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) && src1_ncols > 64) {
+            return false;
+        }
+    } else {
         if (src1_ncols > 16) {
             return false;
         }
+    }
     switch (type) {
         case GGML_TYPE_F32:
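
Put differently, the MMF fast path now accepts wider activation batches only when it is driven through mul_mat_id (the MoE/expert case), while the dense path keeps the old 16-column limit. A compressed restatement of the gate above as a standalone predicate — a hypothetical helper for illustration only:

static bool mmf_ncols_ok_sketch(ggml_type type, int src1_ncols, bool mul_mat_id) {
    if (!mul_mat_id) {
        return src1_ncols <= 16;            // dense mul_mat: unchanged limit
    }
    if (type == GGML_TYPE_F32) {
        return src1_ncols <= 32;            // mul_mat_id, fp32 weights
    }
    if (type == GGML_TYPE_F16 || type == GGML_TYPE_BF16) {
        return src1_ncols <= 64;            // mul_mat_id, f16/bf16 weights
    }
    return false;                            // other types are rejected earlier in the real function
}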


@ -9,13 +9,13 @@ using namespace ggml_cuda_mma;
 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
 template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
 static __global__ void mul_mat_f(
         const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
-        const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
+        const int ncols, const int ncols_dst_total, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
         const int stride_col_id, const int stride_row_id,
         const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
         const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
@ -31,9 +31,20 @@ static __global__ void mul_mat_f(
     const int row0 = blockIdx.x * rows_per_block;
-    const int expert_idx = has_ids ? blockIdx.y : 0;
+    int expert_idx = 0;
+    int col_base = 0;
     const int channel_dst = has_ids ? 0 : blockIdx.y;
+    if constexpr (has_ids) {
+        // experts + tiles of ncols_dst are packed in the y dimension
+        int col_tiles = (ncols_dst_total + cols_per_block - 1) / cols_per_block;
+        const int nchannels_x = gridDim.y / col_tiles;
+        const int tile_idx = blockIdx.y / nchannels_x;
+        expert_idx = blockIdx.y - tile_idx * nchannels_x;
+        col_base = tile_idx * cols_per_block;
+    }
     const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio);
     const int channel_y = channel_dst;
     const int sample_dst = blockIdx.z;
@ -44,6 +55,14 @@ static __global__ void mul_mat_f(
     y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y);
     dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst);
+    if constexpr (has_ids) {
+        constexpr int y_stride_scale = std::is_same_v<T, float> ? 1 : 2;
+        const int64_t col_offset = col_base;
+        y += col_offset * stride_col_y * y_stride_scale;
+        dst += col_offset * stride_col_dst;
+        ids += col_offset * stride_row_id;
+    }
     const float2 * y2 = (const float2 *) y;
     extern __shared__ char data_mmv[];
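
The two hunks above are the kernel-side half of the column tiling for the ids (mul_mat_id) path: blockIdx.y now packs an expert index together with a tile of destination columns, and the data pointers are pre-offset by col_base. A small host-side sanity check of that packing, assuming gridDim.y = nchannels_x * col_tiles as set up in mul_mat_f_switch_ids further down (illustrative helper only):

#include <assert.h>

// Replay the decode used above for every possible blockIdx.y value.
static void check_expert_tile_packing(int nchannels_x, int ncols_dst_total, int cols_per_block) {
    const int col_tiles = (ncols_dst_total + cols_per_block - 1) / cols_per_block;
    for (int by = 0; by < nchannels_x * col_tiles; ++by) { // by plays the role of blockIdx.y
        const int tile_idx   = by / nchannels_x;
        const int expert_idx = by - tile_idx * nchannels_x;
        const int col_base   = tile_idx * cols_per_block;
        assert(expert_idx >= 0 && expert_idx < nchannels_x);
        assert(col_base < ncols_dst_total);
        // each (expert, column-tile) pair is visited exactly once
    }
}
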
@ -61,12 +80,17 @@ static __global__ void mul_mat_f(
     for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
         const int j = j0 + threadIdx.y;
-        const int32_t * __restrict__ id_row = ids + j*stride_row_id;
         if (threadIdx.x == 0) {
             slot_map[j] = -1;
         }
+        if (col_base + j >= ncols_dst_total) {
+            continue;
+        }
+        const int32_t * __restrict__ id_row = ids + j*stride_row_id;
         for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
             int match = id_row[k*stride_col_id] == expert_idx;
@ -108,7 +132,8 @@ static __global__ void mul_mat_f(
             if constexpr (!has_ids) {
                 tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
             } else {
-                tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
+                const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
+                tile_xy[j0*tile_k_padded + threadIdx.x] = valid ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
             }
         }
     } else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
@ -120,7 +145,8 @@ static __global__ void mul_mat_f(
                 const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
                 tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
             } else {
-                float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
+                const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
+                float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
                 tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
             }
         }
@ -183,14 +209,14 @@ static __global__ void mul_mat_f(
                 dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
             } else {
                 const int slot = (j < cols_per_block) ? slot_map[j] : -1;
-                if (slot >= 0) {
+                if (slot >= 0 && (col_base + j) < ncols_dst_total) {
                     dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
                 }
             }
         }
 #else
     GGML_UNUSED_VARS(x, y, ids, dst,
-        ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
+        ncols, ncols_dst_total, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
         stride_col_id, stride_row_id,
         channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
         sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
@ -201,20 +227,23 @@ static __global__ void mul_mat_f(
 template<typename T, int cols_per_block, int nwarps>
 static inline void mul_mat_f_switch_ids(
         const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols_x, const int64_t nchannels_dst,
+        const int64_t ncols_x, const int64_t ncols_dst, const int64_t nchannels_dst,
         const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
         const int64_t stride_col_id, const int64_t stride_row_id,
         const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
         const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
         const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
     if (ids) {
-        mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
-            (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
+        const int64_t col_tiles = (ncols_dst + cols_per_block - 1) / cols_per_block;
+        dim3 block_nums_ids = block_nums;
+        block_nums_ids.y *= col_tiles;
+        mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums_ids, block_dims, nbytes_shared_total, stream>>>
+            (x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
             stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
             sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
     } else {
         mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
-            (x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
+            (x, y, ids, dst, ncols_x, cols_per_block, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
             stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
             sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
     }
@ -223,7 +252,8 @@ static inline void mul_mat_f_switch_ids(
 template <typename T, int cols_per_block>
 void mul_mat_f_cuda(
         const T * x, const float * y, const int32_t * ids, float * dst,
-        const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
+        const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
+        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
         const int64_t stride_col_id, const int64_t stride_row_id,
         const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
@ -268,49 +298,49 @@ void mul_mat_f_cuda(
switch (nwarps_best) { switch (nwarps_best) {
case 1: { case 1: {
mul_mat_f_switch_ids<T, cols_per_block, 1>( mul_mat_f_switch_ids<T, cols_per_block, 1>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 2: { case 2: {
mul_mat_f_switch_ids<T, cols_per_block, 2>( mul_mat_f_switch_ids<T, cols_per_block, 2>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 3: { case 3: {
mul_mat_f_switch_ids<T, cols_per_block, 3>( mul_mat_f_switch_ids<T, cols_per_block, 3>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 4: { case 4: {
mul_mat_f_switch_ids<T, cols_per_block, 4>( mul_mat_f_switch_ids<T, cols_per_block, 4>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 5: { case 5: {
mul_mat_f_switch_ids<T, cols_per_block, 5>( mul_mat_f_switch_ids<T, cols_per_block, 5>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 6: { case 6: {
mul_mat_f_switch_ids<T, cols_per_block, 6>( mul_mat_f_switch_ids<T, cols_per_block, 6>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 7: { case 7: {
mul_mat_f_switch_ids<T, cols_per_block, 7>( mul_mat_f_switch_ids<T, cols_per_block, 7>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
case 8: { case 8: {
mul_mat_f_switch_ids<T, cols_per_block, 8>( mul_mat_f_switch_ids<T, cols_per_block, 8>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst, x, y, ids, dst, ncols_x, ncols_dst, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream); sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break; } break;
@ -332,84 +362,89 @@ static void mul_mat_f_switch_cols_per_block(
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
         const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
         cudaStream_t stream) {
-    switch (ncols_dst) {
+    const int ncols_case = (ids && ncols_dst > 16) ? 16 : ncols_dst;
+    GGML_ASSERT(ids || ncols_dst <= 16);
+    switch (ncols_case) {
case 1: { case 1: {
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 2: { case 2: {
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 3: { case 3: {
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 4: { case 4: {
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 5: { case 5: {
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 6: { case 6: {
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 7: { case 7: {
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 8: { case 8: {
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 9: { case 9: {
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 10: { case 10: {
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 11: { case 11: {
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 12: { case 12: {
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 13: { case 13: {
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 14: { case 14: {
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 15: { case 15: {
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
case 16: { case 16: {
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst, mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break; } break;
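
Because the ids path now tiles destination columns inside the kernel (see the packing sketch above), the host-side switch only needs template specializations up to 16 columns; larger MoE batches are clamped here and the remainder is covered by extra grid tiles. A worked example with assumed numbers:

// ncols_dst = 40 with ids != nullptr (MoE case):
//   ncols_case = min(ncols_dst, 16)   -> 16  (cols_per_block template parameter)
//   col_tiles  = (40 + 16 - 1) / 16   -> 3   (factor applied to gridDim.y in mul_mat_f_switch_ids)
//   the three tiles cover columns [0,16), [16,32), [32,40); the kernel masks columns >= ncols_dst_total.
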
@ -422,7 +457,7 @@ static void mul_mat_f_switch_cols_per_block(
 #define DECL_MMF_CASE_HELPER(T, ncols_dst) \
     template void mul_mat_f_cuda<T, ncols_dst>( \
         const T * x, const float * y, const int32_t * ids, float * dst, \
-        const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
+        const int64_t ncols_x, const int64_t nrows_x, int64_t ncols_dst_total, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
         const int64_t stride_col_id, const int64_t stride_row_id, \
         const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\


@ -81,7 +81,7 @@ static __global__ void mmq_ids_helper(
 #pragma unroll
     for (int offset = neu_padded; offset < warp_size; offset += neu_padded) {
         const int tmp = __shfl_up_sync(0xFFFFFFFF, it_compact_add_self, offset, warp_size);
-        if (threadIdx.x >= offset) {
+        if (threadIdx.x >= static_cast<unsigned int>(offset)) {
             it_compact_add_lower += tmp;
         }
     }
@ -110,7 +110,7 @@ static __global__ void mmq_ids_helper(
     expert_bounds[expert] = nex_prev;
-    if (expert < gridDim.x - 1) {
+    if (expert < static_cast<int>(gridDim.x) - 1) {
         return;
     }


@ -220,7 +220,7 @@ static __global__ void mul_mat_vec_q(
             tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
         }
-        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) {
+        if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
             dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
         }
     }


@ -51,6 +51,8 @@ static __global__ __launch_bounds__(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1) void
     }
     const float value = *(const float *) (src0_ptr + src_idx * nb00);
     *(float *) (dst_ptr + i0 * nb0) = value;
+
+    GGML_UNUSED(p1);
 }
 void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);


@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q4_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q5_1);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f16.cuh"
DECL_FATTN_VEC_F16_CASE(64, GGML_TYPE_F16, GGML_TYPE_Q8_0);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f32.cuh"
DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);

@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-vec-f32.cuh"
DECL_FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
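The long run of deletions above removes autogenerated stubs, each holding a single DECL_FATTN_VEC_* line: one explicit template instantiation per head size / K type / V type combination, spread across translation units so the heavy instantiations can be compiled in parallel. A minimal self-contained sketch of that pattern with illustrative names (fattn_vec_case and DECL_FATTN_VEC_CASE are not the ggml API):

#include <cstdio>

template <int D, typename KT, typename VT>
void fattn_vec_case() {
    std::printf("case: head size %d, K element %zu bytes, V element %zu bytes\n",
                D, sizeof(KT), sizeof(VT));
}

// In the generated-file scheme, a macro like this lives in the shared header and
// each autogenerated .cu file contains exactly one DECL_* line.
#define DECL_FATTN_VEC_CASE(D, KT, VT) template void fattn_vec_case<D, KT, VT>()

DECL_FATTN_VEC_CASE(128, float, float);   // one combination per generated file

int main() {
    fattn_vec_case<128, float, float>();
    return 0;
}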

Some files were not shown because too many files have changed in this diff.